Source code for dexom_python.pathway_enrichment


import argparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import fisher_exact
from matplotlib import rcParams
from statsmodels.stats.multitest import fdrcorrection
from dexom_python.model_functions import read_model, get_subsystems_from_model


[docs]def Fischer_groups(model, solpath, outpath="test"):
    """
    !!! This only works if the pathway name is stored in the model.groups property !!!
    For models where the pathways are stored in the model.reactions.subsystem property, use the Fischer_pathways function

    Performs pathway over- and underrepresentation analysis

    Parameters
    ----------
    solpath: file containing DEXOM solutions
    subframe: csv file associating reactions with subsystems
    sublist: list of subsystems
    outpath: path to which results are saved

    Returns
    -------
    over, under: pandas.DataFrames (saved as .csv files) containing -log10 BH-adjusted p-values
    """
    df = pd.read_csv(solpath, dtype=int, index_col=0)
    df.columns = [r.id for r in model.reactions]
    pvalsu = {}
    pvalso = {}

    for path in model.groups:
        tempu = []
        tempo = []
        for x in df.iterrows():
            sol = sum(x[1][r.id] for r in path.members)
            table = np.array([[sol, len(path.members) - sol],
                              [sum(x[1]) - sol, len(model.reactions) - len(path.members) - sum(x[1]) + sol]])
            o, pu = fisher_exact(table, alternative='less')
            o, po = fisher_exact(table, alternative='greater')
            tempu.append(pu)
            tempo.append(po)
        pvalsu[path.name] = tempu
        pvalso[path.name] = tempo
    over = pd.DataFrame(pvalso)
    under = pd.DataFrame(pvalsu)

    t, fdr = fdrcorrection(over.values.flatten())
    over = pd.DataFrame(-np.log10(fdr).reshape(over.shape), columns=over.columns)

    t, fdr = fdrcorrection(under.values.flatten())
    under = pd.DataFrame(-np.log10(fdr).reshape(under.shape), columns=under.columns)

    over.to_csv(outpath+"pathways_pvalues_over.csv")
    under.to_csv(outpath+"pathways_pvalues_under.csv")
    return over, under


[docs]def Fisher_pathways(solpath, subframe, sublist, outpath=""):
    """
    !!! This only works if the pathway name is stored in the model.reaction.subsystem property !!!
    For models where the pathways are stored in the model.groups property, use the new Fischer_groups function

    Performs pathway over- and underrepresentation analysis

    Parameters
    ----------
    solpath: file containing DEXOM solutions
    subframe: csv file associating reactions with subsystems
    sublist: list of subsystems
    outpath: path to which results are saved

    Returns
    -------
    over, under: pandas.DataFrames (saved as .csv files) containing -log10 BH-adjusted p-values
    """
    df = pd.read_csv(solpath, dtype=int, index_col=0)

    rxn_list = []
    rxnnumber = {}
    for sub in sublist:
        rxns = subframe.index[subframe["subsystem"] == sub].to_list()
        rxn_list.append(rxns)
        rxnnumber[sub] = len(rxns)

    sol_pathways = []
    for x in df.iterrows():
        sol_pathways.append([sum(x[1][r]) for r in rxn_list]+[sum(x[1])])

    pvalsu = {}
    pvalso = {}
    for i, sub in enumerate(sublist):
        tempu = []
        tempo = []
        for sol in sol_pathways:
            table = np.array([[sol[i], len(rxn_list[i]) - sol[i]],
                              [sol[-1] - sol[i], len(subframe) - len(rxn_list[i]) - sol[-1] + sol[i]]])
            o, pu = fisher_exact(table, alternative='less')
            o, po = fisher_exact(table, alternative='greater')
            tempu.append(pu)
            tempo.append(po)
        pvalsu[sub] = tempu
        pvalso[sub] = tempo
    over = pd.DataFrame(pvalso)
    under = pd.DataFrame(pvalsu)

    t, fdr = fdrcorrection(over.values.flatten())
    over = pd.DataFrame(-np.log10(fdr).reshape(over.shape), columns=over.columns)

    t, fdr = fdrcorrection(under.values.flatten())
    under = pd.DataFrame(-np.log10(fdr).reshape(under.shape), columns=under.columns)

    over.to_csv(outpath+"pathways_pvalues_over.csv")
    under.to_csv(outpath+"pathways_pvalues_under.csv")
    return over, under


[docs]def plot_Fisher_pathways(filename_over, filename_under, sublist, outpath="pathway_enrichment"):
    plt.ioff()
    over = pd.read_csv(filename_over, index_col=0)
    under = pd.read_csv(filename_under, index_col=0)
    over.columns = sublist
    under.columns = sublist
    over = over.sort_index(axis=1, ascending=False)
    under = under.sort_index(axis=1, ascending=False)
    plt.clf()
    fig, ax = plt.subplots(figsize=(11, 20))
    rcParams['ytick.labelsize'] = 13
    over.boxplot(vert=False, widths=0.7)
    plt.plot(list(over.values)[0], ax.get_yticks(), 'ro')

    plt.tight_layout()
    plt.subplots_adjust(top=0.95, bottom=0.05)
    plt.title("Overrepresentation analysis of active reactions per pathway", fontsize=15, loc='right', pad='20')
    plt.xlabel('-log10 adj. p-value', fontsize=15)
    plt.axvline(2.301, color="b")
    fig.savefig(outpath+"pathways_overrepresentation.png")

    plt.clf()
    fig, ax = plt.subplots(figsize=(11, 20))
    rcParams['ytick.labelsize'] = 13
    under.boxplot(vert=False, widths=0.7)
    plt.plot(list(under.values)[0], ax.get_yticks(), 'ro')
    plt.tight_layout()
    plt.subplots_adjust(top=0.95, bottom=0.05)
    plt.title("Underrepresentation analysis of active reactions per pathway", fontsize=15, loc='right', pad='20')
    plt.xlabel('-log10 adj. p-value', fontsize=15)
    plt.axvline(2.301, color="b")
    fig.savefig(outpath+"pathways_underrepresentation.png")

    return over, under


if __name__ == "__main__":
    description = "Performs pathway enrichment analysis using a hypergeometric test (Fischer exact test)"

    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-s", "--solutions", help="csv file containing enumeration solutions")
    parser.add_argument("-m", "--model", default=None, help="Metabolic model in sbml, json, or matlab format, "
                                                            "required if subframe & sublist are absent")
    parser.add_argument("--sublist", default=None, help="List of all pathways/subsystems in the model")
    parser.add_argument("--subframe", default=None, help="csv file assigning reactions to pathways/subsystemts")
    parser.add_argument("-o", "--out_path", default="", help="Path to which the output file is written")

    args = parser.parse_args()

    groups = False
    if args.model:
        model = read_model(args.model)
        if len(model.groups) > 0:
            groups = True
            sublist = [g.name for g in model.groups]
        else:
            subframe, sublist = get_subsystems_from_model(model, save=True, out_path=args.out_path)
    else:
        subframe = pd.read_csv(args.subframe, index_col=0)
        sublist = pd.read_csv(args.sublist, sep=";").columns.to_list()

    if groups:
        Fischer_groups(model=model, solpath=args.solution, outpath=args.out_path)
    else:
        Fisher_pathways(solpath=args.solutions, subframe=subframe, sublist=sublist, outpath=args.out_path)
    plot_Fisher_pathways(filename_over=args.out_path+"pathways_pvalues_over.csv", sublist=sublist,
                         filename_under=args.out_path+"pathways_pvalues_under.csv", outpath=args.out_path)