Source code for dexom_python.pathway_enrichment

import argparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import fisher_exact
from matplotlib import rcParams
from statsmodels.stats.multitest import fdrcorrection
from dexom_python.model_functions import read_model, get_subsystems_from_model


[docs]def Fisher_groups(model, solpath, outpath='Fisher_groups'): """ !!! This only works if the pathway name is stored in the model.groups property !!! For models where the pathways are stored in the model.reactions.subsystem property, use the Fischer_subsystems function Performs pathway over- and underrepresentation analysis Parameters ---------- model: cobra.Model solpath: str file containing DEXOM solutions outpath: str path to which results are saved Returns ------- over, under: pandas.DataFrames (saved as .csv files) containing -log10 BH-adjusted p-values """ df = pd.read_csv(solpath, dtype=int, index_col=0) if df.columns[0] not in model.reactions: df.columns = [r.id for r in model.reactions] pvalsu = {} pvalso = {} for path in model.groups: tempu = [] tempo = [] for x in df.iterrows(): sol = sum(x[1][r.id] for r in path.members) table = np.array([[sol, len(path.members) - sol], [sum(x[1]) - sol, len(model.reactions) - len(path.members) - sum(x[1]) + sol]]) o, pu = fisher_exact(table, alternative='less') o, po = fisher_exact(table, alternative='greater') tempu.append(pu) tempo.append(po) pvalsu[path.name] = tempu pvalso[path.name] = tempo over = pd.DataFrame(pvalso) under = pd.DataFrame(pvalsu) t, fdr = fdrcorrection(over.values.flatten()) over = pd.DataFrame(-np.log10(fdr).reshape(over.shape), columns=over.columns) t, fdr = fdrcorrection(under.values.flatten()) under = pd.DataFrame(-np.log10(fdr).reshape(under.shape), columns=under.columns) over.to_csv(outpath+'pathways_pvalues_over.csv') under.to_csv(outpath+'pathways_pvalues_under.csv') return over, under
[docs]def Fisher_subsystems(solpath, subframe, sublist, outpath='Fisher_subsystems'): """ !!! This only works if the pathway name is stored in the model.reaction.subsystem property !!! For models where the pathways are stored in the model.groups property, use the Fischer_groups function Performs pathway over- and underrepresentation analysis Parameters ---------- solpath: str file containing DEXOM solutions subframe: pandas.DataFrame dataframe with 'ID' column containing reaction IDs and one 'subsystem' column containing pathways sublist: list list of subsystems outpath: str path to which results are saved Returns ------- over, under: pandas.DataFrames (saved as .csv files) containing -log10 BH-adjusted p-values """ df = pd.read_csv(solpath, dtype=int, index_col=0) rxn_list = [] rxnnumber = {} for sub in sublist: rxns = subframe.index[subframe['subsystem'] == sub].to_list() rxn_list.append(rxns) rxnnumber[sub] = len(rxns) sol_pathways = [] for x in df.iterrows(): sol_pathways.append([sum(x[1][r]) for r in rxn_list]+[sum(x[1])]) pvalsu = {} pvalso = {} for i, sub in enumerate(sublist): tempu = [] tempo = [] for sol in sol_pathways: table = np.array([[sol[i], len(rxn_list[i]) - sol[i]], [sol[-1] - sol[i], len(subframe) - len(rxn_list[i]) - sol[-1] + sol[i]]]) o, pu = fisher_exact(table, alternative='less') o, po = fisher_exact(table, alternative='greater') tempu.append(pu) tempo.append(po) pvalsu[sub] = tempu pvalso[sub] = tempo over = pd.DataFrame(pvalso) under = pd.DataFrame(pvalsu) t, fdr = fdrcorrection(over.values.flatten()) over = pd.DataFrame(-np.log10(fdr).reshape(over.shape), columns=over.columns) t, fdr = fdrcorrection(under.values.flatten()) under = pd.DataFrame(-np.log10(fdr).reshape(under.shape), columns=under.columns) over.to_csv(outpath+'pathways_pvalues_over.csv') under.to_csv(outpath+'pathways_pvalues_under.csv') return over, under
[docs]def plot_Fisher_pathways(filename_over, filename_under, sublist, outpath='pathway_enrichment'): plt.ioff() over = pd.read_csv(filename_over, index_col=0) under = pd.read_csv(filename_under, index_col=0) over.columns = sublist under.columns = sublist over = over.sort_index(axis=1, ascending=False) under = under.sort_index(axis=1, ascending=False) plt.clf() fig, ax = plt.subplots(figsize=(11, 20)) rcParams['ytick.labelsize'] = 13 over.boxplot(vert=False, widths=0.7) plt.plot(list(over.values)[0], ax.get_yticks(), 'ro') plt.tight_layout() plt.subplots_adjust(top=0.95, bottom=0.05) plt.title('Overrepresentation analysis of active reactions per pathway', fontsize=15, loc='right', pad='20') plt.xlabel('-log10 adj. p-value', fontsize=15) plt.axvline(2.301, color='b') fig.savefig(outpath+'pathways_overrepresentation.png') plt.clf() fig, ax = plt.subplots(figsize=(11, 20)) rcParams['ytick.labelsize'] = 13 under.boxplot(vert=False, widths=0.7) plt.plot(list(under.values)[0], ax.get_yticks(), 'ro') plt.tight_layout() plt.subplots_adjust(top=0.95, bottom=0.05) plt.title('Underrepresentation analysis of active reactions per pathway', fontsize=15, loc='right', pad='20') plt.xlabel('-log10 adj. p-value', fontsize=15) plt.axvline(2.301, color='b') fig.savefig(outpath+'pathways_underrepresentation.png') return over, under
def _main(): """ This function is called when you run this script from the commandline. It performs pathway enrichment analysis using a hypergeometric test (Fischer exact test) Use --help to see commandline parameters """ description = 'Performs pathway enrichment analysis using a hypergeometric test (Fischer exact test)' parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-s', '--solutions', default=argparse.SUPPRESS, help='csv file containing enumeration solutions') parser.add_argument('-m', '--model', default=None, help='Metabolic model in sbml, json, or matlab format, ' 'required if subframe & sublist are absent') parser.add_argument('--sublist', default=None, help='List of all pathways/subsystems in the model') parser.add_argument('--subframe', default=None, help='csv file assigning reactions to pathways/subsystemts') parser.add_argument('-o', '--out_path', default='', help='Path to which the output file is written') args = parser.parse_args() groups = False if args.model is not None: model = read_model(args.model) if len(model.groups) > 0: groups = True sublist = [g.name for g in model.groups] else: subframe, sublist = get_subsystems_from_model(model, save=True, out_path=args.out_path) else: subframe = pd.read_csv(args.subframe, index_col=0) sublist = pd.read_csv(args.sublist, sep=';').columns.to_list() if groups: Fisher_groups(model=model, solpath=args.solutions, outpath=args.out_path) else: Fisher_subsystems(solpath=args.solutions, subframe=subframe, sublist=sublist, outpath=args.out_path) plot_Fisher_pathways(filename_over=args.out_path+'pathways_pvalues_over.csv', sublist=sublist, filename_under=args.out_path+'pathways_pvalues_under.csv', outpath=args.out_path) return True if __name__ == '__main__': _main()