Source code for dexom_python.enum_functions.permutation_functions

import pandas as pd
import numpy as np
import argparse
import dexom_python as dp
from dexom_python.imat_functions import ImatException
from dexom_python.gpr_rules import apply_gpr
from dexom_python.default_parameter_values import DEFAULT_VALUES


[docs]def permute_genelabels(model, allgenes, geneindex=None, nperms=DEFAULT_VALUES['maxiter'], error_tol=DEFAULT_VALUES['maxiter']):
    """
    This function performs a permutation test in which the labels of gene expression values are randomly shuffled,
    and then an iMAT solution is computed with the new geneweights.

    Parameters
    ----------
    model: cobra.Model
        a cobrapy model
    allgenes: pandas.Series
        index = gene IDs and values = expression values
    geneindex: pandas.Series or None
        index = model gene IDs and values = allgenes gene IDs
    nperms: int
        number of permutations to perform
    error_tol: int
        maximum number of consecutive failed iterations before interrupting the program

    Returns
    -------
    perm_solutions: list of imat solutions
    perm_binary: pandas.Dataframe of binary solutions
    perm_recs: pandas.Dataframe of reaction-weights
    perm_genes: pandas.Dataframe of permuted gene expression values
    """
    rng = np.random.default_rng()
    # geneweights = geneindex.replace(allgenes)
    if geneindex is not None:
        geneweights = geneindex.map(allgenes).dropna()
    else:
        geneweights = allgenes.dropna()

    reaction_weights = apply_gpr(model=model, gene_weights=geneweights, save=False)
    perm_genes = [geneweights.values]
    perm_recs = [reaction_weights.values()]
    perm_solutions = []
    perm_binary = []

    i = 0
    consecutive_errors = 0
    while i < nperms and consecutive_errors < error_tol:
        print('starting iteration %i' % (i + 1))
        rng.shuffle(allgenes.values)
        # gw = geneindex.replace(allgenes)
        if geneindex is not None:
            gw = geneindex.map(allgenes).dropna()
        else:
            gw = allgenes.dropna()
        reaction_weights = apply_gpr(model=model, gene_weights=gw, save=False)
        if reaction_weights in perm_recs:
            # if len(pd.concat([perm_recs, pd.Series(reaction_weights)], axis=1).T.drop_duplicates()) <= i:
            # if (perm_recs == pd.Series(reaction_weights)).all(axis=1).sum()>0:
            continue  # if the same reaction weights were already generated in a previous loop, skip this iteration
        try:
            solution = dp.imat(model, reaction_weights)
            thr = DEFAULT_VALUES['threshold']
            tol = DEFAULT_VALUES['tolerance']
            solution_binary = (np.abs(solution.fluxes) >= thr - tol).values.astype(int)
            perm_genes.append(gw.values)
            perm_recs.append(reaction_weights.values())
            perm_solutions.append(solution)
            perm_binary.append(solution_binary)
            i += 1
            consecutive_errors = 0
        except ImatException:
            print('imat in iteration %i failed' % (i+1))
            consecutive_errors += 1
    if consecutive_errors >= error_tol:
        print('Permutations aborted due to too many consecutive failed iterations. '
              'The results of the %i successful iterations will be returned now.' % len(perm_solutions))
    perm_genes = pd.DataFrame(perm_genes, columns=geneweights.index, index=['start']+list(range(len(perm_genes)-1))).T
    perm_recs = pd.DataFrame(perm_recs, columns=reaction_weights.keys(), index=['start']+list(range(len(perm_recs)-1))).T
    perm_binary = pd.DataFrame(perm_binary, columns=[r.id for r in model.reactions])
    return perm_solutions, perm_binary, perm_recs.drop('start', axis=1), perm_genes.drop('start', axis=1)


def _main():
    """
    This function is called when you run this script from the commandline.
    It performs the permutation test.
    Use --help to see commandline parameters
    """
    description = ('Performs gene label permutation. In each loop the gene expression values are randomly shuffled '
                   'before computing an iMAT solution')
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-m', '--model', default=argparse.SUPPRESS,
                        help='Metabolic model in sbml, json, or matlab format')
    parser.add_argument('-g', '--gene_file', default=argparse.SUPPRESS,
                        help='csv file with gene IDs in the first column & gene expression values in the second')
    parser.add_argument('-n', '--npermutations', type=int, default=DEFAULT_VALUES['maxiter'],
                        help='Number of permutations to perform')
    parser.add_argument('-i', '--gene_index', default='false',
                        help='Define this parameter if your genefile uses different gene IDs than the model. '
                             'csv file in which the first column contains gene IDs from the metabolic model, '
                             'and the second column contains gene IDs from the genefile.')
    parser.add_argument('-o', '--output', default='perms', help='Path of the output file, without file format')
    parser.add_argument('-e', '--error_tol', type=int, default=DEFAULT_VALUES['maxiter'],
                        help='Maximum number of consecutive failed iterations before interrupting the program')
    args = parser.parse_args()
    model = dp.read_model(args.model)
    model = dp.check_model_options(model)

    gwfile = pd.read_csv(args.gene_file, sep=';|,|\t', engine='python', index_col=0)
    allgenes = gwfile[gwfile.columns[0]]

    if args.gene_index.lower() == 'false':
        geneindex = pd.Series(allgenes.index, index=allgenes.index)
    else:
        gifile = pd.read_csv(args.gene_index, sep=';|,|\t', engine='python')
        geneindex = gifile[gifile.columns[0]]

    n = args.npermutations
    e = args.error_tol

    sols, bins, recs, genes = permute_genelabels(model=model, allgenes=allgenes, geneindex=geneindex, nperms=n,
                                                 error_tol=e)
    bins.to_csv(args.output + '_solutions.csv')
    recs.to_csv(args.output + '_reactionweights.csv')
    genes.to_csv(args.output + '_geneweights.csv')
    flux = pd.concat([s.fluxes for s in sols], axis=1).T.reset_index(drop=True)
    flux.to_csv(args.output + '_fluxes.csv')
    return True


if __name__ == '__main__':
    _main()