Source code for dexom_python.gpr_rules


import re
import argparse
import numpy as np
import pandas as pd
from symengine import sympify, Add, Mul, Max, Min
from dexom_python.model_functions import read_model, save_reaction_weights


[docs]def replace_MulMax_AddMin(expression):
    if expression.is_Atom:
        return expression
    else:
        replaced_args = (replace_MulMax_AddMin(arg) for arg in expression.args)
        if expression.__class__ == Mul:
            return Max(*replaced_args)
        elif expression.__class__ == Add:
            return Min(*replaced_args)
        else:
            return expression.func(*replaced_args)


[docs]def expression2qualitative(expression_file, column_idx=-1, percentage=25, method="keep", save=True, outpath="geneweights"):
    """

    Parameters
    ----------
    expression_file: str
        path to file containing gene IDs in the first column and gene expression values in a later column
    column_idx: int
        column indexes containing gene expression values to be transformed. If -1, all columns will be transformed
    percentage: float
        percentage of genes to be used for determining high/low gene expression
    method: str
        one of "max", "mean" or "keep". chooses how to deal with genes containing multiple conflicting expression values
    save: bool
        if True, saves the resulting gene weights
    outpath: str
        if save=True, the .csv file will be saved to this path

    Returns
    -------

    """

    df = pd.read_csv(expression_file, index_col=0)

    if column_idx == -1:
        column_idx = list(range(len(df.columns)))
    elif type(column_idx) == int:
        column_idx = list(range(column_idx))

    cutoff = 1/(percentage/100)
    colnames = df.columns[column_idx]
    for col in colnames:
        if method == "max":
            for x in set(df.index):
                df[col][x] = df[col][x].max()
        elif method == "mean":
            for x in set(df.index):
                df[col][x] = df[col][x].mean()

        df.sort_values(col, inplace=True)
        df[col].iloc[:int(len(df)//cutoff)+1] = -1.
        df[col].iloc[int(len(df)//cutoff)+1:int(len(df)*(cutoff-1)//cutoff)] = 0.
        df[col].iloc[int(len(df) * (cutoff-1) // cutoff):] = 1.
    if save:
        df.to_csv(outpath+".csv")
    return df


[docs]def prepare_expr_split_gen_list(rxn, modelname):
    """

    Parameters
    ----------
    rxn: cobra.Reaction
    modelname: str
        The name of the model. Currently only supports human1, recon1, recon2, iMM1865, zebrafish1

    Returns
    -------

    """
    if modelname == "recon2":
        expr_split = rxn.gene_reaction_rule.replace("(", "( ").replace(")", " )").split()
        expr_split = [s.replace(':', '_') if ':' in s else s for s in expr_split]
        rxngenes = re.sub('and|or|\(|\)', '', rxn.gene_reaction_rule).split()
        gen_list = set([s.replace(':', '_') for s in rxngenes if ':' in s])
    elif modelname == "recon1":
        expr_split = rxn.gene_reaction_rule.replace("(", "( ").replace(")", " )").split()
        expr_split = ["g_" + s[:-4] if '_' in s else s for s in expr_split]
        rxngenes = re.sub('and|or|\(|\)', '', rxn.gene_reaction_rule).split()
        gen_list = set(["g_" + s[:-4] for s in rxngenes if '_' in s])
    elif modelname == "iMM1865":
        expr_split = rxn.gene_reaction_rule.split()
        expr_split = ["g_" + s if s.isdigit() else s for s in expr_split]
        rxngenes = re.sub('and|or|\(|\)', '', rxn.gene_reaction_rule).split()
        gen_list = set(["g_" + s for s in rxngenes if s.isdigit()])
    elif modelname == "human1":
        expr_split = rxn.gene_reaction_rule.replace("(", "( ").replace(")", " )").split()
        gen_list = set([g.id for g in rxn.genes])
    elif modelname == "zebrafish1":
        expr_split = rxn.gene_reaction_rule.replace("(", "( ").replace(")", " )").split()
        expr_split = [re.sub(':|\.|-', '_', s) for s in expr_split]
        gen_list = set([re.sub(':|\.|-', '_', g.id) for g in rxn.genes])
    else:
        print("Modelname not found")
        expr_split = None
        gen_list = None

    return expr_split, gen_list


[docs]def apply_gpr(model, gene_weights, modelname, save=True, filename="reaction_weights"):
    """
    Applies the GPR rules from a given metabolic model for creating reaction weights

    Parameters
    ----------
    model: cobra.Model
        a cobrapy model
    gene_weights: dict
        a dictionary containing gene IDs & weights
    modelname: str
        the name of the model
    save: bool
        if True, saves the reaction weights as a csv file

    Returns
    -------
    reaction_weights: dict where keys = reaction IDs and values = weights
    """
    reaction_weights = {}
    for rxn in model.reactions:
        if len(rxn.genes) > 0:
            expr_split, gen_list = prepare_expr_split_gen_list(rxn, modelname)
            new_weights = {g: gene_weights.get(g, 0) for g in gen_list}
            negweights = []
            for g, v in new_weights.items():
                if v < 0:
                    new_weights[g] = -v - 1e-15
                    negweights.append(-v)
            expression = ' '.join(expr_split).replace(' or ', ' * ').replace(' and ', ' + ')
            # weight = sympify(expression).xreplace({Mul: Max}).xreplace({Add: Min})
            weight = replace_MulMax_AddMin(sympify(expression)).subs(new_weights)
            if weight + 1e-15 in negweights:
                weight = -weight - 1e-15
            reaction_weights[rxn.id] = weight
        else:
            reaction_weights[rxn.id] = 0.
    if save:
        save_reaction_weights(reaction_weights, filename+".csv")
    return reaction_weights


if __name__ == "__main__":

    description = "Applies GPR rules to transform gene weights into reaction weights"

    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-m", "--model", help="GEM in json, sbml or matlab format")
    parser.add_argument("-n", "--modelname", help="supported: human1, recon1, recon2, iMM1865, zebrafish1")
    parser.add_argument("-g", "--gene_file", help="csv file containing gene identifiers and scores")
    parser.add_argument("-o", "--output", default="reaction_weights",
                        help="Path to which the reaction_weights .csv file is saved")
    parser.add_argument("--gene_ID", default="ID", help="column containing the gene identifiers")
    parser.add_argument("--gene_score", default="t", help="column containing the gene scores")
    parser.add_argument("--convert", action='store_true', help="converts gene expression to qualitative weights")
    parser.add_argument("-t", "--threshold", type=float, default=.25,
                        help="proportion of genes that are highly/lowly expressed (only used if --convert is selected)")
    args = parser.parse_args()

    model = read_model(args.model)
    model_list = ["human1", "recon1", "recon2", "iMM1865", "zebrafish1"]

    genes = pd.read_csv(args.gene_file)
    if args.convert:
        genes = expression2qualitative(args.gene_file, percentage=args.threshold*100,
                                       outpath=args.output+"_qual_geneweights")
    gene_weights = pd.Series(genes[args.gene_score].values, index=genes[args.gene_ID])

    # current behavior: all genes with several different weights are removed
    for x in set(gene_weights.index):
        if type(gene_weights[x]) != np.float64:
            if len(gene_weights[x].value_counts()) > 1:
                gene_weights.pop(x)
    gene_weights = gene_weights.to_dict()

    if args.modelname not in model_list:
        print("Unsupported model. The currently supported models are: human1, recon1, recon2, iMM1865, zebrafish1")
    else:
        reaction_weights = apply_gpr(model=model, gene_weights=gene_weights, modelname=args.modelname, save=True,
                                     filename=args.output)