Source code for dexom_python.enum_functions.enumeration


import os
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cobra.io import load_json_model, load_matlab_model, read_sbml_model
from dexom_python.result_functions import read_solution
from dexom_python.imat_functions import create_new_partial_variables, create_full_variables
from scipy.spatial.distance import pdist, squareform
from dexom_python.model_functions import get_all_reactions_from_model


[docs]class EnumSolution(object):
    """
    class for solutions of enumeration methods

    Parameters
    ----------
    solutions: list
        list of pandas dataframes containing flux values with reaction ids as index
    binary: list
        list containing binary arrays of reaction activity (0 for inactive, 1 for active)
    objective_value: float
        objective value returned by the solver at the end of the optimization
    """
    def __init__(self, solutions, binary, objective_value):
        """

        Parameters
        ----------
        solutions: list
            list of pandas dataframes containing flux values with reaction ids as index
        binary: list
            list containing binary arrays of reaction activity (0 for inactive, 1 for active)
        objective_value: float
            objective value returned by the solver at the end of the optimization
        """
        self.solutions = solutions
        self.binary = binary
        self.objective_value = objective_value


[docs]def create_enum_variables(model, reaction_weights, eps=1e-2, thr=1e-5, full=False):
    for rid in reaction_weights.keys():
        if reaction_weights[rid] == 0:
            pass
        elif full and "x_"+rid not in model.solver.variables:
            model = create_full_variables(model=model, reaction_weights=reaction_weights, epsilon=eps, threshold=thr)
            break
        elif not full and "rh_"+rid+"_pos" not in model.solver.variables and "rl_"+rid not in model.solver.variables:
            model = create_new_partial_variables(model=model, reaction_weights=reaction_weights, epsilon=eps,
                                                 threshold=thr)  # uses new variable implementation
            break
    return model


[docs]def get_recent_solution_and_iteration(dirpath, startsol_num):
    paths = sorted(list(Path(dirpath).glob("*solution_*.csv")), key=os.path.getctime)
    paths.reverse()
    solpath = paths[int(np.random.exponential(5))]
    solution, binary = read_solution(solpath)
    iteration = len(paths) + 1 - startsol_num
    return solution, iteration


[docs]def write_rxn_enum_script(directory, modelfile, weightfile, reactionlist, imatsol, username, eps=1e-4, thr=1e-5,
                          tol=1e-8, iters=100, maxiters=1e10):
    with open(reactionlist, "r") as file:
        rxns = file.read().split("\n")
    n_max = len(rxns) if len(rxns) < maxiters else maxiters
    rxn_num = (n_max // iters) + 1
    for i in range(rxn_num):
        with open(directory+"/rxn_file_" + str(i) + ".sh", "w+") as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 05:00:00\n#SBATCH -J rxn_%i\n#SBATCH -o rxnout_%i.out\n#SBATCH -e rxnerr_%i.out\n'
                    % (i, i, i))
            f.write('cd /home/%s/work/dexom_py\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/%s/work/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n' % (username, username))
            f.write('python dexom_python/enum_functions/rxn_enum_functions.py -o %s/rxn_enum_%i --range %i_%i -m %s -r %s -l %s '
                    '-p %s -t 6000 --save -e %s --threshold %s --tol %s\n' % (directory, i, i*iters, i*iters+iters,
                    modelfile, weightfile, reactionlist, imatsol, eps, thr, tol))
    with open(directory+"/rxn_runfiles.sh", "w+") as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (rxn_num-1))


[docs]def write_batch_script_divenum(directory, username, modelfile, weightfile, rxnsols, objtol, eps=1e-4, thr=1e-5,
                               tol=1e-8, filenums=100, iters=100, t=6000):
    for i in range(filenums):
        with open(directory+"file_"+str(i)+".sh", "w+") as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 05:00:00\n#SBATCH -J dexom1_%i\n#SBATCH -o dex1out%i.out\n#SBATCH -e dex1err%i.out\n'
                    % (i, i, i))
            f.write('cd /home/%s/work/dexom-python\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/%s/save/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n' % (username, username))
            a = (1-1/(filenums*2*(iters/10)))**i
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum_%i -m %s -r %s -p '
                    '%s%s_solution_%i.csv -a %.5f -i %i --obj_tol %.4f -e %s --threshold %s --tol %s -t %i'
                    % (directory, i, modelfile, weightfile, directory, rxnsols, i, a, iters, objtol, eps, thr, tol, t))
    with open(directory+"runfiles.sh", "w+") as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (filenums-1))
    return True


[docs]def write_batch_script1(directory, username, modelfile, weightfile, reactionlist, imatsol, objtol, filenums=100, iters=100):
    for i in range(filenums):
        with open(directory+"file_"+str(i)+".sh", "w+") as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 12:00:00\n#SBATCH -J dexom1_%i\n#SBATCH -o dex1out%i.out\n#SBATCH -e dex1err%i.out\n'
                    % (i, i, i))
            f.write('cd /home/%s/work/dexom-python\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/%s/save/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n' % (username, username))
            f.write('python dexom_python/enum_functions/rxn_enum_functions.py -o %srxn_enum_%i --range %i_%i -m %s -r %s -l %s -p %s '
                    '-t 600 --save\n' % (directory, i, i*5, i*5+5, modelfile, weightfile, reactionlist, imatsol))
            a = (1-1/(filenums*2*(iters/10)))**i
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum_%i -m %s -r %s -p '
                    '%srxn_enum_%i_solution_0.csv -a %.5f -i %i --obj_tol %.4f'
                    % (directory, i, modelfile, weightfile, directory, i, a, iters, objtol))
    with open(directory+"runfiles.sh", "w+") as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (filenums-1))
    return True


[docs]def write_batch_script2(filenums):
    """
    Warning: this function has not been updated with recent changes to DEXOM
    """
    paths = sorted(list(Path("parallel_approach2/").glob("*solution_*.csv")), key=os.path.getctime)
    paths.reverse()
    for i in range(filenums):
        with open("parallel_approach2/rxnstart_"+str(i)+".sh", "w+") as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 00:05:00\n#SBATCH -J dexom2_%i\n#SBATCH -o dex2out%i.out\n#SBATCH -e dex2err%i.out\n'
                    % (i, i, i))
            f.write('cd /home/mstingl/work/dexom_py\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/mstingl/work/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n')
            sol = str(paths[i]).replace("\\", "/")
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o parallel_approach2/div_enum_%i_0 -m '
                    'min_iMM1865/min_iMM1865.xml -r min_iMM1865/p53_deseq2_cutoff_padj_1e-6.csv -p %s -i 1 -a 0.99 '
                    '--save --full' % (i, sol))
    with open("parallel_approach2/dexomstart.sh", "w+") as f:
        f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                '#SBATCH -t 01:00:00\n')
        f.write('cd /home/mstingl/work/dexom_py\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/mstingl/work/CPLEX_Studio1210/cplex/python/3.7'
                '/x86-64_linux"\n')
        f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o parallel_approach2/div_enum -m '
                'min_iMM1865/min_iMM1865.xml -r min_iMM1865/p53_deseq2_cutoff_padj_1e-6.csv -p parallel_approach2 '
                '-i 1 -a 0.99 -s 100 --save --full')
    with open("parallel_approach2/rundexoms.sh", "w+") as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J rundexoms\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix rxnstart_"$i".sh\n    sbatch '
                'rxnstart_"$i".sh\ndone\ndos2unix dexomstart.sh\nfor i in {0..%i}\ndo\n    sbatch -J dexomiter_"$i" '
                '-o dexout_"$i".out -e dexerr_"$i".out dexomstart.sh \ndone' % (filenums-1, filenums-1))
    return True


[docs]def analyze_div_enum_results(result_path, solution_path, out_path):
    """
    This function calculates the average pairwise hamming distance and average next neighbour distance
    for each iteration - it's very slow

    Parameters
    ----------
    result_path: csv results file from diversity-enum
    solution_path: csv solution file from diversity-enum
    out_path: path for saving

    Returns
    -------

    """

    res = pd.read_csv(result_path, index_col=0)
    sol = pd.read_csv(solution_path, index_col=0)

    unique = len(sol.drop_duplicates())
    print("There are %i unique solutions and %i duplicates" % (unique, len(sol)-unique))

    time = res["time"].cumsum()
    print("Total computation time: %i s" % time.iloc[-1])
    print("Average time per iteration: %i s" % (time.iloc[-1]/len(sol)))

    fig = time.plot().get_figure()
    fig.savefig(out_path + "_cumulated_time.png")
    plt.clf()
    fig = res["selected reactions"].plot().get_figure()
    fig.savefig(out_path + "_selected_reactions.png")

    sol = sol.drop_duplicates()
    avg_pairwise = []
    avg_nearest = []

    for i in range(2, len(sol) + 1):
        distances = pdist(sol[:i].values, metric="hamming")
        avg_pairwise.append(distances.mean())
        dist_mat = squareform(distances)
        avg_nearest.append(sum([min(x[np.nonzero(x)]) for x in dist_mat])/i)

    x = range(len(avg_pairwise))

    plt.clf()
    plt.plot(x, avg_pairwise, 'r')
    plt.savefig(out_path + "_avg_pairwise.png")
    plt.clf()
    plt.plot(x, avg_nearest, 'g')
    plt.savefig(out_path + "_avg_nearest_neighbor.png")
    plt.clf()
    fig = time.plot().get_figure()
    fig.savefig(out_path + "_cumulated_time.png")
    plt.clf()
    fig = res["selected reactions"].plot().get_figure()
    fig.savefig(out_path + "_selected_reactions.png")
    return sol.T


if __name__ == "__main__":
    description = "Writes batch scripts for launching DEXOM on a slurm cluster. Note that default parameters are used."

    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-o", "--out_path", default="", help="Path to which the files are written")
    parser.add_argument("-u", "--username", help="username on the slurm cluster")
    parser.add_argument("-m", "--model", default=None, help="Metabolic model in sbml, json, or matlab format")
    parser.add_argument("-r", "--reaction_weights", default=None,
                        help="Reaction weights in csv format (first row: reaction names, second row: weights)")
    parser.add_argument("-l", "--reaction_list", default=None, help="shuffled list of reactions in the model")
    parser.add_argument("-p", "--prev_sol", help="starting solution [not optional here]")
    parser.add_argument("--obj_tol", type=float, default=1e-2,
                        help="objective value tolerance, as a fraction of the original value")
    parser.add_argument("-n", "--filenums", type=int, default=100, help="number of parallel threads")
    parser.add_argument("-i", "--iterations", type=int, default=100, help="number of div-enum iterations per thread")

    args = parser.parse_args()

    if args.reaction_list:
        reactionlist = args.reaction_list
    else:
        fileformat = Path(args.model).suffix
        if fileformat == ".sbml" or fileformat == ".xml":
            model = read_sbml_model(args.model)
        elif fileformat == '.json':
            model = load_json_model(args.model)
        elif fileformat == ".mat":
            model = load_matlab_model(args.model)
        else:
            print("Only SBML, JSON, and Matlab formats are supported for the models")
            model = None
        get_all_reactions_from_model(model, save=True, shuffle=True, out_path=args.out_path)
        reactionlist = args.out_path+model.id+"_reactions_shuffled.csv"