Source code for dexom_python.enum_functions.enumeration

import os
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dexom_python.result_functions import read_solution
from dexom_python.imat_functions import create_new_partial_variable_single, create_full_variable_single
from scipy.spatial.distance import pdist, squareform
from dexom_python.model_functions import read_model, get_all_reactions_from_model


[docs]class EnumSolution(object):
    """
    class for solutions of enumeration methods

    Parameters
    ----------
    solutions: list
        A list of pandas dataframes containing flux values with reaction ids as index
    binary: list
        A list containing binary arrays of reaction activity (0 for inactive, 1 for active)
    objective_value: float
        objective value returned by the solver at the end of the optimization
    """
    def __init__(self, solutions, binary, objective_value):
        self.solutions = solutions
        self.binary = binary
        self.objective_value = objective_value


[docs]def create_enum_variables(model, reaction_weights, eps=1e-2, thr=1e-5, full=False):
    for rid in reaction_weights.keys():
        if rid not in model.reactions:
            pass
        elif full and 'x_' + rid not in model.solver.variables:
            model = create_full_variable_single(model=model, rid=rid, reaction_weights=reaction_weights, epsilon=eps,
                                                threshold=thr)
        elif reaction_weights[rid] > 0 and 'rh_' + rid + '_pos' not in model.solver.variables:
            model = create_new_partial_variable_single(model=model, epsilon=eps, threshold=thr, rid=rid, pos=True)
        elif reaction_weights[rid] < 0 and 'rl_' + rid not in model.solver.variables:
            model = create_new_partial_variable_single(model=model, epsilon=eps, threshold=thr, rid=rid, pos=False)
    return model


[docs]def get_recent_solution_and_iteration(dirpath, startsol_num):
    """
    This functions fetches a solution from a given directory. The solutions are ordered by creation time, and one
    solution is picked using an exponential distribution (meaning that the most recent solution has the highest
    probability of being chosen)

    Parameters
    ----------
    dirpath: str
        a directory containing imat or enumeration solutions
    startsol_num: int
        the number of starting solutions present in the directory

    Returns
    -------
    solution: a Solution object
    iteration: int
        calculates the current iteration, based on how many solutions are already present in the folder
    """
    paths = sorted(list(Path(dirpath).glob('*solution*.csv')), key=os.path.getctime)
    paths.reverse()
    solpath = paths[int(np.random.exponential(5))]
    solution, binary = read_solution(solpath)
    iteration = len(paths) + 1 - startsol_num
    return solution, iteration


[docs]def write_rxn_enum_script(directory, modelfile, weightfile, imatsol=None, reactionlist=None, eps=1e-4, thr=1e-5,
                          tol=1e-8, iters=100, maxiters=1e10):
    os.makedirs(directory, exist_ok=True)
    if directory[-1] not in ['/', '\\']:
        directory += '/'
    if reactionlist is not None:
        with open(reactionlist, 'r') as file:
            rxns = file.read().split('\n')
        n_max = len(rxns) if len(rxns) < maxiters else maxiters
        rxn_num = (n_max // iters) + 1
        rstring = '-l ' + reactionlist
    else:
        rstring = ''
        model = read_model(modelfile)
        rxn_num = len(model.reactions)
    if imatsol is not None:
        istring = '-p ' + imatsol
    else:
        istring = ''
    for i in range(rxn_num):
        with open(directory+'/rxn_file_' + str(i) + '.sh', 'w+') as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 10:00:00\n#SBATCH -J rxn_%i\n#SBATCH -o rxnout_%i.out\n#SBATCH -e rxnerr_%i.out\n'
                    % (i, i, i))
            f.write('cd $SLURM_SUBMIT_DIR\ncd ..\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/%s/work/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n')
            f.write('python dexom_python/enum_functions/rxn_enum_functions.py -o %s/rxn_enum_%i --range %i_%i -m %s -r '
                    '%s %s %s -t 6000 --save -e %s --threshold %s --tol %s\n'
                    % (directory, i, i*iters, i*iters+iters, modelfile, weightfile, rstring, istring, eps, thr, tol))
    with open(directory+'/rxn_runfiles.sh', 'w+') as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (rxn_num-1))


[docs]def write_batch_script_divenum(directory, modelfile, weightfile, rxnsols, objtol, eps=1e-4, thr=1e-5,
                               tol=1e-8, filenums=100, iters=100, t=6000):
    os.makedirs(directory, exist_ok=True)
    if directory[-1] not in ['/', '\\']:
        directory += '/'
    for i in range(filenums):
        with open(directory+'file_'+str(i)+'.sh', 'w+') as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 05:00:00\n#SBATCH -J dexom1_%i\n#SBATCH -o dex1out%i.out\n#SBATCH -e dex1err%i.out\n'
                    % (i, i, i))
            f.write('cd $SLURM_SUBMIT_DIR\ncd ..\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"/home/%s/save/CPLEX_Studio1210/cplex/python/3.7'
                    '/x86-64_linux"\n')
            a = (1-1/(filenums*2*(iters/10)))**i
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum_%i -m %s -r %s -p '
                    '%s%s_solution_%i.csv -a %.5f -i %i --obj_tol %.4f -e %s --threshold %s --tol %s -t %i'
                    % (directory, i, modelfile, weightfile, directory, rxnsols, i, a, iters, objtol, eps, thr, tol, t))
    with open(directory+'runfiles.sh', 'w+') as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (filenums-1))
    return True


[docs]def write_batch_script1(directory, modelfile, weightfile, cplexpath, reactionlist=None, imatsol=None, objtol=1e-2,
                        filenums=100, iters=100):
    """
    Writes bash scripts for dexom-python parallelization approach 1 on a slurm cluster. Within each batch,
    reaction-enumeration and diversity-enumeration are performed. These scripts assume that you have setup
    a virtual environment called env.

    Parameters
    ----------
    directory: str
        directory in which the files will be generated. If it does not exist, it will be created
    modelfile: str
        path to the model
    weightfile:
        path to the reaction weights
    cplexpath: str
        path to a cplex installation on the cluster
    reactionlist: str
        list of reactions for reaction-enumeration
    imatsol: str
        path to imat solution
    objtol: float
        objective tolerance
    filenums: int
        number of parallel batches
    iters: int
        number of diversity-enumeration iterations per batch
    """
    os.makedirs(directory, exist_ok=True)
    if directory[-1] not in ['/', '\\']:
        directory += '/'
    if reactionlist is not None:
        rstring = '-l ' + reactionlist
    else:
        rstring = ''
    if imatsol is not None:
        istring = '-p ' + imatsol
    else:
        istring = ''
    for i in range(filenums):
        with open(directory+'file_'+str(i)+'.sh', 'w+') as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 12:00:00\n#SBATCH -J dexom1_%i\n#SBATCH -o dex1out%i.out\n#SBATCH -e dex1err%i.out\n'
                    % (i, i, i))
            f.write('cd $SLURM_SUBMIT_DIR\ncd ..\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"%s/x86-64_linux"\n' % cplexpath)
            f.write('python dexom_python/enum_functions/rxn_enum_functions.py -o %srxn_enum_%i --range %i_%i -m %s -r '
                    '%s %s %s -t 600 --save\n' % (directory, i, i*5, i*5+5, modelfile, weightfile, rstring, istring))
            a = (1-1/(filenums*2*(iters/10)))**i
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum_%i -m %s -r %s -p '
                    '%srxn_enum_%i_solution_1.csv -a %.5f -i %i --obj_tol %.4f'
                    % (directory, i, modelfile, weightfile, directory, i, a, iters, objtol))
    with open(directory+'runfiles.sh', 'w+') as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J runfiles\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix file_"$i".sh\n    sbatch'
                ' file_"$i".sh\ndone' % (filenums-1))
    return True


[docs]def write_batch_script2(directory, modelfile, weightfile, cplexpath, objtol=1e-2, filenums=100):
    """
    Writes bash scripts for dexom-python parallelization approach 2 on a slurm cluster. In this approach, indiviual
    diversity-enumeration iterations are laucnhed in each batch - this requires the existance of reaction-enumeration
    solutions beforehand. These scripts assume that you have setup a virtual environment called env.

    Parameters
    ----------
    directory: str
        directory in which the files will be generated
    modelfile: str
        path to the model
    weightfile:
        path to the reaction weights
    cplexpath: str
        path to a cplex installation on the cluster
    objtol: float
        objective tolerance
    filenums: int
        number of parallel batches
    """
    os.makedirs(directory, exist_ok=True)
    if directory[-1] not in ['/', '\\']:
        directory += '/'
    paths = sorted(list(Path(directory).glob('*solution_*.csv')), key=os.path.getctime)
    paths.reverse()
    for i in range(filenums):
        a = (1 - 1 / (filenums * 2 * (filenums / 10))) ** i
        with open(directory+'rxnstart_'+str(i)+'.sh', 'w+') as f:
            f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                    '#SBATCH -t 00:05:00\n#SBATCH -J dexom2_%i\n#SBATCH -o dex2out%i.out\n#SBATCH -e dex2err%i.out\n'
                    % (i, i, i))
            f.write('cd $SLURM_SUBMIT_DIR\ncd ..\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                    'activate\nexport PYTHONPATH=${PYTHONPATH}:"%s/x86-64_linux"\n' % cplexpath)
            sol = str(paths[i]).replace('\\', '/')
            f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum_%i -m %s -r %s -p '
                    '%s -a %.5f -i 1 --obj_tol %.4f --save'
                    % (directory, i, modelfile, weightfile, sol, a, objtol))
    a = (1 - 1 / (filenums * 2 * (filenums / 10)))
    with open('parallel_approach2/dexomstart.sh', 'w+') as f:
        f.write('#!/bin/bash\n#SBATCH -p workq\n#SBATCH --mail-type=ALL\n#SBATCH --mem=64G\n#SBATCH -c 24\n'
                '#SBATCH -t 01:00:00\n')
        f.write('cd $SLURM_SUBMIT_DIR\ncd ..\nmodule purge\nmodule load system/Python-3.7.4\nsource env/bin/'
                'activate\nexport PYTHONPATH=${PYTHONPATH}:"%s/x86-64_linux"\n' % cplexpath)
        f.write('python dexom_python/enum_functions/diversity_enum_functions.py -o %sdiv_enum -m %s -r %s -p '
                '%s -a %.5f -i 1 -s %i --obj_tol %.4f --save'
                % (directory, modelfile, weightfile, sol, a, filenums, objtol))
    with open(directory+'rundexoms.sh', 'w+') as f:
        f.write('#!/bin/bash\n#SBATCH --mail-type=ALL\n#SBATCH -J rundexoms\n#SBATCH -o runout.out\n#SBATCH '
                '-e runerr.out\ncd $SLURM_SUBMIT_DIR\nfor i in {0..%i}\ndo\n    dos2unix rxnstart_"$i".sh\n    sbatch '
                'rxnstart_"$i".sh\ndone\ndos2unix dexomstart.sh\nfor i in {0..%i}\ndo\n    sbatch -J dexomiter_"$i" '
                '-o dexout_"$i".out -e dexerr_"$i".out dexomstart.sh \ndone' % (filenums-1, filenums-1))
    return True


[docs]def analyze_div_enum_results(result_path, solution_path, out_path):
    """
    This function calculates the average pairwise hamming distance and average next neighbour distance
    for each iteration - it's very slow

    Parameters
    ----------
    result_path: csv results file from diversity-enum
    solution_path: csv solution file from diversity-enum
    out_path: path for saving

    Returns
    -------

    """
    res = pd.read_csv(result_path, index_col=0)
    sol = pd.read_csv(solution_path, index_col=0)
    unique = len(sol.drop_duplicates())
    print('There are %i unique solutions and %i duplicates' % (unique, len(sol)-unique))
    time = res['time'].cumsum()
    print('Total computation time: %i s' % time.iloc[-1])
    print('Average time per iteration: %i s' % (time.iloc[-1]/len(sol)))
    fig = time.plot().get_figure()
    fig.savefig(out_path + '_cumulated_time.png')
    plt.clf()
    fig = res['selected reactions'].plot().get_figure()
    fig.savefig(out_path + '_selected_reactions.png')
    sol = sol.drop_duplicates()
    avg_pairwise = []
    avg_nearest = []
    for i in range(2, len(sol) + 1):
        distances = pdist(sol[:i].values, metric='hamming')
        avg_pairwise.append(distances.mean())
        dist_mat = squareform(distances)
        avg_nearest.append(sum([min(x[np.nonzero(x)]) for x in dist_mat])/i)
    x = range(len(avg_pairwise))
    plt.clf()
    plt.plot(x, avg_pairwise, 'r')
    plt.savefig(out_path + '_avg_pairwise.png')
    plt.clf()
    plt.plot(x, avg_nearest, 'g')
    plt.savefig(out_path + '_avg_nearest_neighbor.png')
    plt.clf()
    fig = time.plot().get_figure()
    fig.savefig(out_path + '_cumulated_time.png')
    plt.clf()
    fig = res['selected reactions'].plot().get_figure()
    fig.savefig(out_path + '_selected_reactions.png')
    return sol.T


if __name__ == '__main__':
    description = 'Writes batch scripts for launching DEXOM on a slurm cluster. Note that default values are used' \
                  'for most parameters. This also assumes that you have a virtual environment called env in your' \
                  'project directory'

    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-o', '--out_path', default='cluster/', help='Path to which the files are written. '
                                                                     'Has to be a folder in this directory')
    parser.add_argument('-m', '--model', default=None, help='Metabolic model in sbml, json, or matlab format')
    parser.add_argument('-r', '--reaction_weights', default=None,
                        help='Reaction weights in csv format (first row: reaction names, second row: weights)')
    parser.add_argument('-l', '--reaction_list', default=None, help='list of reactions in the model')
    parser.add_argument('-p', '--prev_sol', default=None, help='starting solution')
    parser.add_argument('-c', '--cplex_path', help='path to the cplex solver',
                        default='/home/mstingl/save/CPLEX_Studio1210/cplex/python/3.7/x86-64_linux')
    parser.add_argument('--obj_tol', type=float, default=1e-2,
                        help='objective value tolerance, as a fraction of the original value')
    parser.add_argument('-n', '--filenums', type=int, default=100, help='number of parallel threads')
    parser.add_argument('-i', '--iterations', type=int, default=100, help='number of div-enum iterations per thread')

    args = parser.parse_args()

    if args.reactionlist:
        reactionlist = args.reactionlist
    else:
        model = read_model(args.model)
        get_all_reactions_from_model(model, save=True, shuffle=True, out_path=args.out_path)
        reactionlist = args.out_path + model.id + '_reactions_shuffled.csv'

    write_batch_script1(args.out_path, args.model, args.reaction_weights, args.cplex_path, reactionlist,
                        args.prev_sol, args.obj_tol, args.filenums, args.iterations)