# Source code for dexom_python.cluster_utils.dexom_cluster_results

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import argparse


def _concat_text_files(sources, destination):
    """Write the contents of every file in *sources*, in order, into *destination*."""
    with open(str(destination), 'w') as outfile:
        for source in sources:
            with open(str(source)) as infile:
                outfile.write(infile.read())


def _first_match(folder, pattern):
    """Return the first (sorted) path in *folder* matching *pattern*, or None if nothing matches."""
    matches = sorted(Path(folder).glob(pattern))
    return matches[0] if matches else None


def _read_csv_if_present(path):
    """Read *path* as a CSV (first column as index); return None if *path* is None or absent."""
    if path is None:
        return None
    try:
        return pd.read_csv(str(path), index_col=0)
    except FileNotFoundError:
        return None  # if a file is absent, ignore it


def _concat_frames(frames, label, folder):
    """Concatenate *frames*, raising a descriptive ValueError when there is nothing to concatenate."""
    if not frames:
        # pd.concat([]) would also raise ValueError, but without saying what was missing
        raise ValueError('No %s files were found in %r' % (label, str(folder)))
    return pd.concat(frames, ignore_index=True)


def _sum_leading_numbers(path):
    """Return (sum, count) of the numbers that appear as first token of a line in *path*."""
    total = 0
    count = 0
    with open(str(path), 'r') as file:
        for line in file:
            tokens = line.split()
            try:
                total += float(tokens[0])
                count += 1
            except (ValueError, IndexError):
                pass  # ignore lines that are empty or don't begin with a number
    return total, count


def analyze_dexom_cluster_results(in_folder, out_folder, approach=1, filenums=100):
    """
    Compile and analyze the result files written by parallel DEXOM runs.

    The function concatenates the ``.out`` log files, gathers the rxn_enum and
    diversity_enum solution/result tables, prints a short report (also saved
    as ``output.txt``) and writes the compiled tables and a plot of the
    ordered 'selected reactions' column into ``out_folder``.

    Parameters
    ----------
    in_folder: str
        folder containing dexom results
    out_folder: str
        folder in which output files will be saved
    approach: int
        which parallelization approach was used (1, 2, or 3, see enum_functions/enumeration for details)
    filenums: int
        number of parallel dexom threads that were run

    Returns
    -------
    full: pandas.DataFrame
        the rxn_enum and diversity_enum solutions concatenated, with duplicate rows removed

    Raises
    ------
    ValueError
        if ``approach`` is not 1, 2 or 3, or if no solution/result file of a required kind exists
    FileNotFoundError
        if one of the expected ``.out`` log files (approach 1 or 3) or the
        ``rxn_enum_solutions.csv`` file (approach 3) is absent
    """
    if approach not in (1, 2, 3):
        raise ValueError('approach must be 1, 2 or 3, got %r' % (approach,))

    # Path joins give the same result whether or not the folders end with a separator
    in_dir = Path(in_folder)
    out_dir = Path(out_folder)
    output_file = []

    def report(message):
        # every report line is both stored for output.txt and echoed to stdout
        output_file.append(message)
        print(message)

    # concatenating all .out files from the cluster
    if approach == 2:
        # sorted so the concatenation order does not depend on the filesystem
        out_logs = sorted(in_dir.glob('*out*.out'))
        err_logs = sorted(in_dir.glob('*err*.out'))
    else:
        out_logs = [in_dir / ('dex1out%i.out' % i) for i in range(filenums)]
        err_logs = [in_dir / ('dex1err%i.out' % i) for i in range(filenums)]
    _concat_text_files(out_logs, out_dir / 'all_outs.txt')
    _concat_text_files(err_logs, out_dir / 'all_errs.txt')

    # concatenating & analyzing rxn_enum results
    report('looking at rxn_enum')
    if approach == 3:
        rxn = pd.read_csv(str(in_dir / 'rxn_enum_solutions.csv'), index_col=0)
    else:
        all_rxn = []
        for i in range(filenums):
            if approach == 1:
                filename = in_dir / ('rxn_enum_%i_solutions.csv' % i)
            else:
                # no match yields None, which is skipped like an absent file
                filename = _first_match(in_dir, 'div_enum_%i_*_solutions.csv' % i)
            frame = _read_csv_if_present(filename)
            if frame is not None:
                all_rxn.append(frame)
        rxn = _concat_frames(all_rxn, 'rxn_enum solution', in_dir)

    if approach == 2:
        all_rxn_res = []
        for i in range(filenums):
            frame = _read_csv_if_present(_first_match(in_dir, 'div_enum_%i_*_results.csv' % i))
            if frame is not None:
                all_rxn_res.append(frame)
        rxn_res = _concat_frames(all_rxn_res, 'rxn_enum result', in_dir)
        rxn_res.to_csv(str(out_dir / 'all_rxn_enum_res.csv'))
    else:
        unique = len(rxn.drop_duplicates())
        report('There are %i unique solutions and %i duplicates' % (unique, len(rxn) - unique))
        fulltime, counter = _sum_leading_numbers(out_dir / 'all_outs.txt')
        if counter != 0:
            report('Total computation time: %i s' % int(fulltime))
            # NOTE(review): the factor 2 is kept from the original code; presumably two
            # numeric lines are logged per iteration - confirm against the log format
            report('Average time per iteration: %.2f s' % (fulltime*2/counter))

    # concatenating & analyzing diversity_enum results
    report('looking at diversity_enum')
    if approach == 2:
        all_sol = [pd.read_csv(str(x), index_col=0) for x in sorted(in_dir.glob('div*_solutions.csv'))]
        all_res = [pd.read_csv(str(x), index_col=0) for x in sorted(in_dir.glob('div*_results.csv'))]
    else:
        all_sol = []
        all_res = []
        for i in range(filenums):
            try:
                sol_i = pd.read_csv(str(in_dir / ('div_enum_%i_solutions.csv' % i)), index_col=0)
                res_i = pd.read_csv(str(in_dir / ('div_enum_%i_results.csv' % i)), index_col=0)
            except FileNotFoundError:
                continue  # a thread is only used when both of its files exist
            all_sol.append(sol_i)
            all_res.append(res_i)
    sol = _concat_frames(all_sol, 'diversity_enum solution', in_dir)
    res = _concat_frames(all_res, 'diversity_enum result', in_dir)
    res.to_csv(str(out_dir / 'all_divenum_res.csv'))

    unique = len(sol.drop_duplicates())
    report('There are %i unique solutions and %i duplicates' % (unique, len(sol)-unique))
    elapsed = res['time'].cumsum()
    report('Total computation time: %i s' % elapsed.iloc[-1])
    report('Average time per iteration: %.2f s' % (elapsed.iloc[-1]/len(sol)))

    plt.clf()
    ordered = res.sort_values('selected reactions').reset_index(drop=True)['selected reactions']
    fig = ordered.plot().get_figure()
    fig.savefig(str(out_dir / 'all_divenum_selected_reactions_ordered.png'))

    # analyzing total results
    report('total result')
    full = pd.concat([rxn, sol], ignore_index=True)
    unique = len(full.drop_duplicates())
    report('There are %i unique solutions and %i duplicates' % (unique, len(full)-unique))

    rxn = rxn.drop_duplicates(ignore_index=True)
    rxn.to_csv(str(out_dir / 'all_rxnenum_sols.csv'))
    sol = sol.drop_duplicates(ignore_index=True)
    sol.to_csv(str(out_dir / 'all_divenum_sols.csv'))
    full = full.drop_duplicates(ignore_index=True)
    full.to_csv(str(out_dir / 'all_dexom_sols.csv'))

    with open(str(out_dir / 'output.txt'), 'w') as file:
        file.write('\n'.join(output_file))
    return full
def main():
    """
    Command-line entry point of this script.

    Parses the command-line options and compiles and analyzes results from
    the DEXOM cluster pipelines by calling analyze_dexom_cluster_results.
    Use --help to see commandline parameters
    """
    parser = argparse.ArgumentParser(
        description='Compiles and analyzes results from parallel DEXOM',
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # (flags, keyword settings) for every supported option, registered in order
    cli_options = (
        (('-i', '--in_path'),
         {'default': '', 'help': 'Path in which the cluster results were saved'}),
        (('-o', '--out_path'),
         {'default': '', 'help': 'Path in which to save compiled results'}),
        (('-n', '--filenums'),
         {'type': int, 'default': 100, 'help': 'number of parallel threads'}),
        (('-a', '--approach'),
         {'type': int, 'default': 1,
          'help': 'which parallelization approach was used (1 by default)'}),
    )
    for flags, settings in cli_options:
        parser.add_argument(*flags, **settings)

    opts = parser.parse_args()
    analyze_dexom_cluster_results(
        in_folder=opts.in_path,
        out_folder=opts.out_path,
        approach=opts.approach,
        filenums=opts.filenums,
    )
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()