Multi-Dendrix Logo

Source code for multi_dendrix

#!/usr/bin/env python
import sys
try: import cplex
except ImportError:
    print "Couldn't import CPLEX. Check your environment's PYTHONPATH "\
          "variable. For details on the CPLEX python module, please visit "\
          "http://bit.ly/KL7PVc."
    sys.exit(3)

########################## Multi-Dendrix ILP ###################################
# Helper functions for the multi_dendrix ILP function
[docs]def V(tumors2genes, g, tumor): '''Returns true if gene g is mutated in the given tumor''' return int(g in tumors2genes[tumor])
[docs]def var_to_gene(var): '''Extracts the gene name from the variable name used in the ILP''' if var.count("->") > 0: return var.split("->")[0] else: return var
[docs]def path_num(var): '''Extracts the pathway number from a variable name, and corrects the index''' if var.count("->") > 0: try: return int(var.split("->")[1]) - 1 except ValueError: return 0 else: return 0
[docs]def update_stdout(message, verbose): '''Logs progress for ILP''' if verbose: print message, sys.stdout.flush() # CPLEX memory options (see http://goo.gl/dJV6F)
MEMORY_ONLY = 1 ON_DISK_COMPRESSED = 3 # Main function for Multi-Dendrix ILP
[docs]def multi_dendrix((genes, tumors2genes, genes2tumors), t=0, k_min=0, k_max=cplex.infinity, alpha=1.0, overlaps=0.0, t_per_gene=1.0, verbose=False, max_mem=None, max_time=None): ''' Implementation of Multi-Dendrix ILP. Requires installation of CPLEX Python. Input: genes, tumors2genes, genes2tumors - mutation data from load_mutation_data() below Options: t - number of pathways k_min - minimum pathway size k_max - maximum pathway size alpha - parameter that changes the weight function W by changing the tradeoff between coverage and coverage overlap overlaps - number of overlaps allowed between any pair of pathways t_per_gene - number of pathways any gene can be a member of verbose - outputs results to stdout Output: Writes pathways to file called "multi_dendrix_pathways.txt". Each line lists the weight and genes for each pathway. ''' update_stdout("Initializing ILP...", verbose) # Default parameters for ILP try: ilp, my_obj, my_sense, my_rhs = cplex.Cplex(), [], "L", [1] except cplex.exceptions.CplexSolverError: print 'Exiting because of CPLEX license issue.' sys.exit(2) # Set params for ILP ilp.set_results_stream(None) # Change this to have CPLEX output to stdout ilp.parameters.mip.strategy.file.set(MEMORY_ONLY) #if max_time: # ilp.parameters.timelimit.set(max_time) # if verbose: print 'Time limit set to', max_time if max_mem: ilp.parameters.mip.limits.treememory.set(max_mem) if verbose: print 'Max tree memory set to', max_mem patients = tumors2genes.keys() # The ILP needs a variable for each gene in each pathway it could be a # member of. Creates variable names of the form {gene}-{pathway number} # |all_names| = |genes| x paths l = map(lambda g: [g + "->" + str(i) for i in range(1, t + 1)], genes) all_names, over_is = [item for sublist in l for item in sublist], [] # initialize r variables rs = ["rvariable" + str(i) + "-" + str(j) for i in range(len(patients)) for j in range(1, t + 1)] rs = [] for j in range(1, t + 1): rs.append(["rvariable-" + str(i) + "-" + str(j) for i in range(len(patients))]) flat_rs = [item for sublist in rs for item in sublist] rs_coefs = [1.0 + alpha] * len(flat_rs) C = 65536 # Encode W' as the objective function for v in all_names: my_obj.append(len(genes2tumors[var_to_gene(v)]) * -alpha) # append r vars to the set of variables and r vars' coefficients to the obj # function. genes' vars are already initialized to |genes2tumors[g]| my_obj += rs_coefs all_names += flat_rs if t > 1: for i in range(1, t + 1): for j in range(i + 1, t + 1): over_is += [g + ".." + str(i) + ".." + str(j) for g in genes] all_names += over_is my_obj += [0.0] * len(over_is) # initialize vars and their bounds {0,1} lbs, ubs = [0] * len(all_names), [1] * len(all_names) my_types = [ilp.variables.type.integer] * len(all_names) ilp.variables.add(names = all_names, ub = ubs, lb = lbs, types = my_types, obj = my_obj) ilp.objective.set_sense(ilp.objective.sense.maximize) # add pathway size constraints, if any for i in range(1, t + 1): my_names = [g + "->" + str(i) for g in genes] size_coefs = [1.0] * len(my_names) if k_max != 0 and k_min == k_max: ilp.linear_constraints.add(senses = "E", rhs = [k_min], names = ["eq" + str(i)]) ilp.linear_constraints.set_linear_components( [("eq" + str(i), [my_names, size_coefs])]) elif type([]) == type(k_max): ilp.linear_constraints.add(senses = "E", rhs = [k_max[i-1]], names = ["eq" + str(i)]) ilp.linear_constraints.set_linear_components( [("eq" + str(i), [my_names, size_coefs])]) else: ilp.linear_constraints.add(senses = "L", rhs = [k_max], names = ["max" + str(i)]) ilp.linear_constraints.set_linear_components( [("max" + str(i), [my_names, size_coefs])]) ilp.linear_constraints.add(senses = "G", rhs = [k_min], names = ["min" + str(i)]) ilp.linear_constraints.set_linear_components( [("min" + str(i), [my_names, size_coefs])]) # add non-overlapping pathways constraint for g in genes: my_names = [g + "->" + str(i) for i in range(1, t + 1)] coefficients = [1] * len(my_names) ilp.linear_constraints.add(senses = my_sense, rhs = [t_per_gene], names = [g]) ilp.linear_constraints.set_linear_components([(g, [my_names, coefficients])]) # add r constraints for i in range(len(patients)): coefficients = [V(tumors2genes, g, patients[i]) for g in genes] my_coefficients = coefficients + [-1.0] for j in range(1, t + 1): r = rs[j-1][i] my_names = [g + "->" + str(j) for g in genes] + [r] ilp.linear_constraints.add(senses = "G", rhs = [0.0], names = [str(i) + str(j) + "-rn"]) ilp.linear_constraints.set_linear_components( [(str(i) + str(j) + "-rn", [my_names, my_coefficients])]) # allow at most one overlap per pathway, if there's more than one pathway ########################################################################### if t > 1 and overlaps > 0: # first add pairwise pathway indicators for ind in over_is: g, i, j = ind.split("..")[0], ind.split("..")[1], ind.split("..")[2] my_names = [g + "->" + i, g + "->" + j, ind] coefficients = [1.0, 1.0, -1.0 * (C+1)] ilp.linear_constraints.add(senses = "G", rhs = [C * -1.0 + 1.0], names = [g + i + j + "1"]) ilp.linear_constraints.set_linear_components( [(g + i + j + "1", [my_names, coefficients])]) coefficients = [1.0, 1.0, -1.0 * C] ilp.linear_constraints.add(senses = "L", rhs = [1.0], names = [g + i + j + "2"]) ilp.linear_constraints.set_linear_components( [(g + i + j + "2", [my_names, coefficients])]) # then constrain pairwise pathway indicators so only one gene can # be shared between two pathways for i in range(1, t + 1): for j in range(i + 1, t + 1): my_names = [g + ".." + str(i) + ".." + str(j) for g in genes] coefficients = [1.0] * len(my_names) ilp.linear_constraints.add(senses = "L", rhs = [overlaps], names = ["k" + str(i) + str(j)]) ilp.linear_constraints.set_linear_components( [("k" + str(i) + str(j), [my_names, coefficients])]) if verbose: print "done!" ########################################################################### # Solve ILP and convert solution back to genes update_stdout("Solving ILP...", verbose) ilp.solve() if verbose: print "done!" ans = ilp.solution.get_values() pathways = [[] for i in range(t)] for var, val in zip(all_names, ans): if str(val) == "1.0" or float(val) > 0.5: # be sure to ignore dummy variables if var.count("rvariable") == 0 and var.count("..") == 0: pathways[path_num(var)].append(var_to_gene(var)) # sort each pathway by the highest to lowest coverage genes for p in pathways: p.sort(key=lambda g: len(genes2tumors[g]), reverse=True) # sort pathways by weight (highest first) for output pathways_with_weights = [(p, W(genes2tumors, p, alpha)) for p in pathways] pathways_with_weights.sort(key=lambda (p, w): w) pathways_with_weights.reverse() # Output run summary if verbose if verbose: print "done!\n" print "------------- Multi-Dendrix Run Summary ----------------" print "Num Pathways:", t, "\nMin:", k_min, "\nMax:", k_max print "Overlaps:", overlaps, "\nMax Paths / Gene:", t_per_gene print "Alpha:", alpha print "\nPathways:" W_prime = 0 for i in range(1, len(pathways_with_weights) + 1): p, w = pathways_with_weights[i-1] print "\t", ", ".join(p) W_prime += w print "\tW(M) =", w print "W'(M):", W_prime print "\n-----------------------------------------------------\n" return pathways_with_weights ################################################################################ ####### Helper functions for running Multi-Dendrix and analyzing results ####### # Weight function from Dendrix (Vandin et al. (2012) Genome Research 22:375-85).
[docs]def W(genes2tumors, genes, alpha): '''Given a set of genes, calculates their weight W, defined as the difference between their total coverage and coverage overlap Input: genes2tumors mapping, set of genes Output: weight W ''' tumors, coverage_overlap = set(), 0 for gene in genes: targets = genes2tumors[gene] tumors = tumors.union(targets) coverage_overlap += len(targets) coverage = len(tumors) return int( (1+alpha) * coverage - alpha * coverage_overlap )
[docs]def load_db_with_cutoff(file, patient_whitelist=None, gene_whitelist=None, cutoff=0): mutation_data = load_mutation_data(file, patient_whitelist, gene_whitelist) m, n, genespace, patientspace, G2T, T2G = mutation_data if cutoff == 0: return m, n, genespace, patientspace, G2T, T2G for gene in genespace: if len(G2T[gene]) < cutoff: for patient in G2T[gene]: T2G[patient].remove(gene) del G2T[gene] m, genespace = len(G2T.keys()), G2T.keys() return m, n, genespace, patientspace, G2T, T2G
[docs]def load_mutation_data(file, patient_whitelist=None, gene_whitelist=None, tcga_data=True): '''Function to load mutation data. Input: one tab-delimited line per patient, with the patient ID followed by each gene mutated in the patient. Output: genes - set of genes used in this study tumors2genes- maps tumors to their mutations genes2tumors - maps genes to the tumors in which they are mutated ''' T2G, G2T, tumors = {}, {}, set() for line in [l.rstrip() for l in open(file) if not l.startswith('#')]: arr = filter(lambda g: g != "", line.split('\t')) tumor, mutations = arr[0], arr[1:] # only need the first three parts of the ID for TCGA data if tcga_data: tumor = '-'.join(tumor.split('-')[:3]) if gene_whitelist: mutations = filter(lambda g: gene_whitelist[g], mutations) if patient_whitelist and not patient_whitelist[tumor]: continue T2G[tumor] = mutations for gene in mutations: if not gene in G2T.keys(): G2T[gene] = set([tumor]) else: G2T[gene].add(tumor) genespace, patientspace = G2T.keys(), T2G.keys() m, n = len(genespace), len(patientspace) return m, n, genespace, patientspace, G2T, T2G
def parse_args(input_list=None): # Parse arguments import argparse class Args: pass args = Args() description = 'Runs Multi-Dendrix to find the optimal set of t pathways '\ 'of k genes for the weight function W\'.' parser = argparse.ArgumentParser(description=description) parser.add_argument('-m', '--mutation_matrix', required=True, help='File name for mutation data.') parser.add_argument('-c', '--cutoff', type=int, default=0, help='Minimum gene mutation frequency.') parser.add_argument('-p', '--patient_whitelist', default=None, help='File of patients to be included.') parser.add_argument('-bp', '--patient_blacklist', default=None, help='File of patients to be excluded.') parser.add_argument('-g', '--gene_whitelist', default=None, help='File of genes to be included.') parser.add_argument('-bg', '--gene_blacklist', default=None, help='File of genes to be excluded.') parser.add_argument('-t', '--num_pathways', required=True, type=int, help='Desired number of pathways.') parser.add_argument('-k_min', '--min_pathway_size', type=int, help='Minimum pathway size.') parser.add_argument('-k_max', '--max_pathway_size', type=int, help='Maximum pathway size.') parser.add_argument('-k', '--pathway_size', type=int, default=None, help='Pathway size.') parser.add_argument('-a', '--alpha', type=float, default=1.0, help='Parameter that changes weight function W by chang'\ 'ing the tradeoff between coverage and exclusivity') parser.add_argument('-o', '--output_file', default=None, help='Name of output file.') parser.add_argument('-d', '--num_overlaps', type=int, default=0, help='Number of overlaps allowed per pathway.') parser.add_argument('-l', '--pathways_per_gene', type=int, default=1, help='Number of pathways a gene is allowed to be in.') parser.add_argument('--max_time', type=int, default=1e75, help='Maximum amount of time to be spent on an optimization.') parser.add_argument('--max_mem', type=int, default=1e75, help='Maximum tree memory to be used by CPLEX.') parser.add_argument('-q', '--quiet', default=False, action="store_true", help='Quiet output flag.') if input_list: parser.parse_args(input_list, namespace=args) else: parser.parse_args(namespace=args) return args
[docs]def white_and_blacklisting(args, tcga_data=True): # Blacklisting and whitelisting works as follows. If a whitelist is passed in, # only genes/samples in the whitelist and NOT in the blacklist are allowed. If # only a blacklist is passed in, all genes/samples not on the blacklist are # allowed. from collections import defaultdict if args.patient_whitelist: sample2include = defaultdict(lambda : False) patient_whitelist = [l.rstrip().split()[:2] for l in open(args.patient_whitelist) if not l.startswith("#")] if len(patient_whitelist[0]) > 1: samples2types = dict(patient_whitelist) else: samples2types = dict([(patient[0], args.db_name) for patient in patient_whitelist]) # only need the first three parts of the ID for TCGA data if tcga_data: samples2types = dict([('-'.join(p.split('-')[:3]), samples2types[p]) for p in samples2types.keys()]) patient_whitelist = samples2types.keys() for p in patient_whitelist: sample2include[p] = True else: patient_whitelist, samples2types, sample2include = None, None, defaultdict(lambda : True) if args.patient_blacklist: patient_blacklist = [l.rstrip() for l in open(args.patient_blacklist)] for p in patient_blacklist: sample2include[p] = False if args.gene_whitelist: gene2include = defaultdict(lambda : False) gene_whitelist = set([l.split()[0] for l in open(args.gene_whitelist)]) for g in gene_whitelist: gene2include[g] = True else: gene_whitelist, gene2include = None, defaultdict(lambda : True) if args.gene_blacklist: gene_blacklist = [l.rstrip() for l in open(args.gene_blacklist)] for g in gene_blacklist: gene2include[g] = False return gene2include, sample2include, samples2types
def run(args): gene2include, sample2include, samples2types = white_and_blacklisting(args) mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include, gene2include, args.cutoff) m, n, genespace, patientspace, G2T, T2G = mutation_data # Run Multi-Dendrix if args.pathway_size: k_min, k_max = args.pathway_size, args.pathway_size else: k_min, k_max = args.min_pathway_size, args.max_pathway_size pathways_with_weights = multi_dendrix((genespace, T2G, G2T), args.num_pathways, k_min, k_max, args.alpha, args.num_overlaps, args.pathways_per_gene, max_mem=args.max_mem, max_time=args.max_time, verbose=not args.quiet) # Output results rows = [str(weight) + ' ' + ' '.join([g + '[' + ','.join(G2T[g]) + ']' for g in pathway]) for pathway, weight in pathways_with_weights] if args.output_file: open(args.output_file, 'w').write('\n'.join(rows)) return pathways_with_weights if __name__ == "__main__": args = parse_args() run(args)