#!/usr/bin/env python
import sys
try: import cplex
except ImportError:
print "Couldn't import CPLEX. Check your environment's PYTHONPATH "\
"variable. For details on the CPLEX python module, please visit "\
"http://bit.ly/KL7PVc."
sys.exit(3)
########################## Multi-Dendrix ILP ###################################
# Helper functions for the multi_dendrix ILP function
[docs]def V(tumors2genes, g, tumor):
'''Returns true if gene g is mutated in the given tumor'''
return int(g in tumors2genes[tumor])
[docs]def var_to_gene(var):
'''Extracts the gene name from the variable name used in the ILP'''
if var.count("->") > 0: return var.split("->")[0]
else: return var
[docs]def path_num(var):
'''Extracts the pathway number from a variable name, and corrects the index'''
if var.count("->") > 0:
try: return int(var.split("->")[1]) - 1
except ValueError: return 0
else: return 0
[docs]def update_stdout(message, verbose):
'''Logs progress for ILP'''
if verbose:
print message,
sys.stdout.flush()
# CPLEX memory options (see http://goo.gl/dJV6F)
MEMORY_ONLY = 1
ON_DISK_COMPRESSED = 3
# Main function for Multi-Dendrix ILP
[docs]def multi_dendrix((genes, tumors2genes, genes2tumors), t=0, k_min=0,
k_max=cplex.infinity, alpha=1.0, overlaps=0.0, t_per_gene=1.0,
verbose=False, max_mem=None, max_time=None):
''' Implementation of Multi-Dendrix ILP. Requires installation of CPLEX Python.
Input: genes, tumors2genes, genes2tumors - mutation data from
load_mutation_data() below
Options: t - number of pathways
k_min - minimum pathway size
k_max - maximum pathway size
alpha - parameter that changes the weight function W by changing
the tradeoff between coverage and coverage overlap
overlaps - number of overlaps allowed between any pair of pathways
t_per_gene - number of pathways any gene can be a member of
verbose - outputs results to stdout
Output: Writes pathways to file called "multi_dendrix_pathways.txt".
Each line lists the weight and genes for each pathway.
'''
update_stdout("Initializing ILP...", verbose)
# Default parameters for ILP
try: ilp, my_obj, my_sense, my_rhs = cplex.Cplex(), [], "L", [1]
except cplex.exceptions.CplexSolverError:
print 'Exiting because of CPLEX license issue.'
sys.exit(2)
# Set params for ILP
ilp.set_results_stream(None) # Change this to have CPLEX output to stdout
ilp.parameters.mip.strategy.file.set(MEMORY_ONLY)
#if max_time:
# ilp.parameters.timelimit.set(max_time)
# if verbose: print 'Time limit set to', max_time
if max_mem:
ilp.parameters.mip.limits.treememory.set(max_mem)
if verbose: print 'Max tree memory set to', max_mem
patients = tumors2genes.keys()
# The ILP needs a variable for each gene in each pathway it could be a
# member of. Creates variable names of the form {gene}-{pathway number}
# |all_names| = |genes| x paths
l = map(lambda g: [g + "->" + str(i) for i in range(1, t + 1)], genes)
all_names, over_is = [item for sublist in l for item in sublist], []
# initialize r variables
rs = ["rvariable" + str(i) + "-" + str(j) for i in range(len(patients))
for j in range(1, t + 1)]
rs = []
for j in range(1, t + 1):
rs.append(["rvariable-" + str(i) + "-" + str(j)
for i in range(len(patients))])
flat_rs = [item for sublist in rs for item in sublist]
rs_coefs = [1.0 + alpha] * len(flat_rs)
C = 65536
# Encode W' as the objective function
for v in all_names:
my_obj.append(len(genes2tumors[var_to_gene(v)]) * -alpha)
# append r vars to the set of variables and r vars' coefficients to the obj
# function. genes' vars are already initialized to |genes2tumors[g]|
my_obj += rs_coefs
all_names += flat_rs
if t > 1:
for i in range(1, t + 1):
for j in range(i + 1, t + 1):
over_is += [g + ".." + str(i) + ".." + str(j) for g in genes]
all_names += over_is
my_obj += [0.0] * len(over_is)
# initialize vars and their bounds {0,1}
lbs, ubs = [0] * len(all_names), [1] * len(all_names)
my_types = [ilp.variables.type.integer] * len(all_names)
ilp.variables.add(names = all_names, ub = ubs, lb = lbs,
types = my_types, obj = my_obj)
ilp.objective.set_sense(ilp.objective.sense.maximize)
# add pathway size constraints, if any
for i in range(1, t + 1):
my_names = [g + "->" + str(i) for g in genes]
size_coefs = [1.0] * len(my_names)
if k_max != 0 and k_min == k_max:
ilp.linear_constraints.add(senses = "E", rhs = [k_min],
names = ["eq" + str(i)])
ilp.linear_constraints.set_linear_components(
[("eq" + str(i), [my_names, size_coefs])])
elif type([]) == type(k_max):
ilp.linear_constraints.add(senses = "E", rhs = [k_max[i-1]],
names = ["eq" + str(i)])
ilp.linear_constraints.set_linear_components(
[("eq" + str(i), [my_names, size_coefs])])
else:
ilp.linear_constraints.add(senses = "L", rhs = [k_max],
names = ["max" + str(i)])
ilp.linear_constraints.set_linear_components(
[("max" + str(i), [my_names, size_coefs])])
ilp.linear_constraints.add(senses = "G", rhs = [k_min],
names = ["min" + str(i)])
ilp.linear_constraints.set_linear_components(
[("min" + str(i), [my_names, size_coefs])])
# add non-overlapping pathways constraint
for g in genes:
my_names = [g + "->" + str(i) for i in range(1, t + 1)]
coefficients = [1] * len(my_names)
ilp.linear_constraints.add(senses = my_sense, rhs = [t_per_gene],
names = [g])
ilp.linear_constraints.set_linear_components([(g, [my_names,
coefficients])])
# add r constraints
for i in range(len(patients)):
coefficients = [V(tumors2genes, g, patients[i]) for g in genes]
my_coefficients = coefficients + [-1.0]
for j in range(1, t + 1):
r = rs[j-1][i]
my_names = [g + "->" + str(j) for g in genes] + [r]
ilp.linear_constraints.add(senses = "G", rhs = [0.0],
names = [str(i) + str(j) + "-rn"])
ilp.linear_constraints.set_linear_components(
[(str(i) + str(j) + "-rn", [my_names, my_coefficients])])
# allow at most one overlap per pathway, if there's more than one pathway
###########################################################################
if t > 1 and overlaps > 0:
# first add pairwise pathway indicators
for ind in over_is:
g, i, j = ind.split("..")[0], ind.split("..")[1], ind.split("..")[2]
my_names = [g + "->" + i, g + "->" + j, ind]
coefficients = [1.0, 1.0, -1.0 * (C+1)]
ilp.linear_constraints.add(senses = "G", rhs = [C * -1.0 + 1.0],
names = [g + i + j + "1"])
ilp.linear_constraints.set_linear_components(
[(g + i + j + "1", [my_names, coefficients])])
coefficients = [1.0, 1.0, -1.0 * C]
ilp.linear_constraints.add(senses = "L", rhs = [1.0],
names = [g + i + j + "2"])
ilp.linear_constraints.set_linear_components(
[(g + i + j + "2", [my_names, coefficients])])
# then constrain pairwise pathway indicators so only one gene can
# be shared between two pathways
for i in range(1, t + 1):
for j in range(i + 1, t + 1):
my_names = [g + ".." + str(i) + ".." + str(j) for g in genes]
coefficients = [1.0] * len(my_names)
ilp.linear_constraints.add(senses = "L", rhs = [overlaps],
names = ["k" + str(i) + str(j)])
ilp.linear_constraints.set_linear_components(
[("k" + str(i) + str(j), [my_names, coefficients])])
if verbose: print "done!"
###########################################################################
# Solve ILP and convert solution back to genes
update_stdout("Solving ILP...", verbose)
ilp.solve()
if verbose: print "done!"
ans = ilp.solution.get_values()
pathways = [[] for i in range(t)]
for var, val in zip(all_names, ans):
if str(val) == "1.0" or float(val) > 0.5:
# be sure to ignore dummy variables
if var.count("rvariable") == 0 and var.count("..") == 0:
pathways[path_num(var)].append(var_to_gene(var))
# sort each pathway by the highest to lowest coverage genes
for p in pathways: p.sort(key=lambda g: len(genes2tumors[g]), reverse=True)
# sort pathways by weight (highest first) for output
pathways_with_weights = [(p, W(genes2tumors, p, alpha)) for p in pathways]
pathways_with_weights.sort(key=lambda (p, w): w)
pathways_with_weights.reverse()
# Output run summary if verbose
if verbose:
print "done!\n"
print "------------- Multi-Dendrix Run Summary ----------------"
print "Num Pathways:", t, "\nMin:", k_min, "\nMax:", k_max
print "Overlaps:", overlaps, "\nMax Paths / Gene:", t_per_gene
print "Alpha:", alpha
print "\nPathways:"
W_prime = 0
for i in range(1, len(pathways_with_weights) + 1):
p, w = pathways_with_weights[i-1]
print "\t", ", ".join(p)
W_prime += w
print "\tW(M) =", w
print "W'(M):", W_prime
print "\n-----------------------------------------------------\n"
return pathways_with_weights
################################################################################
####### Helper functions for running Multi-Dendrix and analyzing results #######
# Weight function from Dendrix (Vandin et al. (2012) Genome Research 22:375-85).
[docs]def W(genes2tumors, genes, alpha):
'''Given a set of genes, calculates their weight W, defined as the difference
between their total coverage and coverage overlap
Input: genes2tumors mapping, set of genes
Output: weight W
'''
tumors, coverage_overlap = set(), 0
for gene in genes:
targets = genes2tumors[gene]
tumors = tumors.union(targets)
coverage_overlap += len(targets)
coverage = len(tumors)
return int( (1+alpha) * coverage - alpha * coverage_overlap )
[docs]def load_db_with_cutoff(file, patient_whitelist=None, gene_whitelist=None,
cutoff=0):
mutation_data = load_mutation_data(file, patient_whitelist, gene_whitelist)
m, n, genespace, patientspace, G2T, T2G = mutation_data
if cutoff == 0: return m, n, genespace, patientspace, G2T, T2G
for gene in genespace:
if len(G2T[gene]) < cutoff:
for patient in G2T[gene]:
T2G[patient].remove(gene)
del G2T[gene]
m, genespace = len(G2T.keys()), G2T.keys()
return m, n, genespace, patientspace, G2T, T2G
[docs]def load_mutation_data(file, patient_whitelist=None, gene_whitelist=None, tcga_data=True):
'''Function to load mutation data.
Input: one tab-delimited line per patient, with the
patient ID followed by each gene mutated in the patient.
Output: genes - set of genes used in this study
tumors2genes- maps tumors to their mutations
genes2tumors - maps genes to the tumors in which they are mutated
'''
T2G, G2T, tumors = {}, {}, set()
for line in [l.rstrip() for l in open(file) if not l.startswith('#')]:
arr = filter(lambda g: g != "", line.split('\t'))
tumor, mutations = arr[0], arr[1:]
# only need the first three parts of the ID for TCGA data
if tcga_data: tumor = '-'.join(tumor.split('-')[:3])
if gene_whitelist:
mutations = filter(lambda g: gene_whitelist[g], mutations)
if patient_whitelist and not patient_whitelist[tumor]: continue
T2G[tumor] = mutations
for gene in mutations:
if not gene in G2T.keys(): G2T[gene] = set([tumor])
else: G2T[gene].add(tumor)
genespace, patientspace = G2T.keys(), T2G.keys()
m, n = len(genespace), len(patientspace)
return m, n, genespace, patientspace, G2T, T2G
def parse_args(input_list=None):
# Parse arguments
import argparse
class Args: pass
args = Args()
description = 'Runs Multi-Dendrix to find the optimal set of t pathways '\
'of k genes for the weight function W\'.'
parser = argparse.ArgumentParser(description=description)
parser.add_argument('-m', '--mutation_matrix', required=True,
help='File name for mutation data.')
parser.add_argument('-c', '--cutoff', type=int, default=0,
help='Minimum gene mutation frequency.')
parser.add_argument('-p', '--patient_whitelist', default=None,
help='File of patients to be included.')
parser.add_argument('-bp', '--patient_blacklist', default=None,
help='File of patients to be excluded.')
parser.add_argument('-g', '--gene_whitelist', default=None,
help='File of genes to be included.')
parser.add_argument('-bg', '--gene_blacklist', default=None,
help='File of genes to be excluded.')
parser.add_argument('-t', '--num_pathways', required=True, type=int,
help='Desired number of pathways.')
parser.add_argument('-k_min', '--min_pathway_size', type=int,
help='Minimum pathway size.')
parser.add_argument('-k_max', '--max_pathway_size', type=int,
help='Maximum pathway size.')
parser.add_argument('-k', '--pathway_size', type=int, default=None,
help='Pathway size.')
parser.add_argument('-a', '--alpha', type=float, default=1.0,
help='Parameter that changes weight function W by chang'\
'ing the tradeoff between coverage and exclusivity')
parser.add_argument('-o', '--output_file', default=None,
help='Name of output file.')
parser.add_argument('-d', '--num_overlaps', type=int, default=0,
help='Number of overlaps allowed per pathway.')
parser.add_argument('-l', '--pathways_per_gene', type=int, default=1,
help='Number of pathways a gene is allowed to be in.')
parser.add_argument('--max_time', type=int, default=1e75,
help='Maximum amount of time to be spent on an optimization.')
parser.add_argument('--max_mem', type=int, default=1e75,
help='Maximum tree memory to be used by CPLEX.')
parser.add_argument('-q', '--quiet', default=False, action="store_true",
help='Quiet output flag.')
if input_list: parser.parse_args(input_list, namespace=args)
else: parser.parse_args(namespace=args)
return args
[docs]def white_and_blacklisting(args, tcga_data=True):
# Blacklisting and whitelisting works as follows. If a whitelist is passed in,
# only genes/samples in the whitelist and NOT in the blacklist are allowed. If
# only a blacklist is passed in, all genes/samples not on the blacklist are
# allowed.
from collections import defaultdict
if args.patient_whitelist:
sample2include = defaultdict(lambda : False)
patient_whitelist = [l.rstrip().split()[:2] for l in open(args.patient_whitelist)
if not l.startswith("#")]
if len(patient_whitelist[0]) > 1: samples2types = dict(patient_whitelist)
else:
samples2types = dict([(patient[0], args.db_name)
for patient in patient_whitelist])
# only need the first three parts of the ID for TCGA data
if tcga_data:
samples2types = dict([('-'.join(p.split('-')[:3]), samples2types[p])
for p in samples2types.keys()])
patient_whitelist = samples2types.keys()
for p in patient_whitelist: sample2include[p] = True
else: patient_whitelist, samples2types, sample2include = None, None, defaultdict(lambda : True)
if args.patient_blacklist:
patient_blacklist = [l.rstrip() for l in open(args.patient_blacklist)]
for p in patient_blacklist: sample2include[p] = False
if args.gene_whitelist:
gene2include = defaultdict(lambda : False)
gene_whitelist = set([l.split()[0] for l in open(args.gene_whitelist)])
for g in gene_whitelist: gene2include[g] = True
else: gene_whitelist, gene2include = None, defaultdict(lambda : True)
if args.gene_blacklist:
gene_blacklist = [l.rstrip() for l in open(args.gene_blacklist)]
for g in gene_blacklist: gene2include[g] = False
return gene2include, sample2include, samples2types
def run(args):
gene2include, sample2include, samples2types = white_and_blacklisting(args)
mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include,
gene2include, args.cutoff)
m, n, genespace, patientspace, G2T, T2G = mutation_data
# Run Multi-Dendrix
if args.pathway_size: k_min, k_max = args.pathway_size, args.pathway_size
else: k_min, k_max = args.min_pathway_size, args.max_pathway_size
pathways_with_weights = multi_dendrix((genespace, T2G, G2T), args.num_pathways,
k_min, k_max, args.alpha,
args.num_overlaps, args.pathways_per_gene,
max_mem=args.max_mem, max_time=args.max_time,
verbose=not args.quiet)
# Output results
rows = [str(weight) + ' ' + ' '.join([g + '[' + ','.join(G2T[g]) + ']'
for g in pathway])
for pathway, weight in pathways_with_weights]
if args.output_file:
open(args.output_file, 'w').write('\n'.join(rows))
return pathways_with_weights
if __name__ == "__main__":
args = parse_args()
run(args)