Source code for multi_dendrix

#!/usr/bin/env python
import sys
try: import cplex
except ImportError:
    print "Couldn't import CPLEX. Check your environment's PYTHONPATH "\
          "variable. For details on the CPLEX python module, please visit "\
          "http://bit.ly/KL7PVc."
    sys.exit(3)

########################## Multi-Dendrix ILP ###################################
# Helper functions for the multi_dendrix ILP function
[docs]def V(tumors2genes, g, tumor):
    '''Returns true if gene g is mutated in the given tumor'''
    return int(g in tumors2genes[tumor])

[docs]def var_to_gene(var):
    '''Extracts the gene name from the variable name used in the ILP'''
    if var.count("->") > 0: return var.split("->")[0]
    else: return var
    
[docs]def path_num(var):
    '''Extracts the pathway number from a variable name, and corrects the index'''
    if var.count("->") > 0:
        try: return int(var.split("->")[1]) - 1
        except ValueError: return 0
    else: return 0

[docs]def update_stdout(message, verbose):
    '''Logs progress for ILP'''
    if verbose:
        print message,
        sys.stdout.flush()

# CPLEX memory options (see http://goo.gl/dJV6F)
MEMORY_ONLY        = 1
ON_DISK_COMPRESSED = 3

# Main function for Multi-Dendrix ILP
[docs]def multi_dendrix((genes, tumors2genes, genes2tumors), t=0, k_min=0,
                  k_max=cplex.infinity, alpha=1.0, overlaps=0.0, t_per_gene=1.0,
                  verbose=False, max_mem=None, max_time=None):
    ''' Implementation of Multi-Dendrix ILP. Requires installation of CPLEX Python.
    Input: genes, tumors2genes, genes2tumors - mutation data from
                                               load_mutation_data() below
    Options: t     - number of pathways
             k_min - minimum pathway size
             k_max - maximum pathway size
             alpha - parameter that changes the weight function W by changing
                     the tradeoff between coverage and coverage overlap
             overlaps - number of overlaps allowed between any pair of pathways
             t_per_gene - number of pathways any gene can be a member of
             verbose - outputs results to stdout
    Output: Writes pathways to file called "multi_dendrix_pathways.txt".
            Each line lists the weight and genes for each pathway.
    '''
    update_stdout("Initializing ILP...", verbose)
    # Default parameters for ILP
    try: ilp, my_obj, my_sense, my_rhs = cplex.Cplex(), [], "L", [1]
    except cplex.exceptions.CplexSolverError:
        print 'Exiting because of CPLEX license issue.'
        sys.exit(2)

    # Set params for ILP
    ilp.set_results_stream(None) # Change this to have CPLEX output to stdout
    ilp.parameters.mip.strategy.file.set(MEMORY_ONLY)
    #if max_time:
    #    ilp.parameters.timelimit.set(max_time)
    #    if verbose: print 'Time limit set to', max_time
    if max_mem:
        ilp.parameters.mip.limits.treememory.set(max_mem)
        if verbose: print 'Max tree memory set to', max_mem

    patients = tumors2genes.keys()

    # The ILP needs a variable for each gene in each pathway it could be a
    # member of. Creates variable names of the form  {gene}-{pathway number}
    # |all_names| = |genes| x paths
    l = map(lambda g: [g + "->" + str(i) for i in range(1, t + 1)], genes)
    all_names, over_is = [item for sublist in l for item in sublist], []

    # initialize r variables
    rs = ["rvariable" + str(i) + "-" + str(j) for i in range(len(patients))
          for j in range(1, t + 1)]
    rs = []
    for j in range(1, t + 1):
        rs.append(["rvariable-" + str(i) + "-" + str(j)
                   for i in range(len(patients))])
    flat_rs =  [item for sublist in rs for item in sublist]
    rs_coefs = [1.0 + alpha] * len(flat_rs)
    C = 65536

    # Encode W' as the objective function
    for v in all_names:
        my_obj.append(len(genes2tumors[var_to_gene(v)]) * -alpha)

    # append r vars to the set of variables and r vars' coefficients to the obj
    # function. genes' vars are already initialized to |genes2tumors[g]|
    my_obj += rs_coefs
    all_names += flat_rs

    if t > 1:
        for i in range(1, t + 1):
            for j in range(i + 1, t + 1):
                over_is += [g + ".." + str(i) + ".." + str(j) for g in genes]
        all_names += over_is
        my_obj += [0.0] * len(over_is)

    # initialize vars and their bounds {0,1}
    lbs, ubs = [0] * len(all_names), [1] * len(all_names)
    my_types = [ilp.variables.type.integer] * len(all_names)
    ilp.variables.add(names = all_names, ub = ubs, lb = lbs,
                      types = my_types, obj = my_obj)
    ilp.objective.set_sense(ilp.objective.sense.maximize)

    # add pathway size constraints, if any
    for i in range(1, t + 1):
        my_names = [g + "->" + str(i) for g in genes]
        size_coefs = [1.0] * len(my_names)
        if k_max != 0 and k_min == k_max:
            ilp.linear_constraints.add(senses = "E", rhs = [k_min],
                                       names = ["eq" + str(i)])
            ilp.linear_constraints.set_linear_components(
                [("eq" + str(i), [my_names, size_coefs])])
        elif type([]) == type(k_max):
            ilp.linear_constraints.add(senses = "E", rhs = [k_max[i-1]],
                                       names = ["eq" + str(i)])
            ilp.linear_constraints.set_linear_components(
                [("eq" + str(i), [my_names, size_coefs])])  
        else:
            ilp.linear_constraints.add(senses = "L", rhs = [k_max],
                                       names = ["max" + str(i)])
            ilp.linear_constraints.set_linear_components(
                [("max" + str(i), [my_names, size_coefs])])
            ilp.linear_constraints.add(senses = "G", rhs = [k_min],
                                       names = ["min" + str(i)])
            ilp.linear_constraints.set_linear_components(
                [("min" + str(i), [my_names, size_coefs])])

    # add non-overlapping pathways constraint
    for g in genes:
        my_names = [g + "->"  + str(i) for i in range(1, t + 1)]
        coefficients = [1] * len(my_names)
        ilp.linear_constraints.add(senses = my_sense, rhs = [t_per_gene],
                                   names = [g])
        ilp.linear_constraints.set_linear_components([(g, [my_names,
                                                           coefficients])])

    # add r constraints
    for i in range(len(patients)):
        coefficients = [V(tumors2genes, g, patients[i]) for g in genes]
        my_coefficients = coefficients + [-1.0]
        for j in range(1, t + 1):
            r = rs[j-1][i]
            my_names = [g + "->" + str(j) for g in genes] + [r]
            ilp.linear_constraints.add(senses = "G", rhs = [0.0],
                                       names = [str(i) + str(j) + "-rn"])
            ilp.linear_constraints.set_linear_components(
                [(str(i) + str(j) + "-rn", [my_names, my_coefficients])])

    # allow at most one overlap per pathway, if there's more than one pathway
    ###########################################################################
    if t > 1 and overlaps > 0:
        # first add pairwise pathway indicators
        for ind in over_is:
            g, i, j = ind.split("..")[0], ind.split("..")[1], ind.split("..")[2]
            my_names = [g + "->"  + i, g + "->" + j, ind]
            coefficients = [1.0, 1.0, -1.0 * (C+1)]
            ilp.linear_constraints.add(senses = "G", rhs = [C * -1.0 + 1.0],
                                       names = [g + i + j + "1"])
            ilp.linear_constraints.set_linear_components(
                [(g + i + j + "1", [my_names, coefficients])])
            coefficients = [1.0, 1.0, -1.0 * C]
            ilp.linear_constraints.add(senses = "L", rhs = [1.0],
                                       names = [g + i + j + "2"])
            ilp.linear_constraints.set_linear_components(
                [(g + i + j + "2", [my_names, coefficients])])

        # then constrain pairwise pathway indicators so only one gene can
        # be shared between two pathways
        for i in range(1, t + 1):
            for j in range(i + 1, t + 1):
                my_names = [g + ".." + str(i) + ".." + str(j) for g in genes]
                coefficients = [1.0] * len(my_names)
                ilp.linear_constraints.add(senses = "L", rhs = [overlaps],
                                           names = ["k" + str(i) + str(j)])
                ilp.linear_constraints.set_linear_components(
                    [("k" + str(i) + str(j), [my_names, coefficients])])
    if verbose: print "done!"
                
    ###########################################################################

    # Solve ILP and convert solution back to genes
    update_stdout("Solving ILP...", verbose)
    ilp.solve()
    if verbose: print "done!"
    ans = ilp.solution.get_values()
    pathways = [[] for i in range(t)]
    for var, val in zip(all_names, ans):
        if str(val) == "1.0" or float(val) > 0.5:
            # be sure to ignore dummy variables
            if var.count("rvariable") == 0 and var.count("..") == 0:
                pathways[path_num(var)].append(var_to_gene(var))
    
    # sort each pathway by the highest to lowest coverage genes
    for p in pathways: p.sort(key=lambda g: len(genes2tumors[g]), reverse=True)
    
    # sort pathways by weight (highest first) for output
    pathways_with_weights = [(p, W(genes2tumors, p, alpha)) for p in pathways]
    pathways_with_weights.sort(key=lambda (p, w): w)
    pathways_with_weights.reverse()

    # Output run summary if verbose
    if verbose:
        print "done!\n"
        print "------------- Multi-Dendrix Run Summary ----------------"
        print "Num Pathways:", t, "\nMin:", k_min, "\nMax:", k_max
        print "Overlaps:", overlaps, "\nMax Paths / Gene:", t_per_gene
        print "Alpha:", alpha
        print "\nPathways:"
        W_prime = 0
        for i in range(1, len(pathways_with_weights) + 1):
            p, w = pathways_with_weights[i-1]
            print "\t", ", ".join(p)
            W_prime += w
            print "\tW(M) =", w
        print "W'(M):", W_prime
        print "\n-----------------------------------------------------\n"
    return pathways_with_weights

################################################################################

####### Helper functions for running Multi-Dendrix and analyzing results #######
# Weight function from Dendrix (Vandin et al. (2012) Genome Research 22:375-85).
[docs]def W(genes2tumors, genes, alpha):
    '''Given a set of genes, calculates their weight W, defined as the difference
    between their total coverage and coverage overlap
    Input: genes2tumors mapping, set of genes
    Output: weight W
    '''
    tumors, coverage_overlap = set(), 0
    for gene in genes:
        targets  = genes2tumors[gene]
        tumors = tumors.union(targets)
        coverage_overlap += len(targets)
    coverage = len(tumors)
    return int( (1+alpha) * coverage - alpha * coverage_overlap )

[docs]def load_db_with_cutoff(file, patient_whitelist=None, gene_whitelist=None,
                        cutoff=0):
    mutation_data = load_mutation_data(file, patient_whitelist, gene_whitelist)
    m, n, genespace, patientspace, G2T, T2G = mutation_data
    if cutoff == 0: return m, n, genespace, patientspace, G2T, T2G
    for gene in genespace:
        if len(G2T[gene]) < cutoff:
            for patient in G2T[gene]:
                T2G[patient].remove(gene)
            del G2T[gene]
    m, genespace = len(G2T.keys()), G2T.keys()
    return m, n, genespace, patientspace, G2T, T2G    

[docs]def load_mutation_data(file, patient_whitelist=None, gene_whitelist=None, tcga_data=True):
    '''Function to load mutation data.
    Input: one tab-delimited line per patient, with the
    patient ID followed by each gene mutated in the patient.
    Output: genes - set of genes used in this study
            tumors2genes- maps tumors to their mutations
            genes2tumors - maps genes to the tumors in which they are mutated
    '''
    T2G, G2T, tumors = {}, {}, set()
    for line in [l.rstrip() for l in open(file) if not l.startswith('#')]:
        arr = filter(lambda g: g != "", line.split('\t'))
        tumor, mutations = arr[0], arr[1:]
        # only need the first three parts of the ID for TCGA data
        if tcga_data: tumor = '-'.join(tumor.split('-')[:3])

        if gene_whitelist:
            mutations = filter(lambda g: gene_whitelist[g], mutations)
        if patient_whitelist and not patient_whitelist[tumor]: continue
        
        T2G[tumor] = mutations
        for gene in mutations:
            if not gene in G2T.keys(): G2T[gene] = set([tumor])
            else: G2T[gene].add(tumor)

    genespace, patientspace = G2T.keys(), T2G.keys()
    m, n = len(genespace), len(patientspace)
    return m, n, genespace, patientspace, G2T, T2G

def parse_args(input_list=None):
    # Parse arguments
    import argparse
    class Args: pass
    args = Args()
    description = 'Runs Multi-Dendrix to find the optimal set of t pathways '\
                  'of k genes for the weight function W\'.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-m', '--mutation_matrix', required=True,
                        help='File name for mutation data.')
    parser.add_argument('-c', '--cutoff', type=int, default=0, 
                        help='Minimum gene mutation frequency.')
    parser.add_argument('-p', '--patient_whitelist', default=None,
                        help='File of patients to be included.')
    parser.add_argument('-bp', '--patient_blacklist', default=None,
                        help='File of patients to be excluded.')
    parser.add_argument('-g', '--gene_whitelist', default=None,
                        help='File of genes to be included.')
    parser.add_argument('-bg', '--gene_blacklist', default=None,
                        help='File of genes to be excluded.')
    parser.add_argument('-t', '--num_pathways', required=True, type=int,
                        help='Desired number of pathways.')
    parser.add_argument('-k_min', '--min_pathway_size', type=int,
                        help='Minimum pathway size.')
    parser.add_argument('-k_max', '--max_pathway_size', type=int,
                        help='Maximum pathway size.')
    parser.add_argument('-k', '--pathway_size', type=int, default=None,
                        help='Pathway size.')
    parser.add_argument('-a', '--alpha', type=float, default=1.0,
                        help='Parameter that changes weight function W by chang'\
                        'ing the tradeoff between coverage and exclusivity')
    parser.add_argument('-o', '--output_file', default=None,
                        help='Name of output file.')    
    parser.add_argument('-d', '--num_overlaps', type=int, default=0,
                        help='Number of overlaps allowed per pathway.')    
    parser.add_argument('-l', '--pathways_per_gene', type=int, default=1,
                        help='Number of pathways a gene is allowed to be in.') 
    parser.add_argument('--max_time', type=int, default=1e75,
                        help='Maximum amount of time to be spent on an optimization.')   
    parser.add_argument('--max_mem', type=int, default=1e75,
                        help='Maximum tree memory to be used by CPLEX.')
    parser.add_argument('-q', '--quiet', default=False, action="store_true",
                        help='Quiet output flag.')
    if input_list: parser.parse_args(input_list, namespace=args)
    else: parser.parse_args(namespace=args)
    return args

[docs]def white_and_blacklisting(args, tcga_data=True):
    # Blacklisting and whitelisting works as follows. If a whitelist is passed in,
    # only genes/samples in the whitelist and NOT in the blacklist are allowed. If
    # only a blacklist is passed in, all genes/samples not on the blacklist are
    # allowed.
    from collections import defaultdict
    if args.patient_whitelist:
        sample2include = defaultdict(lambda : False)
        patient_whitelist = [l.rstrip().split()[:2] for l in open(args.patient_whitelist)
                             if not l.startswith("#")]

        if len(patient_whitelist[0]) > 1: samples2types = dict(patient_whitelist)
        else:
            samples2types = dict([(patient[0], args.db_name)
                                  for patient in patient_whitelist])
        # only need the first three parts of the ID for TCGA data
        if tcga_data:
            samples2types = dict([('-'.join(p.split('-')[:3]), samples2types[p])
                                  for p in samples2types.keys()])

        patient_whitelist =  samples2types.keys()
        for p in patient_whitelist: sample2include[p] = True
    else: patient_whitelist, samples2types, sample2include = None, None, defaultdict(lambda : True)
    if args.patient_blacklist:
        patient_blacklist = [l.rstrip() for l in open(args.patient_blacklist)]
        for p in patient_blacklist: sample2include[p] = False

    if args.gene_whitelist:
        gene2include = defaultdict(lambda : False)
        gene_whitelist = set([l.split()[0] for l in open(args.gene_whitelist)])
        for g in gene_whitelist: gene2include[g] = True
    else: gene_whitelist, gene2include = None, defaultdict(lambda : True)
    if args.gene_blacklist:
        gene_blacklist = [l.rstrip() for l in open(args.gene_blacklist)]
        for g in gene_blacklist: gene2include[g] = False
    return gene2include, sample2include, samples2types

def run(args):
    gene2include, sample2include, samples2types = white_and_blacklisting(args)
    mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include,
                                       gene2include, args.cutoff)
    m, n, genespace, patientspace, G2T, T2G = mutation_data

    # Run Multi-Dendrix
    if args.pathway_size: k_min, k_max = args.pathway_size, args.pathway_size
    else: k_min, k_max = args.min_pathway_size, args.max_pathway_size
    pathways_with_weights = multi_dendrix((genespace, T2G, G2T), args.num_pathways,
                                          k_min, k_max, args.alpha,
                                          args.num_overlaps, args.pathways_per_gene,
                                          max_mem=args.max_mem, max_time=args.max_time,
                                          verbose=not args.quiet)

    # Output results
    rows = [str(weight) + ' ' + ' '.join([g + '[' + ','.join(G2T[g]) + ']'
                                          for g in pathway])
            for pathway, weight in pathways_with_weights]
    if args.output_file:
        open(args.output_file, 'w').write('\n'.join(rows))
    return pathways_with_weights

if __name__ == "__main__":
    args = parse_args()
    run(args)
Navigation

Quick search

Source code for multi_dendrix

Navigation