#!/usr/bin/python
# Import globally required modules
import sys, random
try: import networkx as nx
except ImportError:
print 'Error!'
print '\tCould not import NetworkX (http://networkx.github.com).'
print '\tMake sure NetworkX is in your path.'
sys.exit(1)
from multi_dendrix import *
# Parse args
def parse_args(input_list=None):
# Parse arguments
import argparse
class Args: pass
args = Args()
description = 'Creates permuted matrices for a given set of Multi-Dendrix'\
' mutation data parameters using the MeMO permutation '\
'method. Requires NetworkX.'
parser = argparse.ArgumentParser(description=description)
parser.add_argument('-m', '--mutation_matrix', required=True,
help='File name for mutation data.')
parser.add_argument('-c', '--cutoff', type=int, default=0,
help='Minimum gene mutation frequency.')
parser.add_argument('-p', '--patient_whitelist', default=None,
help='File of patients to be included.')
parser.add_argument('-bp', '--patient_blacklist', default=None,
help='File of patients to be excluded.')
parser.add_argument('-g', '--gene_whitelist', default=None,
help='File of genes to be included.')
parser.add_argument('-bg', '--gene_blacklist', default=None,
help='File of genes to be excluded.')
parser.add_argument('-o', '--output_dir', required=True,
help='Name of output directory.')
parser.add_argument('-s', '--start_index', default=1, type=int,
help='Start index for name of permuted matrices.')
parser.add_argument('-n', '--num_matrices', type=int, default=100,
help='Number of overlaps allowed per pathway.')
parser.add_argument('-q', '--Q', type=int, default=100,
help='Edge swapping parameter.')
parser.add_argument('-v', '--verbose', default=False, action='store_true',
help='Flag verbose mode.')
# If called from the command line, parse command line args.
if input_list: parser.parse_args(input_list, namespace=args)
else: parser.parse_args(namespace=args)
return args
def log(s):
sys.stdout.write(s)
sys.stdout.flush()
[docs]def bipartite_double_edge_swap(G, genes, patients, nswap=1, max_tries=1e75):
'''A slightly modified version of the double_edge_swap function
in NetworkX to preserve the bipartite structure of the graph.'''
if nswap>max_tries:
raise nx.NetworkXError("Number of swaps > number of tries allowed.")
if len(G) < 4:
raise nx.NetworkXError("Graph has less than four nodes.")
# Instead of choosing uniformly at random from a generated edge list,
# this algorithm chooses nonuniformly from the set of nodes with
# probability weighted by degree.
n=0
swapcount=0
keys,degrees=zip(*G.degree().items()) # keys, degree
cdf=nx.utils.cumulative_distribution(degrees) # cdf of degree
while swapcount < nswap:
# pick two random edges without creating edge list
# choose source node indices from discrete distribution
(ui,xi)=nx.utils.discrete_sequence(2,cdistribution=cdf)
if ui==xi:
continue # same source, skip
u=keys[ui] # convert index to label
x=keys[xi]
if (u in genes and x in genes) or (u in patients and x in patients):
continue # both genes, skip
patient1 = u if u in patients else x
gene1 = x if x in genes else u
# choose target uniformly from neighbors
patient2=random.choice( list(G[gene1]) )
gene2=random.choice( list(G[patient1]) )
# don't create parallel edges
if (gene1 not in G[patient1]) and (gene2 not in G[patient2]):
G.add_edge(gene1,patient1)
G.add_edge(gene2,patient2)
G.remove_edge(gene1,patient2)
G.remove_edge(patient1, gene2)
swapcount+=1
if n >= max_tries:
e=('Maximum number of swap attempts (%s) exceeded '%n +
'before desired swaps achieved (%s).'%nswap)
raise nx.NetworkXAlgorithmError(e)
n+=1
return G
[docs]def construct_mutation_graph(G2T, T2G):
nodes = G2T.keys() + T2G.keys()
edges = [ (gene, patient) for gene in G2T.keys() for patient in G2T[gene] ]
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)
return G
[docs]def graph_to_mutation_data(H, genes, patients):
G2T, T2G = dict([(g, set()) for g in genes]), dict( )
for patient in patients:
mutations = H[patient]
T2G[patient] = set( mutations )
for g in mutations: G2T[g].add( patient )
genes, patients = G2T.keys(), T2G.keys()
return len(genes), len(patients), genes, patients, G2T, T2G
[docs]def permute_mutation_data(G, genes, patients, Q=100):
H = G.copy()
bipartite_double_edge_swap(H, genes, patients, nswap=Q * len( G.edges() ))
return graph_to_mutation_data(H, genes, patients)
[docs]def run(args):
# Load mutation data using Multi-Dendrix and output as a temporary file
if args.verbose: log('Loading mutation data...')
gene2include, sample2include, samples2types = white_and_blacklisting(args)
mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include,
gene2include, args.cutoff)
m, n, genespace, patientspace, G2T, T2G = mutation_data
if args.verbose: log('done!\n\n')
# Make sure output directory exists
import os
os.system('mkdir -p ' + args.output_dir)
# Construct bipartite graph from mutation data
if args.verbose: log('Creating bipartite graph...')
G = construct_mutation_graph(G2T, T2G)
if args.verbose:
log('done!\n\n')
print 'Graph has', len( G.edges() ), 'edges among', len( G.nodes() ), 'nodes.\n'
# Create permuted matrices and save to file
for i in range(args.num_matrices):
if args.verbose: log('+')
# Permute bipartite graph and output as a patient adjacency list
mutation_data = permute_mutation_data(G, genespace, patientspace, args.Q)
m, n, genespace, patientspace, G2T, T2G = mutation_data
adj_list = [ p + "\t" + "\t".join( T2G[p] ) for p in patientspace ]
filename = args.output_dir + "/" + str(i + args.start_index) + '.txt'
open(filename, 'w').write('\n'.join(adj_list))
if args.verbose: print
if __name__ == "__main__": run(parse_args())