#!/usr/bin/python
# Load required modules
# Try and load scipy's fisher's exact test function
import sys
try:
from scipy.stats import fisher_exact as pvalue
def fisher_exact(tbl): return pvalue(tbl, alternative='greater')[1]
except ImportError:
try:
from fisher import pvalue as pvalue
def fisher_exact(tbl): return pvalue(*tbl).right_tail
except ImportError:
print 'Fatal Error: Neither SciPyv0.11 or fisher0.1.4 modules '\
'(http://goo.gl/zYrLr) are installed.'
sys.exit(1)
def parse_args(input_list=None):
# Parse arguments
import argparse
class Args: pass
args = Args()
description = 'Calculates whether any genes are subtype specific for the '\
'given mutation data.'
parser = argparse.ArgumentParser(description=description)
parser.add_argument('-m', '--mutation_matrix', required=True,
help='File name for mutation data.')
parser.add_argument('-c', '--cutoff', type=int, default=0,
help='Minimum gene mutation frequency.')
parser.add_argument('-p', '--patient_whitelist', required=True,
help='Space-separated file of patient IDs and their '\
'(sub)type to be tested against.')
parser.add_argument('-bp', '--patient_blacklist', default=None,
help='File of patients to be excluded.')
parser.add_argument('-g', '--gene_whitelist', default=None,
help='File of genes to be included.')
parser.add_argument('-bg', '--gene_blacklist', default=None,
help='File of genes to be excluded.')
parser.add_argument('-o', '--output_file', default=None,
help='Name of output file.')
parser.add_argument('--sig_threshold', default=0.05, type=float,
help='Significance threshold.')
parser.add_argument('-a', '--all', default=False, action='store_true',
help='Flag to output all associations.')
parser.add_argument('-v', '--verbose', default=False, action='store_true',
help='Flag verbose mode.')
# If called from the command line, parse command line args.
if input_list: parser.parse_args(input_list, namespace=args)
else: parser.parse_args(namespace=args)
return args
[docs]def ty_contingency_table(ty, ty2mutations, tys, ty2numsamples):
type_mutations = len(ty2mutations[ty])
non_type_mutations = sum([len(ty2mutations[ty2]) for ty2 in tys if ty != ty2])
type_normal = ty2numsamples[ty] - len(ty2mutations[ty])
non_type_normal = sum([ty2numsamples[ty2] - len(ty2mutations[ty2]) for ty2 in tys if ty != ty2])
return type_mutations, type_normal, non_type_mutations, non_type_normal
[docs]def subtype_specificity(gene, sample2ty, ty2numsamples, G2T):
tys = sorted(ty2numsamples.keys())
num_tests = float(len(tys))
# Count the number of mutations in the subgraph in each cancer type
ty2mutations = dict([(ty, set()) for ty in tys])
for sample in G2T[gene]:
try: ty2mutations[sample2ty[sample]].add( sample )
except KeyError: continue # ignore samples with no type
h = dict()
for ty in tys:
cont_table = ty_contingency_table(ty, ty2mutations, tys, ty2numsamples)
pval = fisher_exact(cont_table)
corrected_pval = pval * num_tests if pval * num_tests <=1 else 1
type_mutations, type_normal, non_type_mutations, non_type_normal = cont_table
# Results
h[ty] = [ type_mutations, type_normal, non_type_mutations,
non_type_normal, pval, corrected_pval ]
return h
[docs]def load_sample2ty_file(sample2ty_file):
sample2ty = dict([l.rstrip().split("\t")[:2] for l in open(sample2ty_file)
if not l.startswith("#") ])
tys = sorted(set( [ty for sample, ty in sample2ty.iteritems()]))
return sample2ty, tys
[docs]def keep_significant(gene2specificity, threshold):
sig_gene2specificity = dict()
for g, ty2analysis in gene2specificity.iteritems():
h = dict()
for ty, analysis in ty2analysis.iteritems():
if analysis[-1] < threshold: # the corrected pvalue
h[ty] = analysis
if h.keys() != []:
sig_gene2specificity[g] = h
return sig_gene2specificity
[docs]def create_subtype_tbl(gene2specificity):
header = ['Gene', '(Sub)Type', 'Type_Mutations', 'Type_Normal',
'Non_Type_Mutations', 'Non_Type_Normal', 'P_Value',
'Bonferonni_Corrected_P_Value' ]
tbl = [ ]
for gene, ty2analysis in gene2specificity.iteritems():
for ty, analysis in ty2analysis.iteritems():
tbl.append( [gene, ty] + analysis )
# Sort rows by (sub)type, then p-value, then gene name
tbl.sort(key=lambda arr: (arr[1], arr[-1], arr[0]))
tbl = [header] + tbl
return [ map(str, row) for row in tbl ]
[docs]def subtype_analysis(mutation_data, patient_whitelist, significant, threshold):
# Parse mutation data and load sample2ty file
m, n, genespace, patientspace, G2T, T2G = mutation_data
sample2ty, tys = load_sample2ty_file(patient_whitelist)
# Count the number of samples from each cancer (sub)type
ty2numsamples = dict([(ty, 0) for ty in tys])
for sample, ty in sample2ty.iteritems(): ty2numsamples[ty] += 1
gene_specificity = [ subtype_specificity(g, sample2ty, ty2numsamples, G2T)
for g in genespace ]
gene2specificity = dict(zip(genespace, gene_specificity))
# Prune list if required
if significant:
gene2specificity = keep_significant(gene2specificity, threshold)
return gene2specificity
[docs]def run(args):
# Load mutation data
from multi_dendrix import white_and_blacklisting, load_db_with_cutoff
gene2include, sample2include, sample2ty = white_and_blacklisting(args)
mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include,
gene2include, args.cutoff)
# Conduct subtype analysis
gene2specificity = subtype_analysis(mutation_data, args.patient_whitelist,
not args.all, args.sig_threshold)
# Create TSV table to output results
subtype_tbl = create_subtype_tbl(gene2specificity)
subtype_output = "\n".join([ "\t".join(row) for row in subtype_tbl ])
# Output results to file
if args.output_file:
open(args.output_file, 'w').write( subtype_output )
else:
print subtype_output
if __name__ == "__main__": run(parse_args())