Multi-Dendrix Logo

Source code for subtype_specific_genes

#!/usr/bin/python

# Load required modules
# Try and load scipy's fisher's exact test function
import sys
try:
	from scipy.stats import fisher_exact as pvalue
	def fisher_exact(tbl): return pvalue(tbl, alternative='greater')[1]
except ImportError:
    try:
        from fisher import pvalue as pvalue
        def fisher_exact(tbl): return pvalue(*tbl).right_tail
    except ImportError:
        print 'Fatal Error: Neither SciPyv0.11 or fisher0.1.4 modules '\
              '(http://goo.gl/zYrLr) are installed.'
        sys.exit(1)

def parse_args(input_list=None):
    # Parse arguments
    import argparse
    class Args: pass
    args = Args()
    description = 'Calculates whether any genes are subtype specific for the '\
                   'given mutation data.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-m', '--mutation_matrix', required=True,
                        help='File name for mutation data.')
    parser.add_argument('-c', '--cutoff', type=int, default=0, 
                        help='Minimum gene mutation frequency.')
    parser.add_argument('-p', '--patient_whitelist', required=True,
                        help='Space-separated file of patient IDs and their '\
                              '(sub)type to be tested against.')
    parser.add_argument('-bp', '--patient_blacklist', default=None,
                        help='File of patients to be excluded.')
    parser.add_argument('-g', '--gene_whitelist', default=None,
                        help='File of genes to be included.')
    parser.add_argument('-bg', '--gene_blacklist', default=None,
                        help='File of genes to be excluded.')
    parser.add_argument('-o', '--output_file', default=None,
                        help='Name of output file.')
    parser.add_argument('--sig_threshold', default=0.05, type=float,
                        help='Significance threshold.')
    parser.add_argument('-a', '--all', default=False, action='store_true',
                        help='Flag to output all associations.')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Flag verbose mode.')

    # If called from the command line, parse command line args.
    if input_list: parser.parse_args(input_list, namespace=args)
    else: parser.parse_args(namespace=args)
    
    return args	

[docs]def ty_contingency_table(ty, ty2mutations, tys, ty2numsamples): type_mutations = len(ty2mutations[ty]) non_type_mutations = sum([len(ty2mutations[ty2]) for ty2 in tys if ty != ty2]) type_normal = ty2numsamples[ty] - len(ty2mutations[ty]) non_type_normal = sum([ty2numsamples[ty2] - len(ty2mutations[ty2]) for ty2 in tys if ty != ty2]) return type_mutations, type_normal, non_type_mutations, non_type_normal
[docs]def subtype_specificity(gene, sample2ty, ty2numsamples, G2T): tys = sorted(ty2numsamples.keys()) num_tests = float(len(tys)) # Count the number of mutations in the subgraph in each cancer type ty2mutations = dict([(ty, set()) for ty in tys]) for sample in G2T[gene]: try: ty2mutations[sample2ty[sample]].add( sample ) except KeyError: continue # ignore samples with no type h = dict() for ty in tys: cont_table = ty_contingency_table(ty, ty2mutations, tys, ty2numsamples) pval = fisher_exact(cont_table) corrected_pval = pval * num_tests if pval * num_tests <=1 else 1 type_mutations, type_normal, non_type_mutations, non_type_normal = cont_table # Results h[ty] = [ type_mutations, type_normal, non_type_mutations, non_type_normal, pval, corrected_pval ] return h
[docs]def load_sample2ty_file(sample2ty_file): sample2ty = dict([l.rstrip().split("\t")[:2] for l in open(sample2ty_file) if not l.startswith("#") ]) tys = sorted(set( [ty for sample, ty in sample2ty.iteritems()])) return sample2ty, tys
[docs]def keep_significant(gene2specificity, threshold): sig_gene2specificity = dict() for g, ty2analysis in gene2specificity.iteritems(): h = dict() for ty, analysis in ty2analysis.iteritems(): if analysis[-1] < threshold: # the corrected pvalue h[ty] = analysis if h.keys() != []: sig_gene2specificity[g] = h return sig_gene2specificity
[docs]def create_subtype_tbl(gene2specificity): header = ['Gene', '(Sub)Type', 'Type_Mutations', 'Type_Normal', 'Non_Type_Mutations', 'Non_Type_Normal', 'P_Value', 'Bonferonni_Corrected_P_Value' ] tbl = [ ] for gene, ty2analysis in gene2specificity.iteritems(): for ty, analysis in ty2analysis.iteritems(): tbl.append( [gene, ty] + analysis ) # Sort rows by (sub)type, then p-value, then gene name tbl.sort(key=lambda arr: (arr[1], arr[-1], arr[0])) tbl = [header] + tbl return [ map(str, row) for row in tbl ]
[docs]def subtype_analysis(mutation_data, patient_whitelist, significant, threshold): # Parse mutation data and load sample2ty file m, n, genespace, patientspace, G2T, T2G = mutation_data sample2ty, tys = load_sample2ty_file(patient_whitelist) # Count the number of samples from each cancer (sub)type ty2numsamples = dict([(ty, 0) for ty in tys]) for sample, ty in sample2ty.iteritems(): ty2numsamples[ty] += 1 gene_specificity = [ subtype_specificity(g, sample2ty, ty2numsamples, G2T) for g in genespace ] gene2specificity = dict(zip(genespace, gene_specificity)) # Prune list if required if significant: gene2specificity = keep_significant(gene2specificity, threshold) return gene2specificity
[docs]def run(args): # Load mutation data from multi_dendrix import white_and_blacklisting, load_db_with_cutoff gene2include, sample2include, sample2ty = white_and_blacklisting(args) mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include, gene2include, args.cutoff) # Conduct subtype analysis gene2specificity = subtype_analysis(mutation_data, args.patient_whitelist, not args.all, args.sig_threshold) # Create TSV table to output results subtype_tbl = create_subtype_tbl(gene2specificity) subtype_output = "\n".join([ "\t".join(row) for row in subtype_tbl ]) # Output results to file if args.output_file: open(args.output_file, 'w').write( subtype_output ) else: print subtype_output
if __name__ == "__main__": run(parse_args())