Source code for subtype_specific_genes

#!/usr/bin/python

# Load required modules
# Try and load scipy's fisher's exact test function
import sys
try:
	from scipy.stats import fisher_exact as pvalue
	def fisher_exact(tbl): return pvalue(tbl, alternative='greater')[1]
except ImportError:
    try:
        from fisher import pvalue as pvalue
        def fisher_exact(tbl): return pvalue(*tbl).right_tail
    except ImportError:
        print 'Fatal Error: Neither SciPyv0.11 or fisher0.1.4 modules '\
              '(http://goo.gl/zYrLr) are installed.'
        sys.exit(1)

def parse_args(input_list=None):
    # Parse arguments
    import argparse
    class Args: pass
    args = Args()
    description = 'Calculates whether any genes are subtype specific for the '\
                   'given mutation data.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-m', '--mutation_matrix', required=True,
                        help='File name for mutation data.')
    parser.add_argument('-c', '--cutoff', type=int, default=0, 
                        help='Minimum gene mutation frequency.')
    parser.add_argument('-p', '--patient_whitelist', required=True,
                        help='Space-separated file of patient IDs and their '\
                              '(sub)type to be tested against.')
    parser.add_argument('-bp', '--patient_blacklist', default=None,
                        help='File of patients to be excluded.')
    parser.add_argument('-g', '--gene_whitelist', default=None,
                        help='File of genes to be included.')
    parser.add_argument('-bg', '--gene_blacklist', default=None,
                        help='File of genes to be excluded.')
    parser.add_argument('-o', '--output_file', default=None,
                        help='Name of output file.')
    parser.add_argument('--sig_threshold', default=0.05, type=float,
                        help='Significance threshold.')
    parser.add_argument('-a', '--all', default=False, action='store_true',
                        help='Flag to output all associations.')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Flag verbose mode.')

    # If called from the command line, parse command line args.
    if input_list: parser.parse_args(input_list, namespace=args)
    else: parser.parse_args(namespace=args)
    
    return args	

[docs]def ty_contingency_table(ty, ty2mutations, tys, ty2numsamples):
    type_mutations = len(ty2mutations[ty])
    non_type_mutations = sum([len(ty2mutations[ty2]) for ty2 in tys if ty != ty2])

    type_normal = ty2numsamples[ty] - len(ty2mutations[ty])
    non_type_normal = sum([ty2numsamples[ty2] - len(ty2mutations[ty2]) for ty2 in tys if ty != ty2])

    return type_mutations, type_normal, non_type_mutations, non_type_normal

[docs]def subtype_specificity(gene, sample2ty, ty2numsamples, G2T):
    tys = sorted(ty2numsamples.keys())
    num_tests = float(len(tys))    

    # Count the number of mutations in the subgraph in each cancer type
    ty2mutations = dict([(ty, set()) for ty in tys])
    for sample in G2T[gene]:
        try: ty2mutations[sample2ty[sample]].add( sample )
        except KeyError: continue #  ignore samples with no type

    h = dict()
    for ty in tys:
        cont_table = ty_contingency_table(ty, ty2mutations, tys, ty2numsamples)
        pval = fisher_exact(cont_table)
        corrected_pval = pval * num_tests if pval * num_tests <=1 else 1
        type_mutations, type_normal, non_type_mutations, non_type_normal = cont_table
        
        # Results
        h[ty] = [ type_mutations, type_normal, non_type_mutations,
                  non_type_normal, pval, corrected_pval ]
    return h

[docs]def load_sample2ty_file(sample2ty_file):
    sample2ty = dict([l.rstrip().split("\t")[:2] for l in open(sample2ty_file)
    	              if not l.startswith("#") ])
    tys = sorted(set( [ty for sample, ty in sample2ty.iteritems()]))
    return sample2ty, tys

[docs]def keep_significant(gene2specificity, threshold):
	sig_gene2specificity = dict()
	for g, ty2analysis in gene2specificity.iteritems():
		h = dict()
		for ty, analysis in ty2analysis.iteritems():
			if analysis[-1] < threshold: # the corrected pvalue
				h[ty] = analysis
		
		if h.keys() != []:
			sig_gene2specificity[g] = h

	return sig_gene2specificity

[docs]def create_subtype_tbl(gene2specificity):
    header = ['Gene', '(Sub)Type', 'Type_Mutations', 'Type_Normal',
              'Non_Type_Mutations', 'Non_Type_Normal', 'P_Value',
              'Bonferonni_Corrected_P_Value' ]
    tbl = [  ]
    for gene, ty2analysis in gene2specificity.iteritems():
        for ty, analysis in ty2analysis.iteritems():
            tbl.append( [gene, ty] + analysis )
    
    # Sort rows by (sub)type, then p-value, then gene name
    tbl.sort(key=lambda arr: (arr[1], arr[-1], arr[0]))
    tbl = [header] + tbl
    return [ map(str, row) for row in tbl ]

[docs]def subtype_analysis(mutation_data, patient_whitelist, significant, threshold):
	# Parse mutation data and load sample2ty file
    m, n, genespace, patientspace, G2T, T2G = mutation_data
    sample2ty, tys = load_sample2ty_file(patient_whitelist)

    # Count the number of samples from each cancer (sub)type
    ty2numsamples = dict([(ty, 0) for ty in tys])
    for sample, ty in sample2ty.iteritems(): ty2numsamples[ty] += 1

    gene_specificity = [ subtype_specificity(g, sample2ty, ty2numsamples, G2T)
                         for g in genespace ]
    gene2specificity = dict(zip(genespace, gene_specificity))

    # Prune list if required
    if significant:
        gene2specificity = keep_significant(gene2specificity, threshold)

    return gene2specificity


[docs]def run(args):
	# Load mutation data
    from multi_dendrix import white_and_blacklisting, load_db_with_cutoff
    gene2include, sample2include, sample2ty = white_and_blacklisting(args)
    mutation_data = load_db_with_cutoff(args.mutation_matrix, sample2include,
                                       gene2include, args.cutoff)

    # Conduct subtype analysis
    gene2specificity = subtype_analysis(mutation_data, args.patient_whitelist,
    	                                not args.all, args.sig_threshold)

    # Create TSV table to output results
    subtype_tbl = create_subtype_tbl(gene2specificity)
    subtype_output = "\n".join([ "\t".join(row) for row in subtype_tbl ])

    # Output results to file
    if args.output_file:
        open(args.output_file, 'w').write( subtype_output )
    else:
        print subtype_output

if __name__ == "__main__": run(parse_args())
Navigation

Quick search

Source code for subtype_specific_genes

Navigation