Created
October 10, 2014 16:15
-
-
Save chiwanpark/076046bff5f2a2e8d8f8 to your computer and use it in GitHub Desktop.
BioGRID/TCGA Preprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from builtins import float, len, sum, range, set | |
import click | |
from codecs import open | |
from collections import defaultdict | |
import math | |
import os | |
@click.group() | |
@click.option('--biogrid', default='') | |
@click.option('--rnaseq', default='') | |
@click.option('--output', default='') | |
@click.pass_context | |
def cli(context, biogrid, rnaseq, output): | |
if biogrid: | |
context.obj['BioGRID'] = load_biogrid(biogrid) | |
if rnaseq: | |
context.obj['RNAseq'] = load_rnaseq(rnaseq) | |
if output: | |
context.obj['output_path'] = output | |
def pearson(x, y): | |
if len(x) != len(y): | |
click.echo('[PEARSON] length of two sequences are different {seq1} {seq2}'.format(seq1=x, seq2=y)) | |
assert len(x) == len(y) | |
n = len(x) | |
sum_x = float(sum(x)) | |
sum_y = float(sum(y)) | |
sum_x_square = sum((e ** 2 for e in x)) | |
sum_y_square = sum((e ** 2 for e in y)) | |
sum_p = sum((x[i] * y[i] for i in range(n))) | |
num = sum_p - (sum_x * sum_y / n) | |
den = ((sum_x_square - sum_x ** 2 / n) * (sum_y_square - sum_y ** 2 / n)) ** 0.5 | |
if den == 0: | |
return 0 | |
return num / den | |
def load_biogrid(data_path): | |
click.echo('[BIOGRID] load start: {data_path}'.format(data_path=data_path)) | |
edges = set() | |
line_count = 0 | |
with open(data_path, 'r') as f: | |
for line in f: | |
line_count += 1 | |
if line_count % 50000 == 0: | |
click.echo('[BIOGRID] {line_count} lines loaded'.format(line_count=line_count)) | |
if line.startswith('#'): | |
continue | |
split = line.split('\t') | |
edges.add((split[7].upper(), split[8].upper())) | |
nodes = set((element for edge in edges for element in edge)) | |
click.echo('[BIOGRID] load complete! ({nodes} nodes, {edges} edges)'.format(nodes=len(nodes), edges=len(edges))) | |
return nodes, edges | |
def load_rnaseq(data_path): | |
click.echo('[RNAseq] load start: {data_path}'.format(data_path=data_path)) | |
result = defaultdict(lambda: []) | |
for filename in os.listdir(data_path): | |
if filename == 'metadata' or filename.startswith('.') or not filename.endswith('rsem.genes.normalized_results'): | |
continue | |
click.echo('[RNAseq] load file: {filename}'.format(filename=filename)) | |
file_path = os.path.join(data_path, filename) | |
with open(file_path, 'r') as f: | |
subdict = defaultdict(lambda: []) | |
for line in f: | |
if line.startswith('gene_id') or line.startswith('?'): | |
continue | |
split = line.split('\t') | |
gene_id = split[0].split('|')[0].upper() | |
count = float(split[1]) | |
subdict[gene_id].append(math.log(count + 1, 2)) | |
for gene in subdict: | |
result[gene].append(sum(subdict[gene]) / float(len(subdict[gene]))) | |
click.echo('[RNAseq] load complete! ({genes} genes)'.format(genes=len(result.keys()))) | |
return result | |
@cli.command() | |
@click.pass_context | |
def generate(context): | |
biogrid_edge = context.obj['BioGRID'][1] | |
rnaseq_table = context.obj['RNAseq'] | |
edge_count = 0 | |
with open(context.obj['output_path'], 'w') as f: | |
for edge in biogrid_edge: | |
edge_count += 1 | |
if edge_count % 50000 == 0: | |
click.echo('[GENERATE] edge {edge} completed'.format(edge=edge_count)) | |
if edge[0] == edge[1]: | |
continue | |
result_a, result_b = rnaseq_table[edge[0]], rnaseq_table[edge[1]] | |
if len(result_a) == 0 or len(result_b) == 0: | |
continue | |
coefficient = pearson(result_a, result_b) | |
f.write('{node_a}\t{node_b}\t{coef}\n'.format(node_a=edge[0], node_b=edge[1], coef=coefficient)) | |
if __name__ == '__main__': | |
cli(obj={}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
click==3.3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment