Skip to content

Instantly share code, notes, and snippets.

@chiwanpark
Created October 10, 2014 16:15
Show Gist options
  • Save chiwanpark/076046bff5f2a2e8d8f8 to your computer and use it in GitHub Desktop.
Save chiwanpark/076046bff5f2a2e8d8f8 to your computer and use it in GitHub Desktop.
BioGRID/TCGA Preprocessing
# -*- coding: utf-8 -*-
from builtins import float, len, sum, range, set
import click
from codecs import open
from collections import defaultdict
import math
import os
@click.group()
@click.option('--biogrid', default='')
@click.option('--rnaseq', default='')
@click.option('--output', default='')
@click.pass_context
def cli(context, biogrid, rnaseq, output):
if biogrid:
context.obj['BioGRID'] = load_biogrid(biogrid)
if rnaseq:
context.obj['RNAseq'] = load_rnaseq(rnaseq)
if output:
context.obj['output_path'] = output
def pearson(x, y):
if len(x) != len(y):
click.echo('[PEARSON] length of two sequences are different {seq1} {seq2}'.format(seq1=x, seq2=y))
assert len(x) == len(y)
n = len(x)
sum_x = float(sum(x))
sum_y = float(sum(y))
sum_x_square = sum((e ** 2 for e in x))
sum_y_square = sum((e ** 2 for e in y))
sum_p = sum((x[i] * y[i] for i in range(n)))
num = sum_p - (sum_x * sum_y / n)
den = ((sum_x_square - sum_x ** 2 / n) * (sum_y_square - sum_y ** 2 / n)) ** 0.5
if den == 0:
return 0
return num / den
def load_biogrid(data_path):
click.echo('[BIOGRID] load start: {data_path}'.format(data_path=data_path))
edges = set()
line_count = 0
with open(data_path, 'r') as f:
for line in f:
line_count += 1
if line_count % 50000 == 0:
click.echo('[BIOGRID] {line_count} lines loaded'.format(line_count=line_count))
if line.startswith('#'):
continue
split = line.split('\t')
edges.add((split[7].upper(), split[8].upper()))
nodes = set((element for edge in edges for element in edge))
click.echo('[BIOGRID] load complete! ({nodes} nodes, {edges} edges)'.format(nodes=len(nodes), edges=len(edges)))
return nodes, edges
def load_rnaseq(data_path):
click.echo('[RNAseq] load start: {data_path}'.format(data_path=data_path))
result = defaultdict(lambda: [])
for filename in os.listdir(data_path):
if filename == 'metadata' or filename.startswith('.') or not filename.endswith('rsem.genes.normalized_results'):
continue
click.echo('[RNAseq] load file: {filename}'.format(filename=filename))
file_path = os.path.join(data_path, filename)
with open(file_path, 'r') as f:
subdict = defaultdict(lambda: [])
for line in f:
if line.startswith('gene_id') or line.startswith('?'):
continue
split = line.split('\t')
gene_id = split[0].split('|')[0].upper()
count = float(split[1])
subdict[gene_id].append(math.log(count + 1, 2))
for gene in subdict:
result[gene].append(sum(subdict[gene]) / float(len(subdict[gene])))
click.echo('[RNAseq] load complete! ({genes} genes)'.format(genes=len(result.keys())))
return result
@cli.command()
@click.pass_context
def generate(context):
biogrid_edge = context.obj['BioGRID'][1]
rnaseq_table = context.obj['RNAseq']
edge_count = 0
with open(context.obj['output_path'], 'w') as f:
for edge in biogrid_edge:
edge_count += 1
if edge_count % 50000 == 0:
click.echo('[GENERATE] edge {edge} completed'.format(edge=edge_count))
if edge[0] == edge[1]:
continue
result_a, result_b = rnaseq_table[edge[0]], rnaseq_table[edge[1]]
if len(result_a) == 0 or len(result_b) == 0:
continue
coefficient = pearson(result_a, result_b)
f.write('{node_a}\t{node_b}\t{coef}\n'.format(node_a=edge[0], node_b=edge[1], coef=coefficient))
if __name__ == '__main__':
cli(obj={})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment