chiwanpark/preprocess.py

## preprocess.py
# -*- coding: utf-8 -*-
from builtins import float, len, sum, range, set

import click
from codecs import open
from collections import defaultdict
import math
import os


@click.group()
@click.option('--biogrid', default='')
@click.option('--rnaseq', default='')
@click.option('--output', default='')
@click.pass_context
def cli(context, biogrid, rnaseq, output):
    if biogrid:
        context.obj['BioGRID'] = load_biogrid(biogrid)
    if rnaseq:
        context.obj['RNAseq'] = load_rnaseq(rnaseq)
    if output:
        context.obj['output_path'] = output


def pearson(x, y):
    if len(x) != len(y):
        click.echo('[PEARSON] length of two sequences are different {seq1} {seq2}'.format(seq1=x, seq2=y))
        assert len(x) == len(y)

    n = len(x)

    sum_x = float(sum(x))
    sum_y = float(sum(y))
    sum_x_square = sum((e ** 2 for e in x))
    sum_y_square = sum((e ** 2 for e in y))
    sum_p = sum((x[i] * y[i] for i in range(n)))
    num = sum_p - (sum_x * sum_y / n)
    den = ((sum_x_square - sum_x ** 2 / n) * (sum_y_square - sum_y ** 2 / n)) ** 0.5

    if den == 0:
        return 0
    return num / den


def load_biogrid(data_path):
    click.echo('[BIOGRID] load start: {data_path}'.format(data_path=data_path))

    edges = set()
    line_count = 0
    with open(data_path, 'r') as f:
        for line in f:
            line_count += 1
            if line_count % 50000 == 0:
                click.echo('[BIOGRID] {line_count} lines loaded'.format(line_count=line_count))
            if line.startswith('#'):
                continue
            split = line.split('\t')
            edges.add((split[7].upper(), split[8].upper()))

    nodes = set((element for edge in edges for element in edge))

    click.echo('[BIOGRID] load complete! ({nodes} nodes, {edges} edges)'.format(nodes=len(nodes), edges=len(edges)))

    return nodes, edges


def load_rnaseq(data_path):
    click.echo('[RNAseq] load start: {data_path}'.format(data_path=data_path))
    result = defaultdict(lambda: [])

    for filename in os.listdir(data_path):
        if filename == 'metadata' or filename.startswith('.') or not filename.endswith('rsem.genes.normalized_results'):
            continue

        click.echo('[RNAseq] load file: {filename}'.format(filename=filename))

        file_path = os.path.join(data_path, filename)
        with open(file_path, 'r') as f:
            subdict = defaultdict(lambda: [])
            for line in f:
                if line.startswith('gene_id') or line.startswith('?'):
                    continue
                split = line.split('\t')
                gene_id = split[0].split('|')[0].upper()
                count = float(split[1])

                subdict[gene_id].append(math.log(count + 1, 2))

            for gene in subdict:
                result[gene].append(sum(subdict[gene]) / float(len(subdict[gene])))

    click.echo('[RNAseq] load complete! ({genes} genes)'.format(genes=len(result.keys())))

    return result


@cli.command()
@click.pass_context
def generate(context):
    biogrid_edge = context.obj['BioGRID'][1]
    rnaseq_table = context.obj['RNAseq']

    edge_count = 0
    with open(context.obj['output_path'], 'w') as f:
        for edge in biogrid_edge:
            edge_count += 1
            if edge_count % 50000 == 0:
                click.echo('[GENERATE] edge {edge} completed'.format(edge=edge_count))

            if edge[0] == edge[1]:
                continue

            result_a, result_b = rnaseq_table[edge[0]], rnaseq_table[edge[1]]
            if len(result_a) == 0 or len(result_b) == 0:
                continue

            coefficient = pearson(result_a, result_b)
            f.write('{node_a}\t{node_b}\t{coef}\n'.format(node_a=edge[0], node_b=edge[1], coef=coefficient))


if __name__ == '__main__':
    cli(obj={})

## requirements.txt
click==3.3
	# -- coding: utf-8 --
	from builtins import float, len, sum, range, set

	import click
	from codecs import open
	from collections import defaultdict
	import math
	import os


	@click.group()
	@click.option('--biogrid', default='')
	@click.option('--rnaseq', default='')
	@click.option('--output', default='')
	@click.pass_context
	def cli(context, biogrid, rnaseq, output):
	if biogrid:
	context.obj['BioGRID'] = load_biogrid(biogrid)
	if rnaseq:
	context.obj['RNAseq'] = load_rnaseq(rnaseq)
	if output:
	context.obj['output_path'] = output


	def pearson(x, y):
	if len(x) != len(y):
	click.echo('[PEARSON] length of two sequences are different {seq1} {seq2}'.format(seq1=x, seq2=y))
	assert len(x) == len(y)

	n = len(x)

	sum_x = float(sum(x))
	sum_y = float(sum(y))
	sum_x_square = sum((e ** 2 for e in x))
	sum_y_square = sum((e ** 2 for e in y))
	sum_p = sum((x[i] * y[i] for i in range(n)))
	num = sum_p - (sum_x * sum_y / n)
	den = ((sum_x_square - sum_x ** 2 / n) * (sum_y_square - sum_y 2 / n)) 0.5

	if den == 0:
	return 0
	return num / den


	def load_biogrid(data_path):
	click.echo('[BIOGRID] load start: {data_path}'.format(data_path=data_path))

	edges = set()
	line_count = 0
	with open(data_path, 'r') as f:
	for line in f:
	line_count += 1
	if line_count % 50000 == 0:
	click.echo('[BIOGRID] {line_count} lines loaded'.format(line_count=line_count))
	if line.startswith('#'):
	continue
	split = line.split('\t')
	edges.add((split[7].upper(), split[8].upper()))

	nodes = set((element for edge in edges for element in edge))

	click.echo('[BIOGRID] load complete! ({nodes} nodes, {edges} edges)'.format(nodes=len(nodes), edges=len(edges)))

	return nodes, edges


	def load_rnaseq(data_path):
	click.echo('[RNAseq] load start: {data_path}'.format(data_path=data_path))
	result = defaultdict(lambda: [])

	for filename in os.listdir(data_path):
	if filename == 'metadata' or filename.startswith('.') or not filename.endswith('rsem.genes.normalized_results'):
	continue

	click.echo('[RNAseq] load file: {filename}'.format(filename=filename))

	file_path = os.path.join(data_path, filename)
	with open(file_path, 'r') as f:
	subdict = defaultdict(lambda: [])
	for line in f:
	if line.startswith('gene_id') or line.startswith('?'):
	continue
	split = line.split('\t')
	gene_id = split[0].split('\|')[0].upper()
	count = float(split[1])

	subdict[gene_id].append(math.log(count + 1, 2))

	for gene in subdict:
	result[gene].append(sum(subdict[gene]) / float(len(subdict[gene])))

	click.echo('[RNAseq] load complete! ({genes} genes)'.format(genes=len(result.keys())))

	return result


	@cli.command()
	@click.pass_context
	def generate(context):
	biogrid_edge = context.obj['BioGRID'][1]
	rnaseq_table = context.obj['RNAseq']

	edge_count = 0
	with open(context.obj['output_path'], 'w') as f:
	for edge in biogrid_edge:
	edge_count += 1
	if edge_count % 50000 == 0:
	click.echo('[GENERATE] edge {edge} completed'.format(edge=edge_count))

	if edge[0] == edge[1]:
	continue

	result_a, result_b = rnaseq_table[edge[0]], rnaseq_table[edge[1]]
	if len(result_a) == 0 or len(result_b) == 0:
	continue

	coefficient = pearson(result_a, result_b)
	f.write('{node_a}\t{node_b}\t{coef}\n'.format(node_a=edge[0], node_b=edge[1], coef=coefficient))


	if __name__ == '__main__':
	cli(obj={})