Gabriel Pratt gpratt

## entropy_calculation.py
# Ordered column names for an annotated BED-style table.
# NOTE(review): presumably passed as `names=` when reading such files -- no
# use is visible in this excerpt; confirm against the callers.
annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
# Ordered column names for a "full" table that carries IP/input read counts
# and the associated test statistics alongside the genomic interval.
full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type",
               "enrichment", "log10_p_val", "log2_fold_change"]
def get_full_from_annotated(fn):
    """Derive the ".full" companion file name from an annotated file name.

    The last three dot-separated components of ``fn`` are discarded and the
    fixed suffix ``.full.compressed2.bed.full`` is appended to what is left.
    A name with three or fewer components collapses to the bare suffix.
    """
    pieces = fn.split(".")
    prefix = ".".join(pieces[:-3])
    return "{0}{1}".format(prefix, ".full.compressed2.bed.full")

def calculate_entropy(row, total_ip_reads, total_input_reads):
    """Begin an entropy calculation for one row of read counts.

    Only the two statements below are visible in this excerpt: they turn the
    row's IP and input read counts into fractions of the respective totals.
    No ``return`` is visible, so as shown the function returns ``None``.
    NOTE(review): the rest of the body appears to be cut off -- confirm
    against the full source before relying on this function.

    Args:
        row: Object exposing ``ip_reads`` and ``input_reads`` attributes.
        total_ip_reads: Divisor for ``row.ip_reads``.
        total_input_reads: Divisor for ``row.input_reads``.
    """
    # float() forces true division even when both operands are ints.
    p_ip = float(row.ip_reads) / total_ip_reads
    p_input = float(row.input_reads) / total_input_reads

## example_subplot_plot.py
# Size a fixed-width subplot grid: one cell per group of the first index level.
total_datasets = len(merged_data.groupby(level=0))
num_cols = 4
# Floor division keeps num_rows an integer on Python 3 as well as Python 2
# (a float row count is not a valid subplot grid dimension), and dividing by
# num_cols keeps the grid consistent if the column count is ever changed.
# The extra row holds the remainder when the groups do not fill whole rows.
num_rows = (total_datasets // num_cols) + 1
# Running 1-based subplot position, incremented by the plotting loop.
count = 0

# Create one subplot per 'uID' group inside a figure that is written to
# <img_dir>/increase_in_enriched_regions.svg, sized at 4x4 per grid cell.
# NOTE(review): dataviz.Figure is project-local; presumably it saves the
# figure when the `with` block exits -- confirm. The loop body appears to be
# truncated in this excerpt (nothing is drawn on `ax`).
with dataviz.Figure(os.path.join(img_dir, "increase_in_enriched_regions.svg"), figsize=(4* num_cols, 4*num_rows)) as fig:
    for uID, df in merged_data.groupby(level='uID'):
        # Bump the counter first so the first subplot gets position 1
        # (assuming `fig` follows matplotlib's 1-based add_subplot numbering).
        count += 1
        ax = fig.add_subplot(num_rows, num_cols, count)

## array_job.py
class ArrayJob():
    def __init__(self):
        # Shell line kept for later use: it evals the element of the bash
        # array `cmd` selected by $PBS_ARRAYID.
        # NOTE(review): presumably appended after the prologue when the job
        # script is assembled -- the code that uses it is not visible here.
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err

## gist:e79b99430fbd75e8b7771abdfaea71b6
class ArrayJob():
    def __init__(self):
        # Shell line kept for later use: it evals the element of the bash
        # array `cmd` selected by $PBS_ARRAYID.
        # NOTE(review): presumably appended after the prologue when the job
        # script is assembled -- the code that uses it is not visible here.
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err

## array_job.py
class ArrayJob():
    def __init__(self):
        # Shell line kept for later use: it evals the element of the bash
        # array `cmd` selected by $PBS_ARRAYID.
        # NOTE(review): presumably appended after the prologue when the job
        # script is assembled -- the code that uses it is not visible here.
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err

## plot_pdf.py
def pdf(data, bins=50):
    data = np.array(data, dtype=float)
    minimum = np.min(data) - .000001
    maximum = np.max(data) + .000001
    pos = np.linspace(minimum, maximum, bins + 1)
    xs = np.linspace(minimum, maximum, bins + 1)[:-1]
    ys = np.linspace(minimum, maximum, bins + 1)[1:]
    pdf = np.ndarray(shape=(bins + 1, 1))
    pdf[0] = 0
    for i, (x, y) in enumerate(zip(xs, ys)):

## known_rbps.py
import pandas as pd

# Lookup tables read from delimited text files, indexed by the given column.
# NOTE(review): the absolute /nas3 paths only resolve on the original host.
mouse_gene_id_names = pd.read_table("/nas3/gpratt/Dropbox/TAF15/Data/mouse_integration/mouse_gene_id_to_names.txt", index_col=0)
human_mouse_genes = pd.read_table("/nas3/gpratt/projects/taf15/mouse_human_genes.txt", index_col=2)

# Sheets "RBP table" and "human TFs" from the two spreadsheets; the paths are
# relative, so they are resolved against the current working directory.
known_rbps = pd.read_excel("nrg3813-s3.xls", "RBP table", index_col=2)
known_tfs = pd.read_excel("nrg3813-s4.xls", "human TFs", index_col=1)

# Mirror each table's index into a regular 'gene_id' column.
# NOTE(review): assumes the chosen index columns hold gene identifiers --
# confirm against the spreadsheets.
known_tfs['gene_id'] = known_tfs.index
known_rbps['gene_id'] = known_rbps.index

## flip_hex_value.py
def flip_hex_value(hex_value):
    """Return the bitwise complement of a hex string, preserving its width.

    Every bit is inverted across the full ``4 * number_of_digits`` width, so
    ``"#262626"`` becomes ``"#d9d9d9"`` and ``"000000"`` becomes ``"ffffff"``.

    Args:
        hex_value: Hexadecimal digits, optionally prefixed with ``#``.

    Returns:
        The complemented value as lowercase hex with the same number of
        digits as the input, prefixed with ``#`` if the input was.

    Raises:
        ValueError: If no valid hexadecimal digits remain after the prefix.
    """
    prefix = "#" if hex_value.startswith("#") else ""
    digits = hex_value.lstrip("#")
    # XOR against an all-ones mask as wide as the input. Going through
    # bin() would drop leading zero bits, so those bits were never flipped.
    mask = (1 << (4 * len(digits))) - 1
    flipped = int(digits, 16) ^ mask
    return prefix + "{0:0{width}x}".format(flipped, width=len(digits))
flip_hex_value("#262626")

## count_to_rpkm_function.py
def counts_to_rpkm(featureCountsTable):
    """Convert a table of raw read counts to RPKM.

    Each count is scaled as ``count * 10**9 / (column_total * Length)``.

    Args:
        featureCountsTable: DataFrame with a ``Length`` column; every column
            from position 5 onward is treated as a per-sample count column.
            NOTE(review): this matches featureCounts output loaded with the
            gene ID as the index -- confirm against the caller.

    Returns:
        DataFrame with one column per count column, holding RPKM values.
    """
    # .iloc is the positional replacement for .ix, which was removed in
    # pandas 1.0.
    counts = featureCountsTable.iloc[:, 5:]
    lengths = featureCountsTable['Length']
    # Per-sample library size: the sum of each count column.
    mapped_reads = counts.sum()
    return (counts * pow(10, 9)).div(mapped_reads, axis=1).div(lengths, axis=0)

## go_enrichment_plotter.py
def plot_go_enrichment(df, filter_value=None, **kwargs):
    """Build a -log10(p-value) matrix of GO terms from an enrichment table.

    The visible part of the body re-labels the last index level of ``df``
    with the GO term description (reassigning ``df.index``, so the caller's
    frame is modified), converts the Bonferroni-corrected p-values to
    ``-log10``, and moves every index level except the last into the columns.
    NOTE(review): ``filter_value`` and ``kwargs`` are unused and nothing is
    plotted or returned in this excerpt -- the rest of the function appears
    to be cut off; confirm against the full source.

    Args:
        df: DataFrame with a multi-level index and the columns
            'GO Term Description' and
            'Bonferroni-corrected Hypergeometric p-Value'.
        filter_value: Not used in the visible code.
        **kwargs: Not used in the visible code.
    """

    # Swap the last element of each index tuple for the row's description.
    # NOTE(review): izip is presumably itertools.izip (Python 2 only).
    new_index = []
    for index, description in izip(df.index, df['GO Term Description']):
        new_index.append(list(index[:-1]) + [description])
    df.index = pd.MultiIndex.from_tuples(new_index)

    go_matrix = df['Bonferroni-corrected Hypergeometric p-Value'].apply(lambda x: -1 * np.log10(x))
    # Unstack levels 0..n-2 so only the description level remains as rows.
    go_matrix = go_matrix.unstack(range(len(go_matrix.index.levels) - 1))
    # Missing term/column combinations become 0.
    go_matrix = go_matrix.fillna(0)
	annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
	full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type",
	"enrichment", "log10_p_val", "log2_fold_change"]
	def get_full_from_annotated(fn):
	    """Derive the ".full" companion file name from an annotated file name.

	    The last three dot-separated components of ``fn`` are discarded and
	    the fixed suffix ``.full.compressed2.bed.full`` is appended.
	    """
	    # The body is nested under the def again; in the flattened paste it
	    # sat at the same level as the header, which is an IndentationError.
	    stripped_fn = ".".join(fn.split(".")[:-3])
	    return stripped_fn + ".full.compressed2.bed.full"

	def calculate_entropy(row, total_ip_reads, total_input_reads):
	p_ip = float(row.ip_reads) / total_ip_reads
	p_input = float(row.input_reads) / total_input_reads
	total_datasets = len(merged_data.groupby(level=0))
	num_cols = 4
	num_rows = (total_datasets / 4) + 1
	count = 0

	with dataviz.Figure(os.path.join(img_dir, "increase_in_enriched_regions.svg"), figsize=(4* num_cols, 4*num_rows)) as fig:
	for uID, df in merged_data.groupby(level='uID'):
	count += 1
	ax = fig.add_subplot(num_rows, num_cols, count)
	class ArrayJob():
	def __init__(self):
	self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

	def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
	return """#!/bin/bash
	#PBS -N {0}
	#PBS -l nodes=1:ppn={3}
	#PBS -o {0}.out
	#PBS -e {0}.err
	def pdf(data, bins=50):
	data = np.array(data, dtype=float)
	minimum = np.min(data) - .000001
	maximum = np.max(data) + .000001
	pos = np.linspace(minimum, maximum, bins + 1)
	xs = np.linspace(minimum, maximum, bins + 1)[:-1]
	ys = np.linspace(minimum, maximum, bins + 1)[1:]
	pdf = np.ndarray(shape=(bins + 1, 1))
	pdf[0] = 0
	for i, (x, y) in enumerate(zip(xs, ys)):
	import pandas as pd

	mouse_gene_id_names = pd.read_table("/nas3/gpratt/Dropbox/TAF15/Data/mouse_integration/mouse_gene_id_to_names.txt", index_col=0)
	human_mouse_genes = pd.read_table("/nas3/gpratt/projects/taf15/mouse_human_genes.txt", index_col=2)

	known_rbps = pd.read_excel("nrg3813-s3.xls", "RBP table", index_col=2)
	known_tfs = pd.read_excel("nrg3813-s4.xls", "human TFs", index_col=1)

	known_tfs['gene_id'] = known_tfs.index
	known_rbps['gene_id'] = known_rbps.index
	def flip_hex_value(hex_value):
	    """Return the bitwise complement of a hex string, preserving its width.

	    Every bit is inverted across the full ``4 * number_of_digits`` width,
	    so ``"#262626"`` becomes ``"#d9d9d9"``.

	    Args:
	        hex_value: Hexadecimal digits, optionally prefixed with ``#``.

	    Returns:
	        The complemented value as lowercase hex with the same number of
	        digits as the input, prefixed with ``#`` if the input was.

	    Raises:
	        ValueError: If no valid hexadecimal digits remain after the prefix.
	    """
	    prefix = "#" if hex_value.startswith("#") else ""
	    digits = hex_value.lstrip("#")
	    # XOR against an all-ones mask as wide as the input. Going through
	    # bin() would drop leading zero bits, so those bits were never flipped.
	    mask = (1 << (4 * len(digits))) - 1
	    flipped = int(digits, 16) ^ mask
	    return prefix + "{0:0{width}x}".format(flipped, width=len(digits))
	flip_hex_value("#262626")
	def counts_to_rpkm(featureCountsTable):
	    """Convert a table of raw read counts to RPKM.

	    Each count is scaled as ``count * 10**9 / (column_total * Length)``.

	    Args:
	        featureCountsTable: DataFrame with a ``Length`` column; every
	            column from position 5 onward is treated as a count column.

	    Returns:
	        DataFrame with one column per count column, holding RPKM values.
	    """
	    # .iloc is the positional replacement for .ix, which was removed in
	    # pandas 1.0. The body is also nested under the def again.
	    counts = featureCountsTable.iloc[:, 5:]
	    lengths = featureCountsTable['Length']
	    # Per-sample library size: the sum of each count column.
	    mapped_reads = counts.sum()
	    return (counts * pow(10, 9)).div(mapped_reads, axis=1).div(lengths, axis=0)
	def plot_go_enrichment(df, filter_value=None, **kwargs):

	new_index = []
	for index, description in izip(df.index, df['GO Term Description']):
	new_index.append(list(index[:-1]) + [description])
	df.index = pd.MultiIndex.from_tuples(new_index)

	go_matrix = df['Bonferroni-corrected Hypergeometric p-Value'].apply(lambda x: -1 * np.log10(x))
	go_matrix = go_matrix.unstack(range(len(go_matrix.index.levels) - 1))
	go_matrix = go_matrix.fillna(0)