Skip to content

Instantly share code, notes, and snippets.

@gpratt
Created March 29, 2018 19:51
Show Gist options
  • Save gpratt/b2edf823062d2cf1fddd61b802ea5520 to your computer and use it in GitHub Desktop.
Save gpratt/b2edf823062d2cf1fddd61b802ea5520 to your computer and use it in GitHub Desktop.
Entropy_Code
annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type",
"enrichment", "log10_p_val", "log2_fold_change"]
def get_full_from_annotated(fn):
stripped_fn = ".".join(fn.split(".")[:-3])
return stripped_fn + ".full.compressed2.bed.full"
def calculate_entropy(row, total_ip_reads, total_input_reads):
p_ip = float(row.ip_reads) / total_ip_reads
p_input = float(row.input_reads) / total_input_reads
return p_ip * np.log2(p_ip / p_input)
def get_entropy_from_annotated(fn):
fn = os.path.basename(fn)
stripped_fn = ".".join(fn.split(".")[:-3])
stripped_fn = os.path.join(out_dir, stripped_fn)
return stripped_fn + ".full.compressed2.bed.full.entropy.bed"
def sum_entropy(filtered_peaks, original_peaks):
out_file = "/home/gpratt/Dropbox/EricGabe_ENCODE/enriched_peaks/" + os.path.basename(filtered_peaks) + ".entropy.bed"
entropy = pd.read_table(get_entropy_from_annotated(original_peaks))
filtered_peaks = pd.read_table(filtered_peaks, names=annotated_bedtool_header)
merged_peaks = pd.merge(filtered_peaks, entropy,
left_on=['chrom', 'start', 'stop'],
right_on=['chrom', 'start', 'stop'])
merged_peaks.to_csv(out_file,
sep="\t", header=None, index=False)
return merged_peaks.entropy.sum()
def sum_entropy_row(row):
#Sadly the majority of the time in this operation is opening the files, can't make it faster :(
try:
entropy = sum_entropy(row.filtered_moderate, row['input_norm'])
except Exception as e:
print e, "something went wrong"
entropy = 0
return entropy
for name, row in tqdm(list(merged_data.iterrows())):
full_fn = get_full_from_annotated(row['input_norm'])
out_fn = os.path.join(out_dir, os.path.basename(full_fn) + ".entropy.bed")
if os.path.exists(out_fn):
continue
ip_reads = row['CLIP_counts']
input_reads = row['INPUT_counts']
read_counts = pd.read_table(full_fn, names=full_header)
if len(read_counts) != 0:
tool = functools.partial(calculate_entropy, total_ip_reads=ip_reads, total_input_reads=input_reads)
read_counts['entropy'] = read_counts.apply(tool, axis=1)
read_counts.to_csv(out_fn, sep="\t", index=False, header=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment