gpratt/entropy_calculation.py

## entropy_calculation.py
annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type",
               "enrichment", "log10_p_val", "log2_fold_change"]
def get_full_from_annotated(fn):
    stripped_fn = ".".join(fn.split(".")[:-3])
    return stripped_fn + ".full.compressed2.bed.full"

def calculate_entropy(row, total_ip_reads, total_input_reads):
    p_ip = float(row.ip_reads) / total_ip_reads
    p_input = float(row.input_reads) / total_input_reads

    return p_ip * np.log2(p_ip / p_input)

def get_entropy_from_annotated(fn):
    fn = os.path.basename(fn)
    stripped_fn = ".".join(fn.split(".")[:-3])
    stripped_fn = os.path.join(out_dir, stripped_fn)
    return stripped_fn + ".full.compressed2.bed.full.entropy.bed"

def sum_entropy(filtered_peaks, original_peaks):
    out_file = "/home/gpratt/Dropbox/EricGabe_ENCODE/enriched_peaks/" + os.path.basename(filtered_peaks) + ".entropy.bed"
    entropy = pd.read_table(get_entropy_from_annotated(original_peaks))
    filtered_peaks = pd.read_table(filtered_peaks, names=annotated_bedtool_header)

    merged_peaks = pd.merge(filtered_peaks, entropy,
             left_on=['chrom', 'start', 'stop'],
             right_on=['chrom', 'start', 'stop'])
    merged_peaks.to_csv(out_file,
                        sep="\t", header=None, index=False)
    return merged_peaks.entropy.sum()

def sum_entropy_row(row):
    #Sadly the majority of the time in this operation is opening the files, can't make it faster :(
    try:
        entropy = sum_entropy(row.filtered_moderate, row['input_norm'])
    except Exception as e:
        print e, "something went wrong"
        entropy = 0
    return entropy


for name, row in tqdm(list(merged_data.iterrows())):
  full_fn = get_full_from_annotated(row['input_norm'])
  out_fn = os.path.join(out_dir, os.path.basename(full_fn) + ".entropy.bed")
  if os.path.exists(out_fn):
      continue

  ip_reads = row['CLIP_counts']
  input_reads = row['INPUT_counts']

  read_counts = pd.read_table(full_fn, names=full_header)
  if len(read_counts) != 0:
      tool = functools.partial(calculate_entropy, total_ip_reads=ip_reads, total_input_reads=input_reads)
      read_counts['entropy'] = read_counts.apply(tool, axis=1)
      read_counts.to_csv(out_fn, sep="\t", index=False, header=True)
	annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
	full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type",
	"enrichment", "log10_p_val", "log2_fold_change"]
	def get_full_from_annotated(fn):
	stripped_fn = ".".join(fn.split(".")[:-3])
	return stripped_fn + ".full.compressed2.bed.full"

	def calculate_entropy(row, total_ip_reads, total_input_reads):
	p_ip = float(row.ip_reads) / total_ip_reads
	p_input = float(row.input_reads) / total_input_reads

	return p_ip * np.log2(p_ip / p_input)

	def get_entropy_from_annotated(fn):
	fn = os.path.basename(fn)
	stripped_fn = ".".join(fn.split(".")[:-3])
	stripped_fn = os.path.join(out_dir, stripped_fn)
	return stripped_fn + ".full.compressed2.bed.full.entropy.bed"

	def sum_entropy(filtered_peaks, original_peaks):
	out_file = "/home/gpratt/Dropbox/EricGabe_ENCODE/enriched_peaks/" + os.path.basename(filtered_peaks) + ".entropy.bed"
	entropy = pd.read_table(get_entropy_from_annotated(original_peaks))
	filtered_peaks = pd.read_table(filtered_peaks, names=annotated_bedtool_header)

	merged_peaks = pd.merge(filtered_peaks, entropy,
	left_on=['chrom', 'start', 'stop'],
	right_on=['chrom', 'start', 'stop'])
	merged_peaks.to_csv(out_file,
	sep="\t", header=None, index=False)
	return merged_peaks.entropy.sum()

	def sum_entropy_row(row):
	#Sadly the majority of the time in this operation is opening the files, can't make it faster :(
	try:
	entropy = sum_entropy(row.filtered_moderate, row['input_norm'])
	except Exception as e:
	print e, "something went wrong"
	entropy = 0
	return entropy


	for name, row in tqdm(list(merged_data.iterrows())):
	full_fn = get_full_from_annotated(row['input_norm'])
	out_fn = os.path.join(out_dir, os.path.basename(full_fn) + ".entropy.bed")
	if os.path.exists(out_fn):
	continue

	ip_reads = row['CLIP_counts']
	input_reads = row['INPUT_counts']

	read_counts = pd.read_table(full_fn, names=full_header)
	if len(read_counts) != 0:
	tool = functools.partial(calculate_entropy, total_ip_reads=ip_reads, total_input_reads=input_reads)
	read_counts['entropy'] = read_counts.apply(tool, axis=1)
	read_counts.to_csv(out_fn, sep="\t", index=False, header=True)