evanbiederstedt/check_chrom_shuffler.py Secret

## check_chrom_shuffler.py

###
### Proof of principle
### given a pandas DataFrame in which the "start" and "end" columns of each row describe a genomic interval:
### wherever intervals on the same chromosome overlap, take only the rows of those overlapping intervals and shuffle their order among themselves
###

import numpy as np
import pandas as pd
import random

# Toy input: the same nine (start, end) intervals listed once for chromosome 2
# and once for chromosome 14.
df = pd.DataFrame(
    {
        "start": [
            20, 125, 156, 211, 227, 220, 230, 472, 4765,
            20, 125, 156, 211, 227, 220, 230, 472, 4765,
        ],
        "end": [
            52, 162, 195, 250, 338, 251, 248, 515, 8988,
            52, 162, 195, 250, 338, 251, 248, 515, 8988,
        ],
        "chrom": [2] * 9 + [14] * 9,
    }
)
### select the three columns explicitly so that their order is fixed
df = df.loc[:, ["start", "end", "chrom"]]


def combined_unions(intervals):
    """Collapse overlapping (start, end) intervals into their unions.

    The intervals are ordered by start and swept from left to right.  An
    interval that begins at or before the end of the union currently being
    built is absorbed into it, so intervals that merely touch are merged too.

    Args:
        intervals: iterable of indexable ``(start, end)`` pairs.

    Returns:
        A list of unions ordered by start.  An interval that was never merged
        is returned as the object that was passed in; a merged union is a new
        ``(start, end)`` tuple.
    """
    ordered = list(intervals)
    ordered.sort(key=lambda pair: pair[0])

    unions = []
    for current in ordered:
        if unions and current[0] <= unions[-1][1]:
            # Sorting guarantees previous[0] <= current[0], so only the end of
            # the union can grow.
            previous = unions[-1]
            unions[-1] = (previous[0], max(previous[1], current[1]))
        else:
            # Nothing to extend yet, or a gap separates `current` from the
            # union built so far.
            unions.append(current)
    return unions


### from sklearn.utils import shuffle
### don't want extra dependency if can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in random order.

    Rows are selected by position rather than by index label, so the frame
    may carry duplicate index labels (a label-based ``reindex`` raises on
    those).  Every row keeps its original index label and the input frame is
    left untouched.

    Args:
        dframe: the pandas DataFrame whose rows should be shuffled.

    Returns:
        A new DataFrame holding the same rows in permuted order.
    """
    # Draws from NumPy's global random state, so np.random.seed() makes the
    # shuffle repeatable.
    order = np.random.permutation(len(dframe))
    return dframe.iloc[order]


def remix_overlaps(dframe):
    """Shuffle rows among themselves wherever their intervals overlap.

    Rows are processed one chromosome at a time.  Within a chromosome the
    (start, end) intervals are merged into disjoint unions, every row is
    assigned to the union that contains its start, and the rows of each
    union are put into random order.  A row that overlaps nothing is the only
    member of its union.

    Args:
        dframe: DataFrame with "start", "end" and "chrom" columns; any other
            columns are carried along unchanged.

    Returns:
        A new DataFrame with the same columns, ordered by chromosome and,
        within a chromosome, by union.  The index restarts at 0 for every
        chromosome, so index labels repeat across chromosomes.  When there is
        no chromosome group at all (e.g. a frame without rows) an empty frame
        with the same columns is returned.
    """
    chrom_list = []  ## one shuffled dataframe per chromosome
    for _, group in dframe.groupby("chrom"):
        merged = combined_unions(zip(group["start"], group["end"]))
        union_starts = [union[0] for union in merged]
        # The unions are disjoint and sorted by start, so the union holding a
        # given start is the last one that begins at or before it.
        interval_ids = (
            np.searchsorted(union_starts, group["start"].to_numpy(), side="right") - 1
        )
        # Group on the id array itself rather than on a temporary column fed
        # to groupby.apply: whether apply keeps the grouping column differs
        # between pandas versions, which made the later column drop fragile.
        shuffled = [shuffler(rows) for _, rows in group.groupby(interval_ids)]
        chrom_list.append(pd.concat(shuffled).reset_index(drop=True))

    if not chrom_list:
        # pd.concat() raises on an empty list.
        return dframe.iloc[:0].copy()
    return pd.concat(chrom_list)  ### chromosomes stay in sorted order; not shuffled

### smoke test: run the shuffle on the toy frame `df` and print the result
check_works = remix_overlaps(df)
print(check_works)


## gistfile1.txt

###
### Proof of principle
### given a pandas DataFrame in which the "start" and "end" columns of each row describe a genomic interval:
### wherever intervals on the same chromosome overlap, take only the rows of those overlapping intervals and shuffle their order among themselves
###

import numpy as np
import pandas as pd
import random

# Toy input: the same nine (start, end) intervals listed once for chromosome 2
# and once for chromosome 14.
df = pd.DataFrame(
    {
        "start": [
            20, 125, 156, 211, 227, 220, 230, 472, 4765,
            20, 125, 156, 211, 227, 220, 230, 472, 4765,
        ],
        "end": [
            52, 162, 195, 250, 338, 251, 248, 515, 8988,
            52, 162, 195, 250, 338, 251, 248, 515, 8988,
        ],
        "chrom": [2] * 9 + [14] * 9,
    }
)
### select the three columns explicitly so that their order is fixed
df = df.loc[:, ["start", "end", "chrom"]]


def combined_unions(intervals):
    """Collapse overlapping (start, end) intervals into their unions.

    The intervals are ordered by start and swept from left to right.  An
    interval that begins at or before the end of the union currently being
    built is absorbed into it, so intervals that merely touch are merged too.

    Args:
        intervals: iterable of indexable ``(start, end)`` pairs.

    Returns:
        A list of unions ordered by start.  An interval that was never merged
        is returned as the object that was passed in; a merged union is a new
        ``(start, end)`` tuple.
    """
    ordered = list(intervals)
    ordered.sort(key=lambda pair: pair[0])

    unions = []
    for current in ordered:
        if unions and current[0] <= unions[-1][1]:
            # Sorting guarantees previous[0] <= current[0], so only the end of
            # the union can grow.
            previous = unions[-1]
            unions[-1] = (previous[0], max(previous[1], current[1]))
        else:
            # Nothing to extend yet, or a gap separates `current` from the
            # union built so far.
            unions.append(current)
    return unions


### from sklearn.utils import shuffle
### don't want extra dependency if can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in random order.

    Rows are selected by position rather than by index label, so the frame
    may carry duplicate index labels (a label-based ``reindex`` raises on
    those).  Every row keeps its original index label and the input frame is
    left untouched.

    Args:
        dframe: the pandas DataFrame whose rows should be shuffled.

    Returns:
        A new DataFrame holding the same rows in permuted order.
    """
    # Draws from NumPy's global random state, so np.random.seed() makes the
    # shuffle repeatable.
    order = np.random.permutation(len(dframe))
    return dframe.iloc[order]


def remix_overlaps(dframe):
    """Shuffle rows among themselves wherever their intervals overlap.

    Rows are processed one chromosome at a time.  Within a chromosome the
    (start, end) intervals are merged into disjoint unions, every row is
    assigned to the union that contains its start, and the rows of each
    union are put into random order.  A row that overlaps nothing is the only
    member of its union.

    Args:
        dframe: DataFrame with "start", "end" and "chrom" columns; any other
            columns are carried along unchanged.

    Returns:
        A new DataFrame with the same columns, ordered by chromosome and,
        within a chromosome, by union.  The index restarts at 0 for every
        chromosome, so index labels repeat across chromosomes.  When there is
        no chromosome group at all (e.g. a frame without rows) an empty frame
        with the same columns is returned.
    """
    chrom_list = []  ## one shuffled dataframe per chromosome
    for _, group in dframe.groupby("chrom"):
        merged = combined_unions(zip(group["start"], group["end"]))
        union_starts = [union[0] for union in merged]
        # The unions are disjoint and sorted by start, so the union holding a
        # given start is the last one that begins at or before it.
        interval_ids = (
            np.searchsorted(union_starts, group["start"].to_numpy(), side="right") - 1
        )
        # Group on the id array itself rather than on a temporary column fed
        # to groupby.apply: whether apply keeps the grouping column differs
        # between pandas versions, which made the later column drop fragile.
        shuffled = [shuffler(rows) for _, rows in group.groupby(interval_ids)]
        chrom_list.append(pd.concat(shuffled).reset_index(drop=True))

    if not chrom_list:
        # pd.concat() raises on an empty list.
        return dframe.iloc[:0].copy()
    return pd.concat(chrom_list)  ### chromosomes stay in sorted order; not shuffled

### smoke test: run the shuffle on the toy frame `df` and print the result
check_works = remix_overlaps(df)
print(check_works)

	###
	### Proof of principle
	### given a pandas DataFrame in which the "start" and "end" columns of each row describe a genomic interval:
	### wherever intervals on the same chromosome overlap, take only the rows of those overlapping intervals and shuffle their order among themselves
	###

	import numpy as np
	import pandas as pd
	import random

	# Toy input: the same nine (start, end) intervals listed once for chromosome 2
	# and once for chromosome 14.
	df = pd.DataFrame(
		{
			"start": [
				20, 125, 156, 211, 227, 220, 230, 472, 4765,
				20, 125, 156, 211, 227, 220, 230, 472, 4765,
			],
			"end": [
				52, 162, 195, 250, 338, 251, 248, 515, 8988,
				52, 162, 195, 250, 338, 251, 248, 515, 8988,
			],
			"chrom": [2] * 9 + [14] * 9,
		}
	)
	### select the three columns explicitly so that their order is fixed
	df = df.loc[:, ["start", "end", "chrom"]]


	def combined_unions(intervals):
		"""Collapse overlapping (start, end) intervals into their unions.

		The intervals are ordered by start and swept from left to right.  An
		interval that begins at or before the end of the union currently being
		built is absorbed into it, so intervals that merely touch are merged too.

		Args:
			intervals: iterable of indexable ``(start, end)`` pairs.

		Returns:
			A list of unions ordered by start.  An interval that was never
			merged is returned as the object that was passed in; a merged
			union is a new ``(start, end)`` tuple.
		"""
		sorted_by_lowest_interval = sorted(intervals, key=lambda tup: tup[0])
		merged_intervals = []
		for higher in sorted_by_lowest_interval:
			if merged_intervals and higher[0] <= merged_intervals[-1][1]:
				# after sorting, lower[0] <= higher[0], so only the end can grow
				lower = merged_intervals[-1]
				merged_intervals[-1] = (lower[0], max(lower[1], higher[1]))
			else:
				# first interval, or a gap before `higher`: start a new union
				merged_intervals.append(higher)
		return merged_intervals



	### from sklearn.utils import shuffle
	### don't want extra dependency if can avoid it
	def shuffler(dframe):
		"""Return the rows of ``dframe`` in random order.

		Rows are selected by position rather than by index label, so the frame
		may carry duplicate index labels (a label-based ``reindex`` raises on
		those).  Every row keeps its original index label and the input frame
		is left untouched.

		Args:
			dframe: the pandas DataFrame whose rows should be shuffled.

		Returns:
			A new DataFrame holding the same rows in permuted order.
		"""
		# Draws from NumPy's global random state, so np.random.seed() makes the
		# shuffle repeatable.
		order = np.random.permutation(len(dframe))
		return dframe.iloc[order]


	def remix_overlaps(dframe):
		"""Shuffle rows among themselves wherever their intervals overlap.

		Rows are processed one chromosome at a time.  Within a chromosome the
		(start, end) intervals are merged into disjoint unions, every row is
		assigned to the union that contains its start, and the rows of each
		union are put into random order.  A row that overlaps nothing is the
		only member of its union.

		Args:
			dframe: DataFrame with "start", "end" and "chrom" columns; any
				other columns are carried along unchanged.

		Returns:
			A new DataFrame with the same columns, ordered by chromosome and,
			within a chromosome, by union.  The index restarts at 0 for every
			chromosome, so index labels repeat across chromosomes.  When there
			is no chromosome group at all (e.g. a frame without rows) an empty
			frame with the same columns is returned.
		"""
		chrom_list = []  ## one shuffled dataframe per chromosome
		for _, group in dframe.groupby("chrom"):
			merged = combined_unions(zip(group["start"], group["end"]))
			union_starts = [union[0] for union in merged]
			# The unions are disjoint and sorted by start, so the union holding
			# a given start is the last one that begins at or before it.
			interval_ids = (
				np.searchsorted(union_starts, group["start"].to_numpy(), side="right") - 1
			)
			# Group on the id array itself rather than on a temporary column
			# fed to groupby.apply: whether apply keeps the grouping column
			# differs between pandas versions, which made the later column
			# drop fragile.
			shuffled = [shuffler(rows) for _, rows in group.groupby(interval_ids)]
			chrom_list.append(pd.concat(shuffled).reset_index(drop=True))

		if not chrom_list:
			# pd.concat() raises on an empty list.
			return dframe.iloc[:0].copy()
		return pd.concat(chrom_list)  ### chromosomes stay in sorted order; not shuffled

	### smoke test: run the shuffle on the toy frame `df` and print the result
	check_works = remix_overlaps(df)
	print(check_works)