Skip to content

Instantly share code, notes, and snippets.

@evanbiederstedt
Created June 30, 2017 20:13
Show Gist options
  • Save evanbiederstedt/73e67c9f3aceca4d656c372eb05a9991 to your computer and use it in GitHub Desktop.
Save evanbiederstedt/73e67c9f3aceca4d656c372eb05a9991 to your computer and use it in GitHub Desktop.
shuffling overlapping intervals by pandas DataFrame rows
###
### Proof of principle
### given a pandas DataFrame whereby rows in column1 and column2 represent a genomic interval (start, end):
### if the genomic intervals overlap, take only the rows of these overlapping intervals and re-shuffle the indices of these rows
###
import numpy as np
import pandas as pd
import random
### Toy input: the same nine (start, end) intervals are listed once for
### chromosome 2 and once more for chromosome 14.
df = pd.DataFrame(
    {
        "start": [20, 125, 156, 211, 227, 220, 230, 472, 4765] * 2,
        "end": [52, 162, 195, 250, 338, 251, 248, 515, 8988] * 2,
        "chrom": [2] * 9 + [14] * 9,
    }
)
### Select the columns in a fixed order: start, end, chrom.
df = df[["start", "end", "chrom"]]
def combined_unions(intervals):
    """Merge overlapping (or touching) intervals into their unions.

    Args:
        intervals: Iterable of ``(start, end)`` pairs. Each item only needs
            to support indexing at positions 0 and 1. A one-shot iterator
            such as ``zip(starts, ends)`` is accepted because the input is
            consumed exactly once by ``sorted``.

    Returns:
        A list of ``(start, end)`` tuples ordered by ``start``. Two input
        intervals end up in the same output interval when the later one
        starts at or before the current merged end. Empty input yields an
        empty list.
    """
    # Sort by start only: afterwards an interval can overlap nothing but the
    # most recently emitted merged interval.
    ordered = sorted(intervals, key=lambda pair: pair[0])
    merged_intervals = []
    for interval in ordered:
        start, end = interval[0], interval[1]
        if merged_intervals and start <= merged_intervals[-1][1]:
            # Overlap (or shared endpoint): widen the last merged interval.
            prev_start, prev_end = merged_intervals[-1]
            merged_intervals[-1] = (prev_start, max(prev_end, end))
        else:
            # Always store a fresh tuple so the result never aliases the
            # caller's objects and every element has the same type.
            merged_intervals.append((start, end))
    return merged_intervals
### from sklearn.utils import shuffle
### don't want extra dependency if can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in a random order.

    The shuffle is positional (``iloc``), so it also works when the index
    contains duplicate labels; a label-based ``reindex`` raises in that case.
    Each row keeps its original index label.

    Args:
        dframe: The pandas DataFrame whose rows are to be permuted.

    Returns:
        A new DataFrame with the same rows and columns in permuted order.
        The permutation is drawn from NumPy's global random state
        (``np.random``).
    """
    return dframe.iloc[np.random.permutation(len(dframe))]
def remix_overlaps(dframe):
    """Shuffle rows within each cluster of overlapping intervals, per chromosome.

    For every value of the ``chrom`` column, the ``(start, end)`` intervals
    are merged with ``combined_unions``. Each row is assigned to the merged
    interval containing its ``start``, and the rows of each merged interval
    are shuffled among themselves with ``shuffler``.

    Args:
        dframe: DataFrame with at least the columns ``chrom``, ``start`` and
            ``end``. Any other columns travel along with their rows.

    Returns:
        A new DataFrame with the same columns as ``dframe``. Rows are ordered
        by chromosome, then by merged interval (ascending start), and are in
        random order inside each merged interval. The index is a fresh
        0..n-1 range over the whole result, so labels are unique across
        chromosomes. If there are no chromosome groups, an empty frame with
        the same columns is returned.
    """
    pieces = []  # shuffled clusters, in chromosome order then interval order
    for _, group in dframe.groupby("chrom"):
        merged = combined_unions(zip(group["start"], group["end"]))
        merged_starts = [interval[0] for interval in merged]
        # Merged intervals are disjoint and sorted by start, so a row belongs
        # to the last merged interval whose start is <= the row's start.
        cluster_ids = np.searchsorted(
            merged_starts, group["start"].to_numpy(), side="right"
        ) - 1
        # Iterate the clusters explicitly rather than via groupby.apply, so
        # no temporary grouping column has to be added and dropped again.
        for _, cluster in group.groupby(cluster_ids):
            pieces.append(shuffler(cluster))
    if not pieces:
        # pd.concat raises on an empty list; return an empty frame instead.
        return dframe.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces, ignore_index=True)
### Demo run: apply remix_overlaps to the toy DataFrame defined above and print the result.
### No random seed is set in the visible code, so the printed row order may differ between runs.
check_works = remix_overlaps(df)
print(check_works)
###
### Proof of principle
### given a pandas DataFrame whereby rows in column1 and column2 represent a genomic interval (start, end):
### if the genomic intervals overlap, take only the rows of these overlapping intervals and re-shuffle the indices of these rows
###
import numpy as np
import pandas as pd
import random
### Toy input: the same nine (start, end) intervals are listed once for
### chromosome 2 and once more for chromosome 14.
df = pd.DataFrame(
    {
        "start": [20, 125, 156, 211, 227, 220, 230, 472, 4765] * 2,
        "end": [52, 162, 195, 250, 338, 251, 248, 515, 8988] * 2,
        "chrom": [2] * 9 + [14] * 9,
    }
)
### Select the columns in a fixed order: start, end, chrom.
df = df[["start", "end", "chrom"]]
def combined_unions(intervals):
    """Merge overlapping (or touching) intervals into their unions.

    Args:
        intervals: Iterable of ``(start, end)`` pairs. Each item only needs
            to support indexing at positions 0 and 1. A one-shot iterator
            such as ``zip(starts, ends)`` is accepted because the input is
            consumed exactly once by ``sorted``.

    Returns:
        A list of ``(start, end)`` tuples ordered by ``start``. Two input
        intervals end up in the same output interval when the later one
        starts at or before the current merged end. Empty input yields an
        empty list.
    """
    # Sort by start only: afterwards an interval can overlap nothing but the
    # most recently emitted merged interval.
    ordered = sorted(intervals, key=lambda pair: pair[0])
    merged_intervals = []
    for interval in ordered:
        start, end = interval[0], interval[1]
        if merged_intervals and start <= merged_intervals[-1][1]:
            # Overlap (or shared endpoint): widen the last merged interval.
            prev_start, prev_end = merged_intervals[-1]
            merged_intervals[-1] = (prev_start, max(prev_end, end))
        else:
            # Always store a fresh tuple so the result never aliases the
            # caller's objects and every element has the same type.
            merged_intervals.append((start, end))
    return merged_intervals
### from sklearn.utils import shuffle
### don't want extra dependency if can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in a random order.

    The shuffle is positional (``iloc``), so it also works when the index
    contains duplicate labels; a label-based ``reindex`` raises in that case.
    Each row keeps its original index label.

    Args:
        dframe: The pandas DataFrame whose rows are to be permuted.

    Returns:
        A new DataFrame with the same rows and columns in permuted order.
        The permutation is drawn from NumPy's global random state
        (``np.random``).
    """
    return dframe.iloc[np.random.permutation(len(dframe))]
def remix_overlaps(dframe):
    """Shuffle rows within each cluster of overlapping intervals, per chromosome.

    For every value of the ``chrom`` column, the ``(start, end)`` intervals
    are merged with ``combined_unions``. Each row is assigned to the merged
    interval containing its ``start``, and the rows of each merged interval
    are shuffled among themselves with ``shuffler``.

    Args:
        dframe: DataFrame with at least the columns ``chrom``, ``start`` and
            ``end``. Any other columns travel along with their rows.

    Returns:
        A new DataFrame with the same columns as ``dframe``. Rows are ordered
        by chromosome, then by merged interval (ascending start), and are in
        random order inside each merged interval. The index is a fresh
        0..n-1 range over the whole result, so labels are unique across
        chromosomes. If there are no chromosome groups, an empty frame with
        the same columns is returned.
    """
    pieces = []  # shuffled clusters, in chromosome order then interval order
    for _, group in dframe.groupby("chrom"):
        merged = combined_unions(zip(group["start"], group["end"]))
        merged_starts = [interval[0] for interval in merged]
        # Merged intervals are disjoint and sorted by start, so a row belongs
        # to the last merged interval whose start is <= the row's start.
        cluster_ids = np.searchsorted(
            merged_starts, group["start"].to_numpy(), side="right"
        ) - 1
        # Iterate the clusters explicitly rather than via groupby.apply, so
        # no temporary grouping column has to be added and dropped again.
        for _, cluster in group.groupby(cluster_ids):
            pieces.append(shuffler(cluster))
    if not pieces:
        # pd.concat raises on an empty list; return an empty frame instead.
        return dframe.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces, ignore_index=True)
### Demo run: apply remix_overlaps to the toy DataFrame defined above and print the result.
### No random seed is set in the visible code, so the printed row order may differ between runs.
check_works = remix_overlaps(df)
print(check_works)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment