-
-
Save evanbiederstedt/73e67c9f3aceca4d656c372eb05a9991 to your computer and use it in GitHub Desktop.
shuffling overlapping intervals by pandas DataFrame rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###
### Proof of principle
### Given a pandas DataFrame in which the "start" and "end" columns of each row describe a genomic interval (start, end):
### if genomic intervals overlap, take only the rows of those overlapping intervals and re-shuffle the order (indices) of those rows.
###
import numpy as np | |
import pandas as pd | |
import random | |
# Demo data: the same nine (start, end) intervals listed once for chromosome 2
# and once for chromosome 14. Several of them overlap (e.g. 211-250, 220-251,
# 227-338 and 230-248).
df = pd.DataFrame(
    {
        "start": [20, 125, 156, 211, 227, 220, 230, 472, 4765, 20, 125, 156, 211, 227, 220, 230, 472, 4765],
        "end": [52, 162, 195, 250, 338, 251, 248, 515, 8988, 52, 162, 195, 250, 338, 251, 248, 515, 8988],
        "chrom": [2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 14, 14, 14, 14, 14, 14, 14, 14],
    }
)
# A throwaway extra column could be added above to check how columns other
# than start/end behave; none is defined here, so this selection only pins the
# column order.
df = df[["start", "end", "chrom"]]
def combined_unions(intervals):
    """Merge overlapping intervals into their unions.

    The intervals are sorted by their first element and swept once: an
    interval that starts at or before the end of the most recently merged
    interval is folded into it, otherwise it opens a new merged interval.
    Intervals that merely touch (``higher[0] == lower[1]``) count as
    overlapping.

    Args:
        intervals: Iterable of ``(start, end)`` pairs. It is consumed once,
            so an iterator such as ``zip(starts, ends)`` is accepted.

    Returns:
        List of merged intervals ordered by start. An interval that absorbed
        nothing is returned as the original object; a merged one is a new
        ``(start, end)`` tuple.
    """
    merged_intervals = []
    for higher in sorted(intervals, key=lambda pair: pair[0]):
        if not merged_intervals:
            merged_intervals.append(higher)
            continue
        lower = merged_intervals[-1]
        # Sorting guarantees lower[0] <= higher[0], so the two overlap exactly
        # when 'higher' starts at or before the end of 'lower'.
        if higher[0] <= lower[1]:
            # Replace the last entry by the combined interval.
            merged_intervals[-1] = (lower[0], max(lower[1], higher[1]))
        else:
            merged_intervals.append(higher)
    return merged_intervals
### sklearn offers this too (from sklearn.utils import shuffle),
### but we don't want the extra dependency if we can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in a random order.

    The permutation is drawn over row positions rather than index labels, so
    frames whose index contains repeated labels are shuffled as well
    (``reindex`` on permuted labels raises for a non-unique index).

    Args:
        dframe: DataFrame to shuffle. It is not modified.

    Returns:
        New DataFrame with the same rows, columns and index labels, with the
        rows reordered using numpy's global random state.
    """
    row_order = np.random.permutation(len(dframe))
    return dframe.iloc[row_order]
def remix_overlaps(dframe):
    """Shuffle the rows of overlapping intervals within each chromosome.

    The rows are split by the "chrom" column. Within each chromosome the
    (start, end) intervals are merged with ``combined_unions``, every row is
    assigned to the merged interval that contains its start, and the rows
    sharing a merged interval are shuffled with ``shuffler``. A row whose
    interval overlaps no other interval forms a group of one and therefore
    stays as it is.

    Args:
        dframe: DataFrame with "start", "end" and "chrom" columns. It is not
            modified. Assumes ``start <= end`` in every row.

    Returns:
        New DataFrame with the same columns. Rows are ordered by chromosome,
        then by merged interval, and randomly within a merged interval. The
        index restarts at 0 for every chromosome, so labels repeat across
        chromosomes. If there are no chromosome groups (e.g. empty input), an
        empty DataFrame with the same columns is returned.
    """
    chrom_list = []  # one shuffled DataFrame per chromosome
    for _, chrom in dframe.groupby("chrom"):
        merged = combined_unions(zip(chrom["start"], chrom["end"]))
        merged_starts = [interval[0] for interval in merged]
        # The merged intervals are disjoint and sorted by start, so the one
        # containing a given start is the last one beginning at or before it.
        interval_ids = (
            np.searchsorted(merged_starts, chrom["start"].to_numpy(), side="right") - 1
        )
        # Iterating the groupby (instead of groupby.apply) keeps all columns
        # in each piece and yields the pieces in increasing interval order.
        pieces = [shuffler(cluster) for _, cluster in chrom.groupby(interval_ids)]
        chrom_list.append(pd.concat(pieces).reset_index(drop=True))
    if not chrom_list:
        # pd.concat raises on an empty list; keep the column layout instead.
        return dframe.iloc[:0].copy()
    return pd.concat(chrom_list)
# Run the demo: shuffle the overlapping intervals of df and print the result.
check_works = remix_overlaps(df)
print(check_works)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###
### Proof of principle
### Given a pandas DataFrame in which the "start" and "end" columns of each row describe a genomic interval (start, end):
### if genomic intervals overlap, take only the rows of those overlapping intervals and re-shuffle the order (indices) of those rows.
###
import numpy as np | |
import pandas as pd | |
import random | |
# Demo data: the same nine (start, end) intervals listed once for chromosome 2
# and once for chromosome 14. Several of them overlap (e.g. 211-250, 220-251,
# 227-338 and 230-248).
df = pd.DataFrame(
    {
        "start": [20, 125, 156, 211, 227, 220, 230, 472, 4765, 20, 125, 156, 211, 227, 220, 230, 472, 4765],
        "end": [52, 162, 195, 250, 338, 251, 248, 515, 8988, 52, 162, 195, 250, 338, 251, 248, 515, 8988],
        "chrom": [2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 14, 14, 14, 14, 14, 14, 14, 14],
    }
)
# A throwaway extra column could be added above to check how columns other
# than start/end behave; none is defined here, so this selection only pins the
# column order.
df = df[["start", "end", "chrom"]]
def combined_unions(intervals):
    """Merge overlapping intervals into their unions.

    The intervals are sorted by their first element and swept once: an
    interval that starts at or before the end of the most recently merged
    interval is folded into it, otherwise it opens a new merged interval.
    Intervals that merely touch (``higher[0] == lower[1]``) count as
    overlapping.

    Args:
        intervals: Iterable of ``(start, end)`` pairs. It is consumed once,
            so an iterator such as ``zip(starts, ends)`` is accepted.

    Returns:
        List of merged intervals ordered by start. An interval that absorbed
        nothing is returned as the original object; a merged one is a new
        ``(start, end)`` tuple.
    """
    merged_intervals = []
    for higher in sorted(intervals, key=lambda pair: pair[0]):
        if not merged_intervals:
            merged_intervals.append(higher)
            continue
        lower = merged_intervals[-1]
        # Sorting guarantees lower[0] <= higher[0], so the two overlap exactly
        # when 'higher' starts at or before the end of 'lower'.
        if higher[0] <= lower[1]:
            # Replace the last entry by the combined interval.
            merged_intervals[-1] = (lower[0], max(lower[1], higher[1]))
        else:
            merged_intervals.append(higher)
    return merged_intervals
### sklearn offers this too (from sklearn.utils import shuffle),
### but we don't want the extra dependency if we can avoid it
def shuffler(dframe):
    """Return the rows of ``dframe`` in a random order.

    The permutation is drawn over row positions rather than index labels, so
    frames whose index contains repeated labels are shuffled as well
    (``reindex`` on permuted labels raises for a non-unique index).

    Args:
        dframe: DataFrame to shuffle. It is not modified.

    Returns:
        New DataFrame with the same rows, columns and index labels, with the
        rows reordered using numpy's global random state.
    """
    row_order = np.random.permutation(len(dframe))
    return dframe.iloc[row_order]
def remix_overlaps(dframe):
    """Shuffle the rows of overlapping intervals within each chromosome.

    The rows are split by the "chrom" column. Within each chromosome the
    (start, end) intervals are merged with ``combined_unions``, every row is
    assigned to the merged interval that contains its start, and the rows
    sharing a merged interval are shuffled with ``shuffler``. A row whose
    interval overlaps no other interval forms a group of one and therefore
    stays as it is.

    Args:
        dframe: DataFrame with "start", "end" and "chrom" columns. It is not
            modified. Assumes ``start <= end`` in every row.

    Returns:
        New DataFrame with the same columns. Rows are ordered by chromosome,
        then by merged interval, and randomly within a merged interval. The
        index restarts at 0 for every chromosome, so labels repeat across
        chromosomes. If there are no chromosome groups (e.g. empty input), an
        empty DataFrame with the same columns is returned.
    """
    chrom_list = []  # one shuffled DataFrame per chromosome
    for _, chrom in dframe.groupby("chrom"):
        merged = combined_unions(zip(chrom["start"], chrom["end"]))
        merged_starts = [interval[0] for interval in merged]
        # The merged intervals are disjoint and sorted by start, so the one
        # containing a given start is the last one beginning at or before it.
        interval_ids = (
            np.searchsorted(merged_starts, chrom["start"].to_numpy(), side="right") - 1
        )
        # Iterating the groupby (instead of groupby.apply) keeps all columns
        # in each piece and yields the pieces in increasing interval order.
        pieces = [shuffler(cluster) for _, cluster in chrom.groupby(interval_ids)]
        chrom_list.append(pd.concat(pieces).reset_index(drop=True))
    if not chrom_list:
        # pd.concat raises on an empty list; keep the column layout instead.
        return dframe.iloc[:0].copy()
    return pd.concat(chrom_list)
# Run the demo: shuffle the overlapping intervals of df and print the result.
check_works = remix_overlaps(df)
print(check_works)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment