-
-
Save hurrialice/43812e5df996c2abce3dd2578cb13d58 to your computer and use it in GitHub Desktop.
upset figure from given column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the merged, tab-separated MAF-lite table.
# NOTE(review): presumably one row per variant call per pipeline -- confirm.
DF = pd.read_csv('tmpout/merged.maflite', sep='\t')

# Number of distinct samples seen by each pipeline (notebook-style inspection).
DF.groupby(["pipeline"]).sample_id.nunique()

# # define a "MATCH"
#
# In this section a set of columns is used to define what counts as the same
# event; rows that agree on every column in IDCOLS are treated as one event.
IDCOLS = [
    "contig",
    "startpos",
    "endpos",
    "vtype",
    "altbase",  # this is also important to get right
    "cohort",
    "sample_id",
]

# Collapse to one row per event, recording every pipeline that reported it as
# a comma-separated string. De-duplicate first and sort afterwards: sorting
# before building the set throws the order away, which made the label order
# arbitrary (and able to change between runs).
unique_DF = DF.groupby(IDCOLS, as_index=False).agg(
    {"pipeline": lambda x: ",".join(sorted(set(x)))}
)

# Inspection: how many events fall into each pipeline combination, and
# whether any column of the collapsed table contains missing values.
unique_DF.pipeline.value_counts()
unique_DF.isna().sum()

# Attach the remaining (non-pipeline) columns back onto each unique event.
# The merge has no explicit `on=`, so it joins on all shared column names.
# Several source rows can match one event, so keep the first per IDCOLS group.
DF2 = (
    unique_DF.rename(columns={"pipeline": "called_by"})
    .merge(DF.drop(columns=["pipeline"]), how="left")
    .groupby(IDCOLS)
    .first()
    .reset_index()
)
from upsetplot import from_memberships | |
from upsetplot import plot | |
def get_overlap(dfa, title):
    """
    Draw an UpSet plot of how many rows fall into each pipeline combination.

    dfa must have a "called_by" column holding a comma separated list of
    pipelines (e.g. "washu,getz"). Rows are counted per distinct "called_by"
    value and those counts are plotted, with `title` set as the plot title.

    Returns dfa unchanged.
    """
    # One row per distinct pipeline combination, with its row count.
    combo_sizes = (
        dfa.groupby('called_by')
        .size()
        .reset_index(name='count')
        .sort_values('called_by')
    )

    # Split each "a,b" label into its member pipelines for upsetplot.
    memberships = combo_sizes.called_by.str.split(",").tolist()
    upset_data = from_memberships(memberships, combo_sizes['count'])

    # upset plot
    plot(
        upset_data,
        sort_by='degree',
        sort_categories_by=None,
        show_counts=True,
    )
    # y > 1 places the title above the axes it is attached to.
    plt.title(title, y=1.08)
    return dfa
Author
hurrialice
commented
Apr 6, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment