Skip to content

Instantly share code, notes, and snippets.

@hurrialice
Last active April 7, 2022 02:55
Show Gist options
  • Save hurrialice/43812e5df996c2abce3dd2578cb13d58 to your computer and use it in GitHub Desktop.
Save hurrialice/43812e5df996c2abce3dd2578cb13d58 to your computer and use it in GitHub Desktop.
upset figure from given column
import pandas as pd  # `pd` is used below but was never imported in this file

# Load the merged call table (tab-separated).
# NOTE(review): presumably one row per call per pipeline -- confirm against
# whatever produces tmpout/merged.maflite.
DF = pd.read_csv('tmpout/merged.maflite', sep='\t')

# Number of distinct sample ids reported by each pipeline (notebook display).
DF.groupby(["pipeline"]).sample_id.nunique()

# # Define a "MATCH"
#
# Two rows are treated as the same event when they agree on every column
# listed in IDCOLS; in other words, IDCOLS defines a unique event.
IDCOLS = [
    "contig",
    "startpos",
    "endpos",
    "vtype",
    'altbase',  # this is also important to get right
    "cohort",
    "sample_id",
]

# Collapse to one row per event and record which pipelines reported it.
# The set removes duplicate pipeline names and the sort is applied AFTER
# de-duplication, so the joined label is canonical ("a,b", never "b,a").
# Sorting before building the set (as previously written) had no effect,
# because set iteration order is arbitrary.
unique_DF = DF.groupby(IDCOLS, as_index=False).agg(
    {'pipeline': lambda x: ','.join(sorted(set(x)))}
)

# Sanity checks (notebook display): events per pipeline combination, and
# missing values per column.
unique_DF.pipeline.value_counts()
unique_DF.isna().sum()

# Re-attach the remaining (non-pipeline) columns to each unique event.
# The join keys are spelled out explicitly instead of relying on whichever
# column names the two frames happen to share.
# GroupBy.first() keeps, per column, the first non-null value in the group,
# so an output row may combine values from different source rows.
DF2 = (
    unique_DF.rename(columns={'pipeline': 'called_by'})
    .merge(DF.drop(columns=['pipeline']), how='left', on=IDCOLS)
    .groupby(IDCOLS)
    .first()
    .reset_index()
)
from upsetplot import from_memberships
from upsetplot import plot
def get_overlap(dfa, title):
    """
    Draw an UpSet plot of how many rows each pipeline combination accounts for.

    dfa must have a "called_by" column holding a comma separated list of
    pipeline names (e.g. "washu,getz"). Rows are counted per distinct
    "called_by" value and the counts are plotted with `title` above the
    figure. dfa is returned unchanged.
    """
    # One row per distinct "called_by" string, with its number of occurrences.
    per_combo = (
        dfa.groupby('called_by')
        .size()
        .reset_index(name='count')
        .sort_values('called_by')
    )

    # "a,b" -> ["a", "b"]: the membership lists expected by from_memberships.
    memberships = per_combo.called_by.str.split(",").tolist()
    upset_data = from_memberships(memberships, per_combo['count'])

    plot(
        upset_data,
        sort_by='degree',
        sort_categories_by=None,
        show_counts=True,
    )
    # y > 1 places the title slightly above the axes it is attached to.
    plt.title(title, y=1.08)
    return dfa
@hurrialice
Copy link
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment