Skip to content

Instantly share code, notes, and snippets.

@jluebeck
Last active September 28, 2023 18:12
Show Gist options
  • Save jluebeck/f3aa9247f845f574efc4f0ac8860dc0b to your computer and use it in GitHub Desktop.
Save jluebeck/f3aa9247f845f574efc4f0ac8860dc0b to your computer and use it in GitHub Desktop.
istat_overlap_karyogram_plot.py
#!/usr/bin/env python
'''
REQUIREMENTS:
'AA_DATA_REPO' environment variable: https://github.com/AmpliconSuite/AmpliconSuite-pipeline
'CV_SRC' (cycleviz) environent variable: https://github.com/AmpliconSuite/CycleViz
matplotlib
numpy
intervaltree
# please see comments below for things that must be changed to use the script.
'''
import argparse
from collections import defaultdict
import copy
import os
import sys
from ast import literal_eval as make_tuple
from intervaltree import IntervalTree
import matplotlib
matplotlib.use('Agg') # this import must happen immediately after importing matplotlib
from matplotlib import pyplot as plt
from matplotlib import rcParams
from matplotlib.collections import LineCollection
from matplotlib.collections import PatchCollection
from matplotlib.font_manager import FontProperties
from matplotlib.patches import FancyBboxPatch
import matplotlib.patches as mpatches
import matplotlib.transforms as mtransforms
from matplotlib.path import Path
import numpy as np
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
matplotlib.rcParams['pdf.fonttype'] = 42
# replace as needed for GRCh38 or other ref
gfile = os.environ['AA_DATA_REPO'] + "/hg19/human_hg19_september_2011/Genes_July_2010_hg19.gff"
structure_file = os.environ["CV_SRC"] + "/resources/hg19_structure.bed"
# this would be a file of oncogenes (one gene per line)
combined_set = set()
with open("../../annotations/combined_oncogene_list.txt") as infile:
for line in infile:
fields = line.rstrip().rsplit("\t")
combined_set.add(fields[0])
print(len(combined_set))
# this function maps genes to locations
def parse_genes(gene_file, gset):
print("reading " + gene_file)
t = defaultdict(IntervalTree)
seenNames = set()
with open(gene_file) as infile:
for line in infile:
if line.startswith("#"):
continue
fields = line.rstrip().split()
if not fields:
continue
chrom, s, e, strand = fields[0], int(fields[3]), int(fields[4]), fields[6]
# parse the line and get the name
propFields = {x.split("=")[0]: x.split("=")[1] for x in fields[-1].rstrip(";").split(";")}
gname = propFields["Name"]
is_other_feature = (gname.startswith("LOC") or gname.startswith("LINC") or gname.startswith("MIR"))
if gname not in seenNames and gname in gset:
seenNames.add(gname)
t[chrom].addi(s,e, gname)
print("read " + str(len(seenNames)) + " genes\n")
return t
gtree = parse_genes(gfile, combined_set)
# this code draws the karyotype plots.
chrlist = []
chrlens = []
with open(structure_file) as infile:
for line in infile:
fields = line.rsplit("\t")
chrlist.append(fields[0])
chrlens.append(int(fields[2]))
chr_total_len = sum(chrlens)
spacing_prop = 0.01
n_entries = len(chrlist)
spacing = spacing_prop*chr_total_len
total_spacing = spacing*n_entries
plot_len = chr_total_len + total_spacing
height = 0.02*plot_len
# place each entry
chr_placements = [0]
for x in chrlens[:-1]:
chr_placements.append(chr_placements[-1] + spacing + x)
print(list(zip(chrlist,chr_placements)))
chrom_to_items = defaultdict(list)
with open(os.environ["CV_SRC"] + "/resources/cytoBand_hg19_colored.bed") as infile:
for line in infile:
fields = line.rstrip().rsplit("\t")
chrom_to_items[fields[0]].append((int(fields[1]), int(fields[2]), fields[4]))
# read the ecdna / oncogene / overlap regions
# this is a bed file of all the ecDNA regions
ecregiond = defaultdict(list)
with open("../../classification/combined_merged_flattened_ecdna_intervals.bed") as infile:
for line in infile:
fields = line.rstrip().rsplit()
ecregiond[fields[0]].append((int(fields[1]), int(fields[2])))
# this is a bed file of all the oncogene genome regions
oncogened = defaultdict(list)
with open("../../annotations/frankel_paulson_stachler_intervals.bed") as infile:
for line in infile:
fields = line.rstrip().rsplit()
oncogened[fields[0]].append((int(fields[1]), int(fields[2])))
# this is a bed file of the overlap between ecDNA and oncogene regions
overlapd = defaultdict(list)
with open("../../ecDNA_fsp_overlap.bed") as infile:
for line in infile:
fields = line.rstrip().rsplit()
overlapd[fields[0]].append((int(fields[1]), int(fields[2])))
def add_fancy_patch_around(ax, bb, **kwargs):
fancy = FancyBboxPatch((bb.xmin, bb.ymin), bb.width, bb.height,
fc=(0, 0, 0, 0), ec=(0.2, 0.2, 0.2, 1),
**kwargs)
ax.add_patch(fancy)
return fancy
'''
This code below draws the plots
'''
fig, ax = plt.subplots(figsize=(8, 3.25))
# draw karyogram
for chrom, s,l in zip(chrlist, chr_placements, chrlens):
bb = mtransforms.Bbox([[s, 0], [s+l, height]])
fancy = add_fancy_patch_around(ax, bb, boxstyle="round,rounding_size=12000000", linewidth=0.6)
ax.text(s + l/2, height*1.2, chrom.lstrip("chr"), ha='center', fontsize=6)
for chrom, s in zip(chrlist, chr_placements):
bandlist = chrom_to_items[chrom]
for b in bandlist:
ctup = make_tuple("".join(b[2].split())) #index | other_value | color_tuple
if any([x > 1 for x in ctup]):
ctup = tuple([x/255.0 for x in ctup])
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,height*0.04), b[1]-b[0], height*0.915,
facecolor=ctup, edgecolor='none', zorder=-1))
# draw ecdna regions
fheight = height*1.1
for chrom, s in zip(chrlist, chr_placements):
bandlist = ecregiond[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,-1.5*height), b[1]-b[0], fheight,
facecolor='r', edgecolor='r', linewidth=0.2, zorder=-1))
# for chrom, s in zip(chrlist, chr_placements):
# bandlist = ongened[chrom]
# for b in bandlist:
# offset = s + b[0]
# ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue',
# edgecolor='blue', linewidth=0.1, zorder=-1))
# draw oncogene regions
for chrom, s in zip(chrlist, chr_placements):
bandlist = oncogened[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue',
edgecolor='blue', linewidth=0.2, zorder=-1))
# for chrom, s in zip(chrlist, chr_placements):
# bandlist = all_ec_overlap[chrom]
# for b in bandlist:
# offset = s + b[0]
# ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='indigo',
# edgecolor='indigo', linewidth=0.1, zorder=-1))
# draw overlap regoins
for chrom, s in zip(chrlist, chr_placements):
bandlist = overlapd[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='purple',
edgecolor='purple', linewidth=0.2, zorder=-1))
ax.set_aspect(1.0)
plt.axis('off')
ax.set_xlim(-spacing, plot_len+spacing)
ax.set_ylim(-5*height, height*1.2)
plt.savefig("linear_ecdna_overlaps.png",bbox_inches='tight', dpi=300)
plt.savefig("linear_ecdna_overlaps.pdf", bbox_inches='tight')
'''
The code below will draw this plot for one chromosome only (chr17 by default)
'''
chrlist = []
chrlens = []
# change to hg38 as needed
with open(structure_file) as infile:
for line in infile:
fields = line.rsplit("\t")
if fields[0] != "chr17":
continue
chrlist.append(fields[0])
chrlens.append(int(fields[2]))
chr_total_len = chrlens[chrlist.index('chr17')]
spacing_prop = 0.01
n_entries = 1
spacing = spacing_prop*chr_total_len
total_spacing = spacing*n_entries
plot_len = chr_total_len + total_spacing
height = 0.02*plot_len
# place each entry
chr_placements = [0]
print(list(zip(chrlist,chr_placements)))
fig, ax = plt.subplots(figsize=(8, 4.25))
# plt.subplots_adjust(bottom=0.15, wspace=0.05)
for chrom, s, l in zip(chrlist, chr_placements, chrlens):
bb = mtransforms.Bbox([[s, 0], [s+l, height]])
fancy = add_fancy_patch_around(ax, bb, boxstyle="round,rounding_size=800000", linewidth=0.6)
ax.text(s + l/2, height*1.2, chrom.lstrip("chr"), ha='center', fontsize=6)
for chrom, s in zip(chrlist, chr_placements):
bandlist = chrom_to_items[chrom]
for b in bandlist:
ctup = make_tuple("".join(b[2].split())) #index | other_value | color_tuple
if any([x > 1 for x in ctup]):
ctup = tuple([x/255.0 for x in ctup])
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,height*0.04), b[1]-b[0], height*0.915,
facecolor=ctup, edgecolor='none', zorder=-1))
# ecdna regions
fheight = height*1.1
for chrom, s in zip(chrlist, chr_placements):
bandlist = ecregiond[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,-1.5*height), b[1]-b[0], fheight,
facecolor='r', edgecolor='r', linewidth=0.2, zorder=-1))
for chrom, s in zip(chrlist, chr_placements):
bandlist = oncogened[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue',
edgecolor='blue', linewidth=0.2, zorder=-1))
genes = sorted([x.data for x in gtree[chrom][b[0]:b[1]]])
print(genes)
print("")
if len(genes) > 1:
genestring = "\n".join(genes)
else:
genestring = genes[0]
mult=3
# can remove, this just spreads some of the names for readability
ha='center'
if genestring == "ERBB2":
# mult=3.5
ha='center'
elif genestring == "CDK12":
ha='right'
ax.text(offset + (b[1]-b[0])/2.0, (-3.5*height - mult*fheight), genestring, ha=ha, fontsize=5,
rotation=45, va='bottom')
for chrom, s in zip(chrlist, chr_placements):
bandlist = overlapd[chrom]
for b in bandlist:
offset = s + b[0]
ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='purple',
edgecolor='purple', linewidth=0.2, zorder=-1))
ax.set_aspect(1.0)
plt.axis('off')
ax.set_xlim(-spacing, plot_len+spacing)
ax.set_ylim(-5*height, height*1.2)
plt.savefig("chr17_linear_ecdna_overlaps.png",bbox_inches='tight', dpi=300)
plt.savefig("chr17_linear_ecdna_overlaps.pdf", bbox_inches='tight')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment