Last active
September 28, 2023 18:12
-
-
Save jluebeck/f3aa9247f845f574efc4f0ac8860dc0b to your computer and use it in GitHub Desktop.
istat_overlap_karyogram_plot.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
REQUIREMENTS: | |
'AA_DATA_REPO' environment variable: https://github.com/AmpliconSuite/AmpliconSuite-pipeline | |
'CV_SRC' (cycleviz) environent variable: https://github.com/AmpliconSuite/CycleViz | |
matplotlib | |
numpy | |
intervaltree | |
# please see comments below for things that must be changed to use the script. | |
''' | |
import argparse | |
from collections import defaultdict | |
import copy | |
import os | |
import sys | |
from ast import literal_eval as make_tuple | |
from intervaltree import IntervalTree | |
import matplotlib | |
matplotlib.use('Agg') # this import must happen immediately after importing matplotlib | |
from matplotlib import pyplot as plt | |
from matplotlib import rcParams | |
from matplotlib.collections import LineCollection | |
from matplotlib.collections import PatchCollection | |
from matplotlib.font_manager import FontProperties | |
from matplotlib.patches import FancyBboxPatch | |
import matplotlib.patches as mpatches | |
import matplotlib.transforms as mtransforms | |
from matplotlib.path import Path | |
import numpy as np | |
rcParams['font.family'] = 'sans-serif' | |
rcParams['font.sans-serif'] = ['Arial'] | |
matplotlib.rcParams['pdf.fonttype'] = 42 | |
# replace as needed for GRCh38 or other ref | |
gfile = os.environ['AA_DATA_REPO'] + "/hg19/human_hg19_september_2011/Genes_July_2010_hg19.gff" | |
structure_file = os.environ["CV_SRC"] + "/resources/hg19_structure.bed" | |
# this would be a file of oncogenes (one gene per line) | |
combined_set = set() | |
with open("../../annotations/combined_oncogene_list.txt") as infile: | |
for line in infile: | |
fields = line.rstrip().rsplit("\t") | |
combined_set.add(fields[0]) | |
print(len(combined_set)) | |
# this function maps genes to locations | |
def parse_genes(gene_file, gset): | |
print("reading " + gene_file) | |
t = defaultdict(IntervalTree) | |
seenNames = set() | |
with open(gene_file) as infile: | |
for line in infile: | |
if line.startswith("#"): | |
continue | |
fields = line.rstrip().split() | |
if not fields: | |
continue | |
chrom, s, e, strand = fields[0], int(fields[3]), int(fields[4]), fields[6] | |
# parse the line and get the name | |
propFields = {x.split("=")[0]: x.split("=")[1] for x in fields[-1].rstrip(";").split(";")} | |
gname = propFields["Name"] | |
is_other_feature = (gname.startswith("LOC") or gname.startswith("LINC") or gname.startswith("MIR")) | |
if gname not in seenNames and gname in gset: | |
seenNames.add(gname) | |
t[chrom].addi(s,e, gname) | |
print("read " + str(len(seenNames)) + " genes\n") | |
return t | |
gtree = parse_genes(gfile, combined_set) | |
# this code draws the karyotype plots. | |
chrlist = [] | |
chrlens = [] | |
with open(structure_file) as infile: | |
for line in infile: | |
fields = line.rsplit("\t") | |
chrlist.append(fields[0]) | |
chrlens.append(int(fields[2])) | |
chr_total_len = sum(chrlens) | |
spacing_prop = 0.01 | |
n_entries = len(chrlist) | |
spacing = spacing_prop*chr_total_len | |
total_spacing = spacing*n_entries | |
plot_len = chr_total_len + total_spacing | |
height = 0.02*plot_len | |
# place each entry | |
chr_placements = [0] | |
for x in chrlens[:-1]: | |
chr_placements.append(chr_placements[-1] + spacing + x) | |
print(list(zip(chrlist,chr_placements))) | |
chrom_to_items = defaultdict(list) | |
with open(os.environ["CV_SRC"] + "/resources/cytoBand_hg19_colored.bed") as infile: | |
for line in infile: | |
fields = line.rstrip().rsplit("\t") | |
chrom_to_items[fields[0]].append((int(fields[1]), int(fields[2]), fields[4])) | |
# read the ecdna / oncogene / overlap regions | |
# this is a bed file of all the ecDNA regions | |
ecregiond = defaultdict(list) | |
with open("../../classification/combined_merged_flattened_ecdna_intervals.bed") as infile: | |
for line in infile: | |
fields = line.rstrip().rsplit() | |
ecregiond[fields[0]].append((int(fields[1]), int(fields[2]))) | |
# this is a bed file of all the oncogene genome regions | |
oncogened = defaultdict(list) | |
with open("../../annotations/frankel_paulson_stachler_intervals.bed") as infile: | |
for line in infile: | |
fields = line.rstrip().rsplit() | |
oncogened[fields[0]].append((int(fields[1]), int(fields[2]))) | |
# this is a bed file of the overlap between ecDNA and oncogene regions | |
overlapd = defaultdict(list) | |
with open("../../ecDNA_fsp_overlap.bed") as infile: | |
for line in infile: | |
fields = line.rstrip().rsplit() | |
overlapd[fields[0]].append((int(fields[1]), int(fields[2]))) | |
def add_fancy_patch_around(ax, bb, **kwargs): | |
fancy = FancyBboxPatch((bb.xmin, bb.ymin), bb.width, bb.height, | |
fc=(0, 0, 0, 0), ec=(0.2, 0.2, 0.2, 1), | |
**kwargs) | |
ax.add_patch(fancy) | |
return fancy | |
''' | |
This code below draws the plots | |
''' | |
fig, ax = plt.subplots(figsize=(8, 3.25)) | |
# draw karyogram | |
for chrom, s,l in zip(chrlist, chr_placements, chrlens): | |
bb = mtransforms.Bbox([[s, 0], [s+l, height]]) | |
fancy = add_fancy_patch_around(ax, bb, boxstyle="round,rounding_size=12000000", linewidth=0.6) | |
ax.text(s + l/2, height*1.2, chrom.lstrip("chr"), ha='center', fontsize=6) | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = chrom_to_items[chrom] | |
for b in bandlist: | |
ctup = make_tuple("".join(b[2].split())) #index | other_value | color_tuple | |
if any([x > 1 for x in ctup]): | |
ctup = tuple([x/255.0 for x in ctup]) | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,height*0.04), b[1]-b[0], height*0.915, | |
facecolor=ctup, edgecolor='none', zorder=-1)) | |
# draw ecdna regions | |
fheight = height*1.1 | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = ecregiond[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,-1.5*height), b[1]-b[0], fheight, | |
facecolor='r', edgecolor='r', linewidth=0.2, zorder=-1)) | |
# for chrom, s in zip(chrlist, chr_placements): | |
# bandlist = ongened[chrom] | |
# for b in bandlist: | |
# offset = s + b[0] | |
# ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue', | |
# edgecolor='blue', linewidth=0.1, zorder=-1)) | |
# draw oncogene regions | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = oncogened[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue', | |
edgecolor='blue', linewidth=0.2, zorder=-1)) | |
# for chrom, s in zip(chrlist, chr_placements): | |
# bandlist = all_ec_overlap[chrom] | |
# for b in bandlist: | |
# offset = s + b[0] | |
# ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='indigo', | |
# edgecolor='indigo', linewidth=0.1, zorder=-1)) | |
# draw overlap regoins | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = overlapd[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='purple', | |
edgecolor='purple', linewidth=0.2, zorder=-1)) | |
ax.set_aspect(1.0) | |
plt.axis('off') | |
ax.set_xlim(-spacing, plot_len+spacing) | |
ax.set_ylim(-5*height, height*1.2) | |
plt.savefig("linear_ecdna_overlaps.png",bbox_inches='tight', dpi=300) | |
plt.savefig("linear_ecdna_overlaps.pdf", bbox_inches='tight') | |
''' | |
The code below will draw this plot for one chromosome only (chr17 by default) | |
''' | |
chrlist = [] | |
chrlens = [] | |
# change to hg38 as needed | |
with open(structure_file) as infile: | |
for line in infile: | |
fields = line.rsplit("\t") | |
if fields[0] != "chr17": | |
continue | |
chrlist.append(fields[0]) | |
chrlens.append(int(fields[2])) | |
chr_total_len = chrlens[chrlist.index('chr17')] | |
spacing_prop = 0.01 | |
n_entries = 1 | |
spacing = spacing_prop*chr_total_len | |
total_spacing = spacing*n_entries | |
plot_len = chr_total_len + total_spacing | |
height = 0.02*plot_len | |
# place each entry | |
chr_placements = [0] | |
print(list(zip(chrlist,chr_placements))) | |
fig, ax = plt.subplots(figsize=(8, 4.25)) | |
# plt.subplots_adjust(bottom=0.15, wspace=0.05) | |
for chrom, s, l in zip(chrlist, chr_placements, chrlens): | |
bb = mtransforms.Bbox([[s, 0], [s+l, height]]) | |
fancy = add_fancy_patch_around(ax, bb, boxstyle="round,rounding_size=800000", linewidth=0.6) | |
ax.text(s + l/2, height*1.2, chrom.lstrip("chr"), ha='center', fontsize=6) | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = chrom_to_items[chrom] | |
for b in bandlist: | |
ctup = make_tuple("".join(b[2].split())) #index | other_value | color_tuple | |
if any([x > 1 for x in ctup]): | |
ctup = tuple([x/255.0 for x in ctup]) | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,height*0.04), b[1]-b[0], height*0.915, | |
facecolor=ctup, edgecolor='none', zorder=-1)) | |
# ecdna regions | |
fheight = height*1.1 | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = ecregiond[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,-1.5*height), b[1]-b[0], fheight, | |
facecolor='r', edgecolor='r', linewidth=0.2, zorder=-1)) | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = oncogened[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,(-2*height - fheight)), b[1]-b[0], fheight, facecolor='blue', | |
edgecolor='blue', linewidth=0.2, zorder=-1)) | |
genes = sorted([x.data for x in gtree[chrom][b[0]:b[1]]]) | |
print(genes) | |
print("") | |
if len(genes) > 1: | |
genestring = "\n".join(genes) | |
else: | |
genestring = genes[0] | |
mult=3 | |
# can remove, this just spreads some of the names for readability | |
ha='center' | |
if genestring == "ERBB2": | |
# mult=3.5 | |
ha='center' | |
elif genestring == "CDK12": | |
ha='right' | |
ax.text(offset + (b[1]-b[0])/2.0, (-3.5*height - mult*fheight), genestring, ha=ha, fontsize=5, | |
rotation=45, va='bottom') | |
for chrom, s in zip(chrlist, chr_placements): | |
bandlist = overlapd[chrom] | |
for b in bandlist: | |
offset = s + b[0] | |
ax.add_patch(mpatches.Rectangle((offset,(-2.5*height - 2*fheight)), b[1]-b[0], fheight, facecolor='purple', | |
edgecolor='purple', linewidth=0.2, zorder=-1)) | |
ax.set_aspect(1.0) | |
plt.axis('off') | |
ax.set_xlim(-spacing, plot_len+spacing) | |
ax.set_ylim(-5*height, height*1.2) | |
plt.savefig("chr17_linear_ecdna_overlaps.png",bbox_inches='tight', dpi=300) | |
plt.savefig("chr17_linear_ecdna_overlaps.pdf", bbox_inches='tight') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment