Skip to content

Instantly share code, notes, and snippets.

@julianhess
Last active April 13, 2021 01:17
Show Gist options
  • Save julianhess/b2bdb38733f3c61885c2564a17d53c12 to your computer and use it in GitHub Desktop.
Save julianhess/b2bdb38733f3c61885c2564a17d53c12 to your computer and use it in GitHub Desktop.
Parse chromosome arm intervals from the cytoBand.txt file supplied with a reference build
import numpy as np
import pandas as pd
# Integer codes for the primary chromosomes:
# "chr1" .. "chr22" -> 1 .. 22, "chrX" -> 23, "chrY" -> 24.
_chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25)))


def parse_cytoband(cytoband):
    """Return the two arm intervals of every chromosome in a cytoband table.

    Parameters
    ----------
    cytoband
        Path or file-like object passed straight to ``pandas.read_csv``.
        It is read as a header-less, tab-separated table with the columns
        ``chr``, ``start``, ``end``, ``band`` and ``stain``.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``chr``, ``start``, ``end`` with two rows per
        chromosome, ordered by chromosome code (see ``_chrmap``):

        * ``(chr, 0, start of the first "acen" band)``
        * ``(chr, end of the last "acen" band, largest band end)``

        Bands stained ``"acen"`` are taken to be the centromere, as in UCSC
        cytoBand files. The table is empty when no row belongs to a
        chromosome listed in ``_chrmap``.

    Raises
    ------
    ValueError
        If a chromosome present in the table has no ``"acen"`` band, so its
        arms cannot be delimited.
    """
    cband = pd.read_csv(
        cytoband, sep="\t", names=["chr", "start", "end", "band", "stain"]
    )

    # Contigs without an integer code (chrM, alt/random/unplaced scaffolds)
    # carry no arm information; drop them instead of failing on the lookup.
    cband = cband[cband["chr"].isin(list(_chrmap))].copy()
    cband["chr"] = cband["chr"].map(_chrmap)

    # Work chromosome by chromosome so the result depends neither on the row
    # order of the file nor on which chromosomes happen to be present.
    arms = []
    for code, bands in cband.groupby("chr", sort=True):
        acen = bands[bands["stain"] == "acen"]
        if acen.empty:
            raise ValueError(
                "no 'acen' band found for chromosome code {}".format(code)
            )
        arms.append((code, 0, acen["start"].min()))
        arms.append((code, acen["end"].max(), bands["end"].max()))

    # reshape keeps the three-column layout even when `arms` is empty.
    return pd.DataFrame(
        np.array(arms, dtype=int).reshape(-1, 3),
        columns=["chr", "start", "end"],
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment