Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created December 28, 2017 13:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/e75a855fba9091cf3f962cc9a16994f5 to your computer and use it in GitHub Desktop.
Save afrendeiro/e75a855fba9091cf3f962cc9a16994f5 to your computer and use it in GitHub Desktop.
Get Ensembl regulatory build annotations into bed format
import os
import pandas as pd
# Human
# hg38
ensembl_release = 91
date = "20161111"
organisms = {
"homo_sapiens": "GRCh38",
"mus_musculus": "GRCm38"}
# Cross cell type features
for organism, assembly in organisms.items():
url = os.path.join(
"ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
"regulation/{}".format(organism),
"{}.{}.Regulatory_Build.regulatory_features.{}.gff.gz".format(
organism, assembly, date))
os.system("wget {}".format(url))
os.system("gzip -d {}".format(os.path.basename(url)))
gff_name = os.path.basename(url).replace(".gz", "")
df = pd.read_table(gff_name, header=None)
print("extracting annotation")
# extract further region annotations
annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
annot.columns = ['ID', 'bound_end', 'bound_start', 'description', 'feature_type']
df = df.join(annot)
print("writing to disk")
# output as BED
# all reg. elements
bed = df[[0, 3, 4, "ID"]]
bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)
# Cell type-specific with activity
organism = "homo_sapiens"
assembly = "GRCh38"
cell_types = [
"A549", "Aorta", "B_cells_PB_Roadmap", "CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB",
"CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "CM_CD4_ab_T_cell_VB", "DND_41", "EPC_VB",
"Fetal_Adrenal_Gland", "Fetal_Intestine_Large", "Fetal_Intestine_Small", "Fetal_Muscle_Leg", "Fetal_Muscle_Trunk",
"Fetal_Stomach", "Fetal_Thymus", "GM12878", "Gastric", "H1ESC",
"H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "HMEC",
"HSMM", "HSMMtube", "HUVEC", "HUVEC_prol_CB", "HeLa_S3", "HepG2",
"IMR90", "K562", "Left_Ventricle", "Lung", "M0_macrophage_CB", "M0_macrophage_VB",
"M1_macrophage_CB", "M1_macrophage_VB", "M2_macrophage_CB", "M2_macrophage_VB", "MSC_VB", "Monocytes_CD14",
"Monocytes_CD14_PB_Roadmap", "NHDF_AD", "NHEK", "NHLF", "NH_A", "Natural_Killer_cells_PB",
"Osteobl", "Ovary", "Pancreas", "Placenta", "Psoas_Muscle", "Right_Atrium",
"Small_Intestine", "Spleen", "T_cells_PB_Roadmap", "Thymus", "eosinophil_VB", "erythroblast_CB",
"iPS_20b", "iPS_DF_19_11", "iPS_DF_6_9", "naive_B_cell_VB", "neutrophil_CB", "neutrophil_VB",
"neutrophil_myelocyte_BM"]
for cell_type in cell_types:
print(cell_type)
url = os.path.join(
"ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
"regulation/{}/RegulatoryFeatureActivity/".format(organism),
cell_type,
"{}.{}.{}.Regulatory_Build.regulatory_activity.{}.gff.gz".format(
organism, assembly, cell_type, date))
os.system("wget {}".format(url))
os.system("gzip -d {}".format(os.path.basename(url)))
gff_name = os.path.basename(url).replace(".gz", "")
df = pd.read_table(gff_name, header=None)
print("extracting annotation")
# extract further region annotations
annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
annot.columns = [
'activity', 'bound_end', 'bound_start','description',
'epigenome', 'feature_type', 'regulatory_feature_stable_id']
df = df.join(annot)
print("writing to disk")
# output as BED
# all reg. elements
bed = df[[0, 3, 4, "regulatory_feature_stable_id"]]
bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)
# only active
bed = df.loc[df["activity"] == "ACTIVE", [0, 3, 4, "regulatory_feature_stable_id"]]
bed.to_csv(os.path.join(gff_name.replace(".gff", "active_only.bed")), header=False, index=False)
# GRCh19
ensembl_release = 75
organisms = {
"homo_sapiens": "GRCh19",
"mus_musculus": "GRCm19"}
cell_type = "MultiCell"
# Cross cell type features
for organism, assembly in organisms.items():
url = os.path.join(
"ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
"regulation/{}/RegulatoryFeatures_{}.gff.gz".format(organism, cell_type))
os.system("wget {}".format(url))
os.system("gzip -d {}".format(os.path.basename(url)))
gff_name = os.path.basename(url).replace(".gz", "")
df = pd.read_table(gff_name, header=None)
print("extracting annotation")
# extract further region annotations
annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
annot.columns = [
'name', 'id', 'bound_start','bound_end', 'note']
df = df.join(annot)
print("writing to disk")
# output as BED
# all reg. elements
bed = df[[0, 3, 4, "id"]]
bed.to_csv(os.path.join(".".join([organism, assembly, gff_name.replace(".gff", ".bed")]), header=False, index=False)
os.system("rm {}".format(gff_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment