afrendeiro/ensembl_reg_build.py

## ensembl_reg_build.py
import os
import pandas as pd

# Human
# hg38
ensembl_release = 91
date = "20161111"

organisms = {
    "homo_sapiens": "GRCh38",
    "mus_musculus": "GRCm38"}

# Cross cell type features
for organism, assembly in organisms.items():
    url = os.path.join(
        "ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
        "regulation/{}".format(organism),
        "{}.{}.Regulatory_Build.regulatory_features.{}.gff.gz".format(
            organism, assembly, date))
    os.system("wget {}".format(url))
    os.system("gzip -d {}".format(os.path.basename(url)))

    gff_name = os.path.basename(url).replace(".gz", "")
    df = pd.read_table(gff_name, header=None)

    print("extracting annotation")
    # extract further region annotations
    annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
    annot.columns = ['ID', 'bound_end', 'bound_start', 'description', 'feature_type']
    df = df.join(annot)

    print("writing to disk")
    # output as BED
    # all reg. elements
    bed = df[[0, 3, 4, "ID"]]
    bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)


# Cell type-specific with activity
organism = "homo_sapiens"
assembly = "GRCh38"
cell_types = [
    "A549", "Aorta", "B_cells_PB_Roadmap", "CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB",
    "CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "CM_CD4_ab_T_cell_VB", "DND_41", "EPC_VB",
    "Fetal_Adrenal_Gland", "Fetal_Intestine_Large", "Fetal_Intestine_Small", "Fetal_Muscle_Leg", "Fetal_Muscle_Trunk",
    "Fetal_Stomach", "Fetal_Thymus", "GM12878", "Gastric", "H1ESC",
    "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "HMEC",
    "HSMM", "HSMMtube", "HUVEC", "HUVEC_prol_CB", "HeLa_S3", "HepG2",
    "IMR90", "K562", "Left_Ventricle", "Lung", "M0_macrophage_CB", "M0_macrophage_VB",
    "M1_macrophage_CB", "M1_macrophage_VB", "M2_macrophage_CB", "M2_macrophage_VB", "MSC_VB", "Monocytes_CD14",
    "Monocytes_CD14_PB_Roadmap", "NHDF_AD", "NHEK", "NHLF", "NH_A", "Natural_Killer_cells_PB",
    "Osteobl", "Ovary", "Pancreas", "Placenta", "Psoas_Muscle", "Right_Atrium",
    "Small_Intestine", "Spleen", "T_cells_PB_Roadmap", "Thymus", "eosinophil_VB", "erythroblast_CB",
    "iPS_20b", "iPS_DF_19_11", "iPS_DF_6_9", "naive_B_cell_VB", "neutrophil_CB", "neutrophil_VB",
    "neutrophil_myelocyte_BM"]

for cell_type in cell_types:
    print(cell_type)

    url = os.path.join(
        "ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
        "regulation/{}/RegulatoryFeatureActivity/".format(organism),
        cell_type,
        "{}.{}.{}.Regulatory_Build.regulatory_activity.{}.gff.gz".format(
            organism, assembly, cell_type, date))
    os.system("wget {}".format(url))
    os.system("gzip -d {}".format(os.path.basename(url)))

    gff_name = os.path.basename(url).replace(".gz", "")
    df = pd.read_table(gff_name, header=None)

    print("extracting annotation")
    # extract further region annotations
    annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
    annot.columns = [
        'activity', 'bound_end', 'bound_start','description',
        'epigenome', 'feature_type', 'regulatory_feature_stable_id']
    df = df.join(annot)

    print("writing to disk")
    # output as BED
    # all reg. elements
    bed = df[[0, 3, 4, "regulatory_feature_stable_id"]]
    bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)

    # only active
    bed = df.loc[df["activity"] == "ACTIVE", [0, 3, 4, "regulatory_feature_stable_id"]]
    bed.to_csv(os.path.join(gff_name.replace(".gff", "active_only.bed")), header=False, index=False)


# GRCh19
ensembl_release = 75
organisms = {
    "homo_sapiens": "GRCh19",
    "mus_musculus": "GRCm19"}
cell_type = "MultiCell"

# Cross cell type features
for organism, assembly in organisms.items():
    url = os.path.join(
        "ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
        "regulation/{}/RegulatoryFeatures_{}.gff.gz".format(organism, cell_type))
    os.system("wget {}".format(url))
    os.system("gzip -d {}".format(os.path.basename(url)))

    gff_name = os.path.basename(url).replace(".gz", "")
    df = pd.read_table(gff_name, header=None)

    print("extracting annotation")
    # extract further region annotations
    annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
    annot.columns = [
        'name', 'id', 'bound_start','bound_end', 'note']
    df = df.join(annot)

    print("writing to disk")
    # output as BED
    # all reg. elements
    bed = df[[0, 3, 4, "id"]]
    bed.to_csv(os.path.join(".".join([organism, assembly, gff_name.replace(".gff", ".bed")]), header=False, index=False)

    os.system("rm {}".format(gff_name))
	import os
	import pandas as pd

	# Human
	# hg38
	ensembl_release = 91
	date = "20161111"

	organisms = {
	"homo_sapiens": "GRCh38",
	"mus_musculus": "GRCm38"}

	# Cross cell type features
	for organism, assembly in organisms.items():
	url = os.path.join(
	"ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
	"regulation/{}".format(organism),
	"{}.{}.Regulatory_Build.regulatory_features.{}.gff.gz".format(
	organism, assembly, date))
	os.system("wget {}".format(url))
	os.system("gzip -d {}".format(os.path.basename(url)))

	gff_name = os.path.basename(url).replace(".gz", "")
	df = pd.read_table(gff_name, header=None)

	print("extracting annotation")
	# extract further region annotations
	annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
	annot.columns = ['ID', 'bound_end', 'bound_start', 'description', 'feature_type']
	df = df.join(annot)

	print("writing to disk")
	# output as BED
	# all reg. elements
	bed = df[[0, 3, 4, "ID"]]
	bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)


	# Cell type-specific with activity
	organism = "homo_sapiens"
	assembly = "GRCh38"
	cell_types = [
	"A549", "Aorta", "B_cells_PB_Roadmap", "CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB",
	"CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "CM_CD4_ab_T_cell_VB", "DND_41", "EPC_VB",
	"Fetal_Adrenal_Gland", "Fetal_Intestine_Large", "Fetal_Intestine_Small", "Fetal_Muscle_Leg", "Fetal_Muscle_Trunk",
	"Fetal_Stomach", "Fetal_Thymus", "GM12878", "Gastric", "H1ESC",
	"H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "HMEC",
	"HSMM", "HSMMtube", "HUVEC", "HUVEC_prol_CB", "HeLa_S3", "HepG2",
	"IMR90", "K562", "Left_Ventricle", "Lung", "M0_macrophage_CB", "M0_macrophage_VB",
	"M1_macrophage_CB", "M1_macrophage_VB", "M2_macrophage_CB", "M2_macrophage_VB", "MSC_VB", "Monocytes_CD14",
	"Monocytes_CD14_PB_Roadmap", "NHDF_AD", "NHEK", "NHLF", "NH_A", "Natural_Killer_cells_PB",
	"Osteobl", "Ovary", "Pancreas", "Placenta", "Psoas_Muscle", "Right_Atrium",
	"Small_Intestine", "Spleen", "T_cells_PB_Roadmap", "Thymus", "eosinophil_VB", "erythroblast_CB",
	"iPS_20b", "iPS_DF_19_11", "iPS_DF_6_9", "naive_B_cell_VB", "neutrophil_CB", "neutrophil_VB",
	"neutrophil_myelocyte_BM"]

	for cell_type in cell_types:
	print(cell_type)

	url = os.path.join(
	"ftp://ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
	"regulation/{}/RegulatoryFeatureActivity/".format(organism),
	cell_type,
	"{}.{}.{}.Regulatory_Build.regulatory_activity.{}.gff.gz".format(
	organism, assembly, cell_type, date))
	os.system("wget {}".format(url))
	os.system("gzip -d {}".format(os.path.basename(url)))

	gff_name = os.path.basename(url).replace(".gz", "")
	df = pd.read_table(gff_name, header=None)

	print("extracting annotation")
	# extract further region annotations
	annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
	annot.columns = [
	'activity', 'bound_end', 'bound_start','description',
	'epigenome', 'feature_type', 'regulatory_feature_stable_id']
	df = df.join(annot)

	print("writing to disk")
	# output as BED
	# all reg. elements
	bed = df[[0, 3, 4, "regulatory_feature_stable_id"]]
	bed.to_csv(os.path.join(gff_name.replace(".gff", ".bed")), header=False, index=False)

	# only active
	bed = df.loc[df["activity"] == "ACTIVE", [0, 3, 4, "regulatory_feature_stable_id"]]
	bed.to_csv(os.path.join(gff_name.replace(".gff", "active_only.bed")), header=False, index=False)


	# GRCh19
	ensembl_release = 75
	organisms = {
	"homo_sapiens": "GRCh19",
	"mus_musculus": "GRCm19"}
	cell_type = "MultiCell"

	# Cross cell type features
	for organism, assembly in organisms.items():
	url = os.path.join(
	"ftp.ensembl.org/pub/release-{}/".format(ensembl_release),
	"regulation/{}/RegulatoryFeatures_{}.gff.gz".format(organism, cell_type))
	os.system("wget {}".format(url))
	os.system("gzip -d {}".format(os.path.basename(url)))

	gff_name = os.path.basename(url).replace(".gz", "")
	df = pd.read_table(gff_name, header=None)

	print("extracting annotation")
	# extract further region annotations
	annot = df[8].str.split(";").apply(pd.Series).stack().str.replace(".*=", "").unstack()
	annot.columns = [
	'name', 'id', 'bound_start','bound_end', 'note']
	df = df.join(annot)

	print("writing to disk")
	# output as BED
	# all reg. elements
	bed = df[[0, 3, 4, "id"]]
	bed.to_csv(os.path.join(".".join([organism, assembly, gff_name.replace(".gff", ".bed")]), header=False, index=False)

	os.system("rm {}".format(gff_name))