Skip to content

Instantly share code, notes, and snippets.

@wbgalvao
Last active August 14, 2020 18:07
Show Gist options
  • Save wbgalvao/924b6aad83af0e59fa7d690644555a42 to your computer and use it in GitHub Desktop.
Save wbgalvao/924b6aad83af0e59fa7d690644555a42 to your computer and use it in GitHub Desktop.
Transform .bed files genomic coordinates from hg38 to hg19
import os
import sys
import pandas as pd
from pyliftover import LiftOver
target = sys.argv[1]
lo = LiftOver("hg38", "hg19")
unconverted_regions = open(
".".join(
[
os.path.basename(os.path.splitext(target)[0]),
"unconverted_regions",
"bed",
]
),
"w",
)
def get_hg19_coordinate(row):
chr = row["chromosome"]
start = row["start"]
end = row["end"]
try:
return lo.convert_coordinate(chr, start, end)[0]
except:
unconverted_regions.write(
"{}\t{}\t{}\n".format(chr, str(start), str(end))
)
return None
def get_region_notation(row):
notation = row["notation"]
region_notation = "|".join(notation.split("|")[:4])
return region_notation
def add_region_count(row):
notation = row["notation"]
count = row["region_count"]
return "{}|{}".format(count, notation)
hg38 = pd.read_csv(
target,
sep="\t",
header=None,
names=["chromosome", "start", "end", "notation"],
skiprows=2,
)
# Create new column in dataframe for identifying coordinates in the same gene
hg38["region_notation"] = hg38.apply(
lambda row: get_region_notation(row), axis=1
)
# Create new column in dataframe for enumerating coordinates in the same gene
hg38["region_count"] = hg38.groupby(["region_notation"]).cumcount()
# Add numeration to the "notation" column
hg38["notation"] = hg38.apply(lambda row: add_region_count(row), axis=1)
# Use LiftOver to convert current genomic coordinates - store results
# in the "converted_coordinates" column
hg38["converted_coordinates"] = hg38.apply(
lambda row: get_hg19_coordinate(row), axis=1
)
# Drop rows where the conversion could not be made
hg38 = hg38[hg38.converted_coordinates.notnull()]
# Split the "converted_coordinates" content into its individual columns
hg38["hg19_chromosome"] = hg38.apply(
lambda row: row["converted_coordinates"][0], axis=1
)
hg38["hg19_start"] = hg38.apply(
lambda row: row["converted_coordinates"][1], axis=1
)
hg38["hg19_end"] = hg38.apply(
lambda row: row["converted_coordinates"][3], axis=1
)
# Create new bed file with converted genomic coordinates
hg38.to_csv(
".".join([os.path.basename(os.path.splitext(target)[0]), "hg19", "bed"]),
sep="\t",
index=False,
header=False,
columns=["hg19_chromosome", "hg19_start", "hg19_end", "notation"],
)
unconverted_regions.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment