wbgalvao/hg38_to_hg19.py

## hg38_to_hg19.py
import os
import sys

import pandas as pd

from pyliftover import LiftOver


target = sys.argv[1]

lo = LiftOver("hg38", "hg19")
unconverted_regions = open(
    ".".join(
        [
            os.path.basename(os.path.splitext(target)[0]),
            "unconverted_regions",
            "bed",
        ]
    ),
    "w",
)


def get_hg19_coordinate(row):
    chr = row["chromosome"]
    start = row["start"]
    end = row["end"]
    try:
        return lo.convert_coordinate(chr, start, end)[0]
    except:
        unconverted_regions.write(
            "{}\t{}\t{}\n".format(chr, str(start), str(end))
        )
        return None


def get_region_notation(row):
    notation = row["notation"]
    region_notation = "|".join(notation.split("|")[:4])
    return region_notation


def add_region_count(row):
    notation = row["notation"]
    count = row["region_count"]
    return "{}|{}".format(count, notation)


hg38 = pd.read_csv(
    target,
    sep="\t",
    header=None,
    names=["chromosome", "start", "end", "notation"],
    skiprows=2,
)


# Create new column in dataframe for identifying coordinates in the same gene
hg38["region_notation"] = hg38.apply(
    lambda row: get_region_notation(row), axis=1
)

# Create new column in dataframe for enumerating coordinates in the same gene
hg38["region_count"] = hg38.groupby(["region_notation"]).cumcount()

# Add numeration to the "notation" column
hg38["notation"] = hg38.apply(lambda row: add_region_count(row), axis=1)

# Use LiftOver to convert current genomic coordinates - store results
# in the "converted_coordinates" column
hg38["converted_coordinates"] = hg38.apply(
    lambda row: get_hg19_coordinate(row), axis=1
)

# Drop rows where the conversion could not be made
hg38 = hg38[hg38.converted_coordinates.notnull()]

# Split the "converted_coordinates" content into its individual columns
hg38["hg19_chromosome"] = hg38.apply(
    lambda row: row["converted_coordinates"][0], axis=1
)
hg38["hg19_start"] = hg38.apply(
    lambda row: row["converted_coordinates"][1], axis=1
)
hg38["hg19_end"] = hg38.apply(
    lambda row: row["converted_coordinates"][3], axis=1
)

# Create new bed file with converted genomic coordinates
hg38.to_csv(
    ".".join([os.path.basename(os.path.splitext(target)[0]), "hg19", "bed"]),
    sep="\t",
    index=False,
    header=False,
    columns=["hg19_chromosome", "hg19_start", "hg19_end", "notation"],
)

unconverted_regions.close()
	import os
	import sys

	import pandas as pd

	from pyliftover import LiftOver


	target = sys.argv[1]

	lo = LiftOver("hg38", "hg19")
	unconverted_regions = open(
	".".join(
	[
	os.path.basename(os.path.splitext(target)[0]),
	"unconverted_regions",
	"bed",
	]
	),
	"w",
	)


	def get_hg19_coordinate(row):
	chr = row["chromosome"]
	start = row["start"]
	end = row["end"]
	try:
	return lo.convert_coordinate(chr, start, end)[0]
	except:
	unconverted_regions.write(
	"{}\t{}\t{}\n".format(chr, str(start), str(end))
	)
	return None


	def get_region_notation(row):
	notation = row["notation"]
	region_notation = "\|".join(notation.split("\|")[:4])
	return region_notation


	def add_region_count(row):
	notation = row["notation"]
	count = row["region_count"]
	return "{}\|{}".format(count, notation)


	hg38 = pd.read_csv(
	target,
	sep="\t",
	header=None,
	names=["chromosome", "start", "end", "notation"],
	skiprows=2,
	)


	# Create new column in dataframe for identifying coordinates in the same gene
	hg38["region_notation"] = hg38.apply(
	lambda row: get_region_notation(row), axis=1
	)

	# Create new column in dataframe for enumerating coordinates in the same gene
	hg38["region_count"] = hg38.groupby(["region_notation"]).cumcount()

	# Add numeration to the "notation" column
	hg38["notation"] = hg38.apply(lambda row: add_region_count(row), axis=1)

	# Use LiftOver to convert current genomic coordinates - store results
	# in the "converted_coordinates" column
	hg38["converted_coordinates"] = hg38.apply(
	lambda row: get_hg19_coordinate(row), axis=1
	)

	# Drop rows where the conversion could not be made
	hg38 = hg38[hg38.converted_coordinates.notnull()]

	# Split the "converted_coordinates" content into its individual columns
	hg38["hg19_chromosome"] = hg38.apply(
	lambda row: row["converted_coordinates"][0], axis=1
	)
	hg38["hg19_start"] = hg38.apply(
	lambda row: row["converted_coordinates"][1], axis=1
	)
	hg38["hg19_end"] = hg38.apply(
	lambda row: row["converted_coordinates"][3], axis=1
	)

	# Create new bed file with converted genomic coordinates
	hg38.to_csv(
	".".join([os.path.basename(os.path.splitext(target)[0]), "hg19", "bed"]),
	sep="\t",
	index=False,
	header=False,
	columns=["hg19_chromosome", "hg19_start", "hg19_end", "notation"],
	)

	unconverted_regions.close()