Skip to content

Instantly share code, notes, and snippets.

@geocarvalho
Last active July 26, 2023 17:06
Show Gist options
  • Save geocarvalho/9f92f49d5051b68514a3bc167736375e to your computer and use it in GitHub Desktop.
Save geocarvalho/9f92f49d5051b68514a3bc167736375e to your computer and use it in GitHub Desktop.
(up) need to add other columns
#!/usr/bin/python3
from pyliftover import LiftOver
import pandas as pd
import argparse
import mapply
import sys
import os
mapply.init(
n_workers=8,
chunk_size=100,
max_chunks_per_worker=8,
progressbar=False
)
def get_coordinate(row, lo):
""" Translate chr, start and end postions from hg19 to hg38 """
chr = row[0]
start = row[1]
end = row[2]
try:
# We make start and end separated
coord_start = lo.convert_coordinate(chr, start)[0]
coord_end = lo.convert_coordinate(chr, end)[0]
return [coord_start[0], coord_start[1], coord_start[2], coord_end[1]]
except:
return ["not available", 0, "-", 0]
def main(args):
""" Args function to run liftover from hg19 to hg38 """
bed = os.path.abspath(args.bed)
genome = args.genome
output_dir = os.path.abspath(args.output_dir)
if genome == "hg38":
lo = LiftOver('hg19', 'hg38')
else:
lo = LiftOver('hg38', 'hg19')
df = pd.read_csv(bed, header=None, sep="\t")
# Create new column with the region translated
df["region_notation"] = df.mapply(
lambda row: get_coordinate(row, lo), axis=1
)
# Split the "region_notation" content into its individual columns
df[f"{genome}_chr"] = df.apply(
lambda row: row["region_notation"][0], axis=1
)
df[f"{genome}_start"] = df.apply(
lambda row: row["region_notation"][1], axis=1
)
df[f"{genome}_end"] = df.apply(
lambda row: row["region_notation"][3], axis=1
)
# filter the rows that wasn't translated and that start is greater than end
coord_df = df[~df[f"{genome}_chr"].str.contains("not")]
filtered_df = coord_df[coord_df[f"{genome}_start"]<coord_df[f"{genome}end"]]
filtered_df = filtered_df.sort_values([f"{genome}_chr", f"{genome}_start", f"{genome}_end"])
del df, coord_df
# Create new bed file with converted genomic coordinates
filtered_df.to_csv(
bed.replace(".bed", f"_{genome}.bed"),
sep="\t",
index=False,
header=False,
columns=[f"{genome}_chr", f"{genome}_start", f"{genome}_end"]
)
if __name__ == "__main__":
""" Arguments for the main function """
parser = argparse.ArgumentParser(
description="Script to liftover BED", usage='''Usage:
python liftover_bed.py -b file.bed
''')
parser.add_argument("-b", "--bed", help="BED file", type=str, required=True)
parser.add_argument("-o", "--output_dir", help="Path to output directory", type=str)
parser.add_argument("-g", "--genome", help="Genome you want to translate to (default: hg38 [hg19 > hg38])", type=str, required=False, default="hg38")
parser.set_defaults(func=main)
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
args.func(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment