Skip to content

Instantly share code, notes, and snippets.

@geocarvalho
Last active March 4, 2021 04:14
Show Gist options
  • Save geocarvalho/8ce0179d4a503b8639199f34d15675eb to your computer and use it in GitHub Desktop.
Save geocarvalho/8ce0179d4a503b8639199f34d15675eb to your computer and use it in GitHub Desktop.
import pandas as pd
# Collapse a six-column, tab-separated BED file into one "chrom:start-end"
# region per gene and write the regions, ordered by chromosome and position,
# to "input.list" (one region per line, no header).
file = "input.bed"
df = pd.read_csv(
    file,
    sep="\t",
    names=["chr", "start", "end", "interval", "score", "strand"],
    # Force text so chromosome names without a "chr" prefix (e.g. "1", "2")
    # are not parsed as integers, which would break the string handling below.
    dtype={"chr": str},
)

# The gene name is whatever precedes the first "_" in the interval name.
# Taking element 0 of the split (instead of expanding into two columns) also
# works when a name has no "_"; `n` is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally.
df["gene"] = df["interval"].str.split("_", n=1).str[0]
df.drop(["interval", "score", "strand"], axis=1, inplace=True)

# One row per gene: the chromosome of its first interval plus the smallest
# start and largest end across all of its intervals. Aggregations are named
# by string because passing the builtins min/max is deprecated in pandas.
new_df = df.groupby("gene").agg({"chr": "first", "start": "min", "end": "max"})
new_df.reset_index(inplace=True)

# Regions are written without the "chr" prefix; X and Y keep their letters in
# the output text.
new_df["chr"] = new_df["chr"].str.replace("chr", "", regex=False)
new_df["list"] = (
    new_df["chr"]
    + ":"
    + new_df["start"].astype("str")
    + "-"
    + new_df["end"].astype("str")
)

# Turn the chromosome into an integer sort key (X -> 23, Y -> 24) so rows
# sort numerically instead of lexically. Any other non-numeric chromosome
# name makes the int conversion raise ValueError.
new_df["chr"] = new_df["chr"].str.replace("X", "23", regex=False)
new_df["chr"] = new_df["chr"].str.replace("Y", "24", regex=False)
new_df["chr"] = new_df["chr"].astype(int)
new_df.sort_values(by=["chr", "start", "end"], inplace=True)

new_df["list"].to_csv("input.list", index=False, header=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment