Skip to content

Instantly share code, notes, and snippets.

@lukauskas
Last active September 3, 2020 15:17
Show Gist options
  • Save lukauskas/1e4a8a49309a707c200ccb2e2ab24df6 to your computer and use it in GitHub Desktop.
Save lukauskas/1e4a8a49309a707c200ccb2e2ab24df6 to your computer and use it in GitHub Desktop.
Convert gene/transcript bed file to a TSS bed file using `pandas`
import pandas as pd
def transcript_to_tss(transcript_bed, tss_bed):
    """
    Convert a transcript BED file to a strand-aware TSS BED file.

    Every input interval is replaced by the single-base interval covering
    its transcription start site (TSS): the first base of the interval for
    ``+`` strand records and the last base for ``-`` strand records.

    :param transcript_bed: path (or file-like object) of a tab-separated,
        headerless BED file whose first six columns are chrom, start, end,
        name, score and strand. Any further columns are ignored.
    :param tss_bed: path (or file-like object) the six-column TSS BED file
        is written to, sorted by chrom, start and end.
    :raises ValueError: if any record has a strand other than ``+`` or ``-``,
        because the TSS of such a record is undefined.
    """
    columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

    # Read bedfile to pandas. Restrict to the first six columns explicitly:
    # without `usecols`, a file with more columns than `names` makes pandas
    # silently use the leading columns as an index and mislabel the rest.
    df = pd.read_csv(transcript_bed,
                     sep='\t',
                     names=columns,
                     usecols=list(range(len(columns))))

    on_plus = df['strand'] == '+'
    on_minus = df['strand'] == '-'

    # Fail early with a readable message instead of an opaque conversion
    # error further down when a record is unstranded or malformed.
    unsupported = ~(on_plus | on_minus)
    if unsupported.any():
        bad_values = sorted(df.loc[unsupported, 'strand'].astype(str).unique())
        raise ValueError(
            'Cannot determine TSS for records with strand(s) {}; '
            "expected '+' or '-'".format(bad_values)
        )

    # Positive strand transcripts start at `start`. Negative strand
    # transcripts start at position -1 from `end`, as the end coordinate
    # in bed is not inclusive.
    df['start'] = df['start'].where(on_plus, df['end'] - 1).astype(int)
    # Regardless of direction, "end" of TSS is start + 1 as length=1
    df['end'] = df['start'] + 1

    # Re-sort and save to bed file again.
    df = df[columns].sort_values(by=['chrom', 'start', 'end'])
    df.to_csv(tss_bed, sep='\t', header=False, index=False)
if __name__ == '__main__':
    # Script entry point: read 'transcripts.bed' from the working directory
    # and write the corresponding TSS intervals to 'tss.bed'.
    transcript_to_tss(transcript_bed='transcripts.bed', tss_bed='tss.bed')
chr5 126423409 126494364 PlusStrand 1 +
chr5 126531200 126595219 MinusStrand 1 -
chr5 126423409 126423410 PlusStrand 1 +
chr5 126595218 126595219 MinusStrand 1 -
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment