Skip to content

Instantly share code, notes, and snippets.

@geocarvalho
Last active May 23, 2017 12:35
Show Gist options
  • Save geocarvalho/ea1cc42b378a943c603bf418c13b6e5d to your computer and use it in GitHub Desktop.
Save geocarvalho/ea1cc42b378a943c603bf418c13b6e5d to your computer and use it in GitHub Desktop.
list_from_bed.py
import pandas as pd
import numpy as np
import os
import sys
def _output_prefix(bed_path):
    """Return the path prefix used to name the output file for *bed_path*.

    The directory part is kept untouched and the file name is cut at its
    first '.', so 'run.1/panel.v2.bed' yields 'run.1/panel'. Splitting only
    the file name (rather than the whole path) keeps dots in directory
    names, or a leading './', from truncating the path.
    """
    directory, filename = os.path.split(bed_path)
    return os.path.join(directory, filename.split('.')[0])


def write_gene_intervals(bed_file):
    """Write one 'Chr:Start-End' line per gene of *bed_file* to '<prefix>.list'.

    The first four tab-separated columns of *bed_file* are read as
    Chr, Start, End and Gene. The Gene value is cut at its first '.', and
    rows sharing the resulting name are merged into a single interval that
    runs from the smallest Start to the largest End. Genes are written in
    order of first appearance, and every written line is also printed.

    If a gene has rows on more than one chromosome, a message is printed
    and processing stops; intervals written before that point are kept.

    Returns the path of the '.list' file that was written.
    """
    header = ['Chr', 'Start', 'End', 'Gene']
    # NOTE(review): header=0 together with names= makes pandas treat the
    # first line of the file as a header row and replace it, so that line
    # never becomes data. Confirm the input BED files really start with a
    # header line; otherwise the first region is silently dropped.
    bed_df = pd.read_csv(bed_file, sep='\t', header=0, names=header,
                         usecols=[0, 1, 2, 3])
    # Keep only the text before the first '.' so that rows whose names
    # differ only after the dot are grouped under the same gene.
    bed_df['Gene'] = bed_df['Gene'].apply(lambda x: x.split('.')[0])

    list_path = '%s.list' % _output_prefix(bed_file)
    # 'with' guarantees the output is flushed and closed, including when
    # the loop stops early or an exception is raised.
    with open(list_path, 'w') as out:
        # sort=False keeps genes in order of first appearance in the file.
        for _, gene_rows in bed_df.groupby('Gene', sort=False):
            chromosomes = gene_rows['Chr'].unique()
            if len(chromosomes) != 1:
                print("Gene includes more than one chromossome, check the bed_file")
                break
            string_to_write = str(chromosomes[0]) + ":" \
                + str(gene_rows['Start'].min()) + "-" \
                + str(gene_rows['End'].max()) + "\n"
            out.write(string_to_write)
            print(string_to_write)
    return list_path


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s <bed_file>" % sys.argv[0])
    write_gene_intervals(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment