Skip to content

Instantly share code, notes, and snippets.

Last active December 27, 2015 12:29
Show Gist options
  • Save cbergman/7325650 to your computer and use it in GitHub Desktop.
Save cbergman/7325650 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
# Chunking parameters.  The trailing notes are the alternative values the
# author recorded next to each constant.
ovlpln = 10   # sequence lines shared by consecutive records (20; 2 for test)
chnkln = 100  # sequence lines per output record (200; 5 for test)


def _record_name(header, filename):
    """Derive the record name for one FASTA header line.

    Returns ``(name, own_file)``.  ``own_file`` is True only when the header
    carries no recognisable name and the input file's stem is used instead;
    the caller then writes to ``<stem>.fna.ppd``.

    Side effect: for '|'-delimited headers this prints the second
    '|'-separated field followed by "T. vaginalis case".
    """
    name = header.split()[0].split('|')[0].strip('>')
    # Remove a literal "chr" prefix.  str.lstrip('chr') would strip any run
    # of the characters c/h/r and mangle names such as "chrhap1".
    if name.startswith('chr'):
        name = name[3:]

    if name.isdigit() or name in ('X', 'Y'):
        return 'chr' + name, False
    if 'MT' in header or 'mito' in header:
        return 'chrMT', False
    if 'chr' in header or 'CHR' in header:
        return 'chr' + name, False
    if '|' in header:
        print(header.split('|')[1])
        print('T. vaginalis case')
        return name, False
    # Nothing usable in the header: name the record after the input file.
    return filename.split('/')[-1].split('.')[0], True


def _write_record(out, name, start, end, pieces):
    """Write one '<name>:<start>-<end>>' TAB '<sequence>' line to *out*."""
    out.write(name + ':' + str(start) + '-' + str(end) + '>\t'
              + ''.join(pieces) + '\n')


def _process_file(filename, chunk_lines, overlap_lines):
    """Split one FASTA file into overlapping, coordinate-labelled records.

    Each record holds up to *chunk_lines* sequence lines; every record after
    the first of a sequence begins with the text of the last *overlap_lines*
    lines of its predecessor.  Coordinates are 1-based and inclusive.
    Output goes to ``<filename>.ppd``, or to ``<stem>.fna.ppd`` in the
    current directory once a header without a usable name is seen.

    NOTE(review): the nesting of this logic was reconstructed from a copy of
    the script that had lost its indentation -- confirm against the
    author's original before relying on exact chunk boundaries.
    """
    print(filename)
    with open(filename, 'r') as fa_in:
        fa_out = open(filename + '.ppd', 'w')
        try:
            redirected = False      # already switched to <stem>.fna.ppd?
            name = ''
            pieces, tail = [], ''   # current record / text carried forward
            start = end = 0
            slot = 0                # line slot in the record; 0 = nothing pending
            fresh = True            # no record emitted yet for this sequence

            for line in fa_in:
                if line[0] == '>':
                    # New sequence: emit what is pending, then start clean so
                    # the previous sequence cannot leak into this one.
                    if slot != 0:
                        _write_record(fa_out, name, start, end, pieces)
                    pieces, tail = [], ''
                    start = end = slot = 0
                    fresh = True
                    name, own_file = _record_name(line, filename)
                    if own_file:
                        target = name + '.fna.ppd'
                        # Open the fallback file once per input; reopening
                        # with 'w' would truncate records already written.
                        if not redirected:
                            fa_out.close()
                            fa_out = open(target, 'w')
                            redirected = True
                        print('%s is being processed' % target)
                    continue

                seq = line.strip()
                if slot == 0:
                    if fresh:
                        start, end = 1, 0
                        fresh = False
                    else:
                        # Seed the new record with the carried-over tail and
                        # back the start coordinate up by its length.
                        pieces = [tail]
                        start = end - len(tail) + 1
                        slot = overlap_lines
                        tail = ''
                if slot >= chunk_lines - overlap_lines:
                    tail += seq     # this line is repeated in the next record
                pieces.append(seq)
                end += len(seq)
                if slot == chunk_lines - 1:
                    _write_record(fa_out, name, start, end, pieces)
                    slot = 0
                else:
                    slot += 1

            # Flush the last partial record.  Skipped when the input ended
            # exactly on a record boundary, which would repeat that record.
            if slot != 0:
                _write_record(fa_out, name, start, end, pieces)
        finally:
            fa_out.close()


def main(argv=None, chunk_lines=None, overlap_lines=None):
    """Process every FASTA file named in *argv*.

    argv          -- list of input paths; defaults to ``sys.argv[1:]``.
    chunk_lines   -- sequence lines per record; defaults to ``chnkln``.
    overlap_lines -- lines shared by consecutive records; defaults to
                     ``ovlpln``.

    Raises ValueError unless ``0 <= overlap_lines < chunk_lines``.
    """
    if argv is None:
        argv = sys.argv[1:]
    if chunk_lines is None:
        chunk_lines = chnkln
    if overlap_lines is None:
        overlap_lines = ovlpln
    if chunk_lines < 1 or not 0 <= overlap_lines < chunk_lines:
        raise ValueError('need 0 <= overlap_lines < chunk_lines')
    # This module can handle multiple files
    for filename in argv:
        _process_file(filename, chunk_lines, overlap_lines)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment