Skip to content

Instantly share code, notes, and snippets.

@gardiner
Created January 7, 2011 10:52
Show Gist options
  • Save gardiner/769353 to your computer and use it in GitHub Desktop.
Save gardiner/769353 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
from BCBio import GFF
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pickle
import string
from pprint import pprint
def split_file(source='TAIR9_genome_matrix_2010_10_18.txt', out_dir='.'):
    """Split the combined genome matrix into one file per chromosome.

    Each input line is tab-separated: the first column is the chromosome
    name, the second is the (1-based) position, and the remaining columns
    are the per-ecotype values. For every chromosome ``<name>`` a file
    ``<name>.txt`` is written in which line *k* corresponds to position
    *k*: the first two columns are dropped and the remaining columns are
    concatenated without tabs. Positions missing from the input are
    filled with rows of 81 ``-`` characters so that line numbers keep
    matching positions.

    Progress messages are printed to stdout whenever a new chromosome
    starts or a gap is filled.

    Args:
        source: Path of the combined matrix file to read. Defaults to the
            original hard-coded file name.
        out_dir: Directory in which the per-chromosome files are created.
            Defaults to the current directory.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
        ValueError: If the position column is not an integer.
    """
    # Local import: the module does not import itertools at file level.
    from itertools import repeat

    gap_row = '-' * 81 + '\n'  # placeholder row for a missing position
    out = None                 # output handle of the current chromosome
    current = None             # name of the chromosome being written
    count = 1                  # next position expected in the output file

    try:
        with open(source) as src:
            for line in src:
                parts = line.strip().split('\t')
                if parts == ['']:
                    # Blank line (e.g. trailing newline at EOF): nothing
                    # to split, and popping two columns would fail.
                    continue
                c = parts.pop(0)
                n = int(parts.pop(0))

                if c != current:
                    # A new chromosome starts: finish the previous file
                    # and restart position counting.
                    if out is not None:
                        out.close()
                    print('Next Chromosome %s %s %s' % (c, n, count - 1))
                    out = open(os.path.join(out_dir, '%s.txt' % c), 'w')
                    current = c
                    count = 1

                if n != count:
                    # Positions were skipped in the input; pad so that
                    # the output line number stays equal to the position.
                    diff = n - count
                    print('Inserting %s empty rows after %i'
                          % (diff, count - 1))
                    # repeat() is lazy, so large gaps do not build a list;
                    # a non-positive diff writes nothing.
                    out.writelines(repeat(gap_row, diff))
                    count += diff

                out.write(''.join(parts) + '\n')
                count += 1
    finally:
        # Make sure the last chromosome file is flushed and closed even
        # if parsing fails part-way through.
        if out is not None:
            out.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment