Skip to content

Instantly share code, notes, and snippets.

@gardiner
Created January 7, 2011 10:50
Show Gist options
  • Save gardiner/769351 to your computer and use it in GitHub Desktop.
Save gardiner/769351 to your computer and use it in GitHub Desktop.
from BCBio import GFF
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pickle
import string
from pprint import pprint
def split_file(source='TAIR9_genome_matrix_2010_10_18.txt', ecotypes=81,
               out_dir=None):
    """Split the combined genome matrix into one file per chromosome.

    Takes the 20g file with all 5 chromosomes of all 81 ecotypes and
    splits it into separate files (one for each chromosome).  Each input
    line is tab-separated: the first column is the chromosome name, the
    second is the 1-based position, and the remaining columns are
    concatenated (tabs removed) into one output row.

    A new output file named ``<chromosome>.txt`` is started every time
    the first column changes, so the input is assumed to be grouped by
    chromosome; a chromosome that reappears later would overwrite its
    earlier file.  Positions missing from the input are filled with rows
    of ``'-'`` so that row number equals position in every output file.

    Args:
        source: Path of the tab-separated matrix file to read.
        ecotypes: Number of data columns; sets the width of the filler
            rows written for missing positions.
        out_dir: Directory for the per-chromosome files.  ``None`` (the
            default) writes them relative to the current directory.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
        ValueError: If the position column is not an integer.
    """
    from itertools import repeat

    filler = '-' * ecotypes + '\n'
    out = None
    current = None
    count = 1  # position the next row of the current chromosome should have
    try:
        with open(source) as matrix:
            for line in matrix:
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                parts = stripped.split('\t')
                chromosome = parts.pop(0)
                position = int(parts.pop(0).strip())

                if chromosome != current:
                    if out is not None:
                        out.close()
                    # Single-argument print works on Python 2 and 3 alike.
                    print('Next Chromosome %s %s %s'
                          % (chromosome, position, count - 1))
                    target = '%s.txt' % chromosome
                    if out_dir is not None:
                        target = os.path.join(out_dir, target)
                    out = open(target, 'w')
                    current = chromosome
                    count = 1

                if position != count:
                    gap = position - count
                    print('Inserting %s empty rows after %i'
                          % (gap, count - 1))
                    count += gap
                    # A negative gap (out-of-order position) writes nothing;
                    # repeat() yields no items for a count below one.
                    out.writelines(repeat(filler, gap))

                out.write(''.join(parts) + '\n')
                count += 1
    finally:
        # Close the last chromosome's file even if parsing failed midway.
        if out is not None:
            out.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment