Skip to content

Instantly share code, notes, and snippets.

@sstevens2
Last active April 25, 2019 21:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sstevens2/2ed6ce9ddc83e051886e5650a5846d75 to your computer and use it in GitHub Desktop.
Save sstevens2/2ed6ce9ddc83e051886e5650a5846d75 to your computer and use it in GitHub Desktop.
python675_script
#!/usr/bin/env python
# Usage: python fasta-stats.py <filename>
# This script takes a fasta file and returns a tab delimited file which gives
# the gc content and length for each sequence in the original fasta
## Import statements
import sys
## Functions
def calc_gc(seq):
gc = seq.count('G') + seq.count('C')
return gc / len(seq)
def read_fasta(filename):
fasta = dict()
with open(filename, 'r') as f:
first = True
seq = ''
for line in f:
line = line.rstrip('\n')
if line.startswith('>'):
if first == False:
fasta[header] = seq
seq = ''
header = line
first = False
else:
seq = seq+line
return fasta
def usage():
if len(sys.argv) != 2:
print('Usage: python fasta-stats.py <filename>')
exit(2)
## Main script
### Checking for the right number of arguments
usage()
### read fasta into dictionary
infile = sys.argv[1]
sequences = read_fasta(infile)
#### for each sequence print/write the header\tGC\tlength
outfile = infile+'.stats.tsv'
with open(outfile, 'w') as output:
output.write('name\tGC\tlength\n')
for name, sequence in sequences.items():
output.write(name + '\t' + str(calc_gc(sequence)) + '\t' + str(len(sequence)) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment