Last active
April 25, 2019 21:56
-
-
Save sstevens2/2ed6ce9ddc83e051886e5650a5846d75 to your computer and use it in GitHub Desktop.
python675_script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Usage: python fasta-stats.py <filename> | |
# This script takes a fasta file and returns a tab delimited file which gives | |
# the gc content and length for each sequence in the original fasta | |
## Import statements | |
import sys | |
## Functions | |
def calc_gc(seq): | |
gc = seq.count('G') + seq.count('C') | |
return gc / len(seq) | |
def read_fasta(filename): | |
fasta = dict() | |
with open(filename, 'r') as f: | |
first = True | |
seq = '' | |
for line in f: | |
line = line.rstrip('\n') | |
if line.startswith('>'): | |
if first == False: | |
fasta[header] = seq | |
seq = '' | |
header = line | |
first = False | |
else: | |
seq = seq+line | |
return fasta | |
def usage(): | |
if len(sys.argv) != 2: | |
print('Usage: python fasta-stats.py <filename>') | |
exit(2) | |
## Main script | |
### Checking for the right number of arguments | |
usage() | |
### read fasta into dictionary | |
infile = sys.argv[1] | |
sequences = read_fasta(infile) | |
#### for each sequence print/write the header\tGC\tlength | |
outfile = infile+'.stats.tsv' | |
with open(outfile, 'w') as output: | |
output.write('name\tGC\tlength\n') | |
for name, sequence in sequences.items(): | |
output.write(name + '\t' + str(calc_gc(sequence)) + '\t' + str(len(sequence)) + '\n') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment