sstevens2/fasta-stats.py

## fasta-stats.py
#!/usr/bin/env python


# Usage: python fasta-stats.py <filename>
# This script takes a fasta file and returns a tab delimited file which gives
# the gc content and length for each sequence in the original fasta

## Import statements
import sys

## Functions

def calc_gc(seq):
	gc = seq.count('G') + seq.count('C')
	return gc / len(seq)

def read_fasta(filename):
	fasta = dict()
	with open(filename, 'r') as f:
		first = True
		seq = ''
		for line in f:
			line = line.rstrip('\n')
			if line.startswith('>'):
				if first == False:
					fasta[header] = seq
					seq = ''
				header = line
				first = False
			else:
				seq = seq+line
	return fasta

def usage():
	if len(sys.argv) != 2:
		print('Usage: python fasta-stats.py <filename>')
		exit(2)


## Main script

### Checking for the right number of arguments
usage()

### read fasta into dictionary
infile = sys.argv[1]
sequences = read_fasta(infile)

#### for each sequence print/write the header\tGC\tlength
outfile = infile+'.stats.tsv'
with open(outfile, 'w') as output:
	output.write('name\tGC\tlength\n')
	for name, sequence in sequences.items():
		output.write(name + '\t' + str(calc_gc(sequence)) + '\t' + str(len(sequence)) + '\n')
	#!/usr/bin/env python


	# Usage: python fasta-stats.py <filename>
	# This script takes a fasta file and returns a tab delimited file which gives
	# the gc content and length for each sequence in the original fasta

	## Import statements
	import sys

	## Functions

	def calc_gc(seq):
	gc = seq.count('G') + seq.count('C')
	return gc / len(seq)

	def read_fasta(filename):
	fasta = dict()
	with open(filename, 'r') as f:
	first = True
	seq = ''
	for line in f:
	line = line.rstrip('\n')
	if line.startswith('>'):
	if first == False:
	fasta[header] = seq
	seq = ''
	header = line
	first = False
	else:
	seq = seq+line
	return fasta

	def usage():
	if len(sys.argv) != 2:
	print('Usage: python fasta-stats.py <filename>')
	exit(2)


	## Main script

	### Checking for the right number of arguments
	usage()

	### read fasta into dictionary
	infile = sys.argv[1]
	sequences = read_fasta(infile)

	#### for each sequence print/write the header\tGC\tlength
	outfile = infile+'.stats.tsv'
	with open(outfile, 'w') as output:
	output.write('name\tGC\tlength\n')
	for name, sequence in sequences.items():
	output.write(name + '\t' + str(calc_gc(sequence)) + '\t' + str(len(sequence)) + '\n')