# zxexz/countBigram_v2.py

## countBigram_v2.py
#
# Author: Eric Marriott
# Date: 10 Feb. 2015
# This file is a much more concise and (in my opinion) prettier version of my original countBigram.py...I thought I improved the algorithm with zip() and a list comprehension, but it runs almost a whole second slower on human22.fasta!
#

# import the system library for file IO
import sys
# The first line of the fasta file is not relevant here, so it is read and
# discarded. The `with` block closes the file even if a read fails part-way.
with open(sys.argv[1], 'r') as f:
    f.readline()
    # read the rest of the file into a variable as uppercase, removing newlines
    a = f.read().upper().replace('\n', '')
# function for printing bigrams goes here
def printBigrams(c):
    """Print the bigram counts in c as a 4x4 tab-separated table.

    c must map every two-letter combination of "ATCG" to a value; a missing
    key raises KeyError. Each output row holds the four bigrams that share a
    first letter, each cell is "XY: value" followed by a tab, and the table
    is followed by one blank line. Returns None.
    """
    bases = "ATCG"
    rows = []
    for first in bases:
        cells = [first + second + ": " + str(c[first + second]) + '\t'
                 for second in bases]
        rows.append("".join(cells) + '\n')
    # A parenthesised single argument prints identically under Python 2's
    # print statement and Python 3's print() function.
    print("".join(rows))
### I disabled this and merged the lambda with the countBigrams function because it was silly, honestly (by that I mean it was cool but caused a non-marginal performance hit)
## I know lambda functions don't really go with the python spirit, but neither do I.  Overjoyed was I when I learned about python's (somewhat) decent FP capabilities...and list comprehensions!
## zip() is wonderful because it doesn't get all whiny when it reaches the end of a list, it just stops, getting rid of all those silly array indexing issues
## bigrams = lambda x: [y+z for (y, z) in zip(x, x[1::])]
###
# this is just a hash table ('dictionary') with a key for each possible bigram
possibles = {
    'AA': 0, 'AT': 0, 'AC': 0, 'AG': 0,
    'TA': 0, 'TT': 0, 'TC': 0, 'TG': 0,
    'CA': 0, 'CT': 0, 'CC': 0, 'CG': 0,
    'GA': 0, 'GT': 0, 'GC': 0, 'GG': 0 }

def countBigrams(c):
    """Tally every overlapping two-character window of c into `possibles`.

    c is a string (or other sliceable sequence of one-character strings).
    The module-level `possibles` table is updated in place and nothing is
    returned. Windows that are not keys of `possibles` are skipped rather
    than raising KeyError.
    """
    # zip() stops when the shorter argument runs out, so pairing the sequence
    # with itself shifted by one yields each adjacent pair exactly once
    # without any index arithmetic. Iterating it directly avoids building a
    # throwaway list holding every bigram string.
    for first, second in zip(c, c[1:]):
        pair = first + second
        # Skip anything that is not a pure A/T/C/G pair (e.g. 'N', a stray
        # '\r') instead of aborting the whole count.
        if pair in possibles:
            possibles[pair] += 1
# Tally the bigrams of the sequence held in `a` into the `possibles` table...
countBigrams(a)
# ...then print that table to standard output.
printBigrams(possibles)
	#
	# Author: Eric Marriott
	# Date: 10 Feb. 2015
	# This file is a much more concise and (in my opinion) prettier version of my original countBigram.py...I thought I improved the algorithm with zip() and a list comprehension, but it runs almost a whole second slower on human22.fasta!
	#

	# import the system library for file IO
	import sys
	# The first line of the fasta file is not relevant here, so it is read and
	# discarded. The `with` block closes the file even if a read fails part-way.
	with open(sys.argv[1], 'r') as f:
		f.readline()
		# read the rest of the file into a variable as uppercase, removing newlines
		a = f.read().upper().replace('\n', '')
	# function for printing bigrams goes here
	def printBigrams(c):
		"""Print the bigram counts in c as a 4x4 tab-separated table.

		c must map every two-letter combination of "ATCG" to a value; a missing
		key raises KeyError. Each output row holds the four bigrams that share a
		first letter, each cell is "XY: value" followed by a tab, and the table
		is followed by one blank line. Returns None.
		"""
		bases = "ATCG"
		rows = []
		for first in bases:
			cells = [first + second + ": " + str(c[first + second]) + '\t'
				for second in bases]
			rows.append("".join(cells) + '\n')
		# A parenthesised single argument prints identically under Python 2's
		# print statement and Python 3's print() function.
		print("".join(rows))
	### I disabled this and merged the lambda with the countBigrams function because it was silly, honestly (by that I mean it was cool but caused a non-marginal performance hit)
	## I know lambda functions don't really go with the python spirit, but neither do I. Overjoyed was I when I learned about python's (somewhat) decent FP capabilities...and list comprehensions!
	## zip() is wonderful because it doesn't get all whiny when it reaches the end of a list, it just stops, getting rid of all those silly array indexing issues
	## bigrams = lambda x: [y+z for (y, z) in zip(x, x[1::])]
	###
	# this is just a hash table ('dictionary') with a key for each possible bigram
	possibles = {
		'AA': 0, 'AT': 0, 'AC': 0, 'AG': 0,
		'TA': 0, 'TT': 0, 'TC': 0, 'TG': 0,
		'CA': 0, 'CT': 0, 'CC': 0, 'CG': 0,
		'GA': 0, 'GT': 0, 'GC': 0, 'GG': 0 }

	def countBigrams(c):
		"""Tally every overlapping two-character window of c into `possibles`.

		c is a string (or other sliceable sequence of one-character strings).
		The module-level `possibles` table is updated in place and nothing is
		returned. Windows that are not keys of `possibles` are skipped rather
		than raising KeyError.
		"""
		# zip() stops when the shorter argument runs out, so pairing the sequence
		# with itself shifted by one yields each adjacent pair exactly once
		# without any index arithmetic. Iterating it directly avoids building a
		# throwaway list holding every bigram string.
		for first, second in zip(c, c[1:]):
			pair = first + second
			# Skip anything that is not a pure A/T/C/G pair (e.g. 'N', a stray
			# '\r') instead of aborting the whole count.
			if pair in possibles:
				possibles[pair] += 1
	# Tally the bigrams of the sequence held in `a` into the `possibles` table...
	countBigrams(a)
	# ...then print that table to standard output.
	printBigrams(possibles)