A simple introduction to Apache Spark and PySpark: calculate the GC content of the sequences in a FASTA file.
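For concreteness, here is a tiny FASTA input with one sequence per line (hypothetical file contents, not part of the gist). With this input the script below computes [50.0, 25.0]: 4 of 8 bases in seq1 and 2 of 8 bases in seq2 are G or C.

>seq1
GGCCAATT
>seq2
ATATATGC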
"""GC_calc.py"""
import sys
from pyspark import SparkContext
import re
# sc.textFile reads the FASTA file line by line, so this script assumes each
# sequence sits on a single line (a sketch handling multi-line records follows the script)
fastaFile = sys.argv[1]
sc = SparkContext(appName="GC calc")  # create the Spark context (main entry point for Spark functionality)
fastaData = sc.textFile(fastaFile)  # creates a Resilient Distributed Dataset (RDD) from the text file
# Intermediate steps, kept commented out for reference:
# names = fastaData.filter(lambda x: x.startswith(">")).map(lambda x: x.strip(">"))
# sequence_length = fastaData.filter(lambda x: not x.startswith(">")).map(lambda x: len(x))
# GC_length = fastaData.filter(lambda x: not x.startswith(">")).map(lambda x: len(re.findall("[GC]", x)))
# GC content per sequence: count G and C bases, divide by sequence length, scale to percent
GC_content = fastaData.filter(lambda x: not x.startswith(">") and len(x) > 0) \
    .map(lambda x: (len(re.findall("[GC]", x.upper())), len(x))) \
    .map(lambda x: 100.0 * float(x[0]) / float(x[1])) \
    .collect()

print(GC_content)
sc.stop()
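To run the script, an invocation like spark-submit GC_calc.py example.fasta should work (the file name is an example, not from the gist).

Because sc.textFile splits on newlines, sequences that wrap across several lines would be scored line by line rather than per record. Below is a minimal sketch of one way to handle multi-line FASTA, assuming the file is small enough to read as whole text; wholeTextFiles is a real SparkContext method, but the record parsing, file name, and app name here are illustrative, not part of the original gist.

"""GC_calc_multiline.py"""
import sys
import re
from pyspark import SparkContext

fastaFile = sys.argv[1]
sc = SparkContext(appName="GC calc multi-line")
# Read the whole file, then split on ">" so each element is one FASTA record
records = sc.wholeTextFiles(fastaFile).values() \
    .flatMap(lambda content: content.split(">")[1:])
# Drop the header line, join the remaining lines into one sequence, then score it
GC_content = records.map(lambda rec: "".join(rec.splitlines()[1:]).upper()) \
    .filter(lambda seq: len(seq) > 0) \
    .map(lambda seq: 100.0 * len(re.findall("[GC]", seq)) / len(seq)) \
    .collect()
print(GC_content)
sc.stop()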