jonhurlock/count_occurances_in_stream.py

## count_occurances_in_stream.py
"""
Imagine you have a stream of data.
As long as the stream keeps streaming you must keep reading the data.
You can only read in x bytes of data at a time.

You must count all occurances of a search term y in the stream until
the stream stops.

Edge cases:
search term is empty
search term is larger than the number of bytes currently being read.
need to account for partial matches
multiple matches - stream: "aaa" term: "aa", how many times does aa appear?
do we need to hand "aa\na" as a match for "aa"?
do you want case and white space sensitivity?

Explanation:
I used a file as my stream of data, where consume_stream, reads in a file
'stream_source', we can specify x bytes by using byte_size, and the search
term y.

It will print out found n occurences of the term y.
and return the number of matches as an integer.

To run
>>> print consume_stream('mytest.txt',1,'jon')

I have purposesly not used 'str.count()' or 'in' or regex's
as i wanted to use arrays only. I have used slicing, however,
this i feel can quickly be achieved in other langues.
see: http://bit.ly/1KOKGh9 for a quick explanation of slicing
"""

# we've set this as a global variable
previous_buffer_data = ""

def consume_stream(stream_source, byte_size, search_term):
	global stream_size
	occurences = 0
	# we are using a file here just for testing rather
	# than using an actual 'stream' of data.
	f = open(stream_source, "rb")
	try:
		byte = f.read(byte_size)
		occurences += read_stream_buffer(byte,search_term)
		while(byte != ""):
			byte = f.read(byte_size)
			occurences += read_stream_buffer(byte,search_term)
	finally:
			f.close()
			print 'Found %d occurences of the term \'%s\' found in the stream'%(occurences,search_term)
			return occurences


def read_stream_buffer(buffer_data, search_term):
	occurences = 0
	# if the stream has not ended,
	if (len(buffer_data) != 0):
		# if there was a partial match in the previous buffers
		if len(previous_buffer_data) != 0:
			occurences = search_for_term(previous_buffer_data+buffer_data, search_term)
		else:
			occurences = search_for_term(buffer_data, search_term)
		return occurences
	# if the stream has ended return the number
	# of occurences of the string we found in the stream
	else:
		return occurences


def search_for_term(content_to_search, search_term):
	#print "searching:%s"%content_to_search
	length_of_content = len(content_to_search)
	length_of_search_term = len(search_term)
	# we need access to previous_buffer incase we
	# have a partial match at the end of the buffer
	global previous_buffer_data
	# lets just make sure its empty as we've already
	# passed the previous buffer throuvh via
	# the content_to_search variable
	previous_buffer_data = ""
	# how many matches did we find within this buffer
	occurances = 0
	# if the search term is longer than the
	# content append the context to the previous_buffer_data
	if length_of_search_term > length_of_content:
		previous_buffer_data = content_to_search
	# there should be enough data to search through
	else:
		# counter is like a pivot for the content_to_search
		# we've searched everything to the left of it, if
		# we find a match, we then move the pivot to the end
		# of the match, this stop having sub matches such as
		# how many times does 'aa' occur in 'aaa', should be
		# once, not twice.
		counter = 0
		# try looping through each letter in content
		while counter < length_of_content:
			# character we are trying to match with
			# with the first item of the search string
			c = content_to_search[counter]
			# this is used for tracking the progress of
			# the search with the current pivot point
			matched_letters_from_pivot = 0
			# if the letter matches the first letter in the search term
			if (c == search_term[0]):
				# now for each letter in the search term
				for i in range(0,length_of_search_term):
					# if search term length is still within content length
					if counter+i<length_of_content:
						# if characters match, plus matched items
						if content_to_search[counter+i]==search_term[i]:
							matched_letters_from_pivot+=1
							#print 'match so far'
							# if number of matched characters equals the
							# length of search term, its a full match
							if matched_letters_from_pivot == length_of_search_term:
								occurances +=1
								#print 'match found in:',content_to_search
								# now move the counter to the end of the match
								previous_buffer_data = ""
								counter = counter+length_of_search_term-1
					# if the counter postion + search term length exceeds
					# the content add the content to previous buffer data
					else:
						previous_buffer_data = content_to_search[counter:]
						return occurances
			counter +=1
	return occurances
	"""
	Imagine you have a stream of data.
	As long as the stream keeps streaming you must keep reading the data.
	You can only read in x bytes of data at a time.

	You must count all occurances of a search term y in the stream until
	the stream stops.

	Edge cases:
	search term is empty
	search term is larger than the number of bytes currently being read.
	need to account for partial matches
	multiple matches - stream: "aaa" term: "aa", how many times does aa appear?
	do we need to hand "aa\na" as a match for "aa"?
	do you want case and white space sensitivity?

	Explanation:
	I used a file as my stream of data, where consume_stream, reads in a file
	'stream_source', we can specify x bytes by using byte_size, and the search
	term y.

	It will print out found n occurences of the term y.
	and return the number of matches as an integer.

	To run
	>>> print consume_stream('mytest.txt',1,'jon')

	I have purposesly not used 'str.count()' or 'in' or regex's
	as i wanted to use arrays only. I have used slicing, however,
	this i feel can quickly be achieved in other langues.
	see: http://bit.ly/1KOKGh9 for a quick explanation of slicing
	"""

	# we've set this as a global variable
	previous_buffer_data = ""

	def consume_stream(stream_source, byte_size, search_term):
	global stream_size
	occurences = 0
	# we are using a file here just for testing rather
	# than using an actual 'stream' of data.
	f = open(stream_source, "rb")
	try:
	byte = f.read(byte_size)
	occurences += read_stream_buffer(byte,search_term)
	while(byte != ""):
	byte = f.read(byte_size)
	occurences += read_stream_buffer(byte,search_term)
	finally:
	f.close()
	print 'Found %d occurences of the term \'%s\' found in the stream'%(occurences,search_term)
	return occurences


	def read_stream_buffer(buffer_data, search_term):
	occurences = 0
	# if the stream has not ended,
	if (len(buffer_data) != 0):
	# if there was a partial match in the previous buffers
	if len(previous_buffer_data) != 0:
	occurences = search_for_term(previous_buffer_data+buffer_data, search_term)
	else:
	occurences = search_for_term(buffer_data, search_term)
	return occurences
	# if the stream has ended return the number
	# of occurences of the string we found in the stream
	else:
	return occurences


	def search_for_term(content_to_search, search_term):
	#print "searching:%s"%content_to_search
	length_of_content = len(content_to_search)
	length_of_search_term = len(search_term)
	# we need access to previous_buffer incase we
	# have a partial match at the end of the buffer
	global previous_buffer_data
	# lets just make sure its empty as we've already
	# passed the previous buffer throuvh via
	# the content_to_search variable
	previous_buffer_data = ""
	# how many matches did we find within this buffer
	occurances = 0
	# if the search term is longer than the
	# content append the context to the previous_buffer_data
	if length_of_search_term > length_of_content:
	previous_buffer_data = content_to_search
	# there should be enough data to search through
	else:
	# counter is like a pivot for the content_to_search
	# we've searched everything to the left of it, if
	# we find a match, we then move the pivot to the end
	# of the match, this stop having sub matches such as
	# how many times does 'aa' occur in 'aaa', should be
	# once, not twice.
	counter = 0
	# try looping through each letter in content
	while counter < length_of_content:
	# character we are trying to match with
	# with the first item of the search string
	c = content_to_search[counter]
	# this is used for tracking the progress of
	# the search with the current pivot point
	matched_letters_from_pivot = 0
	# if the letter matches the first letter in the search term
	if (c == search_term[0]):
	# now for each letter in the search term
	for i in range(0,length_of_search_term):
	# if search term length is still within content length
	if counter+i<length_of_content:
	# if characters match, plus matched items
	if content_to_search[counter+i]==search_term[i]:
	matched_letters_from_pivot+=1
	#print 'match so far'
	# if number of matched characters equals the
	# length of search term, its a full match
	if matched_letters_from_pivot == length_of_search_term:
	occurances +=1
	#print 'match found in:',content_to_search
	# now move the counter to the end of the match
	previous_buffer_data = ""
	counter = counter+length_of_search_term-1
	# if the counter postion + search term length exceeds
	# the content add the content to previous buffer data
	else:
	previous_buffer_data = content_to_search[counter:]
	return occurances
	counter +=1
	return occurances