Skip to content

Instantly share code, notes, and snippets.

@wchan2
Last active August 29, 2015 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wchan2/02edb65d7d2e722be594 to your computer and use it in GitHub Desktop.
Save wchan2/02edb65d7d2e722be594 to your computer and use it in GitHub Desktop.
Word Frequency Search
*.pyc
.DS_STORE
records.txt
queries.txt

Word Frequency

Consider a list of sets of words (each set of words is called a record) and a single set of words (called the query). For each word that is not in the query, how many times does the word appear in all records that the entire query is present in? Output a dictionary of words to counts, omitting words with counts of zero. Given a list of records and a list of queries, determine the output for each query with respect to the entire list of records.

Example

The following are examples of a records file and queries file, and the expected output from your solution.

records.txt

red,yellow,green,black
red,green,blue,black
yellow,green,blue
yellow,blue,black

queries.txt

blue,yellow
black,green

expected output

{"black": 1, "green": 1}
{"blue": 1, "red": 2, "yellow": 1}

Challenge

Download and unarchive word_frequency.zip from the following link: https://outbrain.box.com/shared/static/ti3rxupnk6x4jj6d0a6bgkbs804to5om.zip

Inside there are two files, records.txt and queries.txt. The records file contains a large list of sets of words to be used as training data. The queries file contains individual sets of words to be interpreted as queries against all of the records.

  • Each file consists of ASCII-encoded text.
  • Each line of the files contains a set of comma-separated words.
  • Aside from the separating commas, there is no other punctuation in the files.
  • None of the words contain whitespace.
  • Words are case-sensitive.
  • No word appears more than once in the same line.
  • For each query, use the following function to output your result dictionary:

from json import dumps
def output(dictionary):    
    print(dumps(dictionary, sort_keys=True))

Your solution should be implemented in either Python or Java.

from collections import defaultdict
from itertools import combinations
from multiprocessing import Pool
import time
import math
import json
class Index:
    """Abstract base for record indexes.

    Subclasses must implement `index` (register a record) and `search`
    (return the list of records matching a query).
    """

    def index(self, record):
        """Register a record (a tuple of words) with the index."""
        raise Exception('#index not implemented for Index sub-type')

    def search(self, query):
        """Return all records matching the query (a tuple of words)."""
        # Bug fix: the original message said '#match', but this method is
        # named 'search' -- the error would have pointed at the wrong method.
        raise Exception('#search not implemented for Index sub-type')
class CombinationIndex(Index):
    """Index records under every sorted combination of their words.

    Indexing a record of n words stores 2**n - 1 keys, making each lookup
    O(1) at the cost of exponential index size.
    """

    def __init__(self):
        # normalized key tuple -> list of records containing all key words
        self.indices = defaultdict(list)

    def index(self, record):
        """Register `record` under every non-empty combination of its words."""
        sorted_record = sorted(record)
        for size in range(1, len(sorted_record) + 1):
            for key in combinations(sorted_record, size):
                self.indices[self._normalize(key)].append(record)

    def search(self, query):
        """Return every indexed record that contains all words of `query`."""
        # Bug fix: indexing the defaultdict directly inserts an empty list on
        # every miss, so repeated unmatched queries would grow the index
        # unboundedly.  .get() returns the same [] without mutating it.
        return self.indices.get(self._normalize(query), [])

    def _normalize(self, datum):
        # Canonical key form: lowercase every word, then sort.
        return tuple(sorted(piece.lower() for piece in datum))
class Data:
    """Holds every indexed record and delegates lookups to an indexer.

    An empty query matches everything, so it is answered from the raw
    record list without consulting the indexer.
    """

    def __init__(self, indexer):
        self.indexer = indexer
        self.data = []

    def index(self, datum):
        """Remember `datum` and register it with the indexer."""
        self.data.append(datum)
        self.indexer.index(datum)

    def search(self, query):
        """Return the records matching `query` (all records if query is empty)."""
        if not query:
            return self.data
        return self.indexer.search(query)
def output(dictionary):
    """Print `dictionary` as JSON with keys in sorted order."""
    serialized = json.dumps(dictionary, sort_keys=True)
    print(serialized)
def time_it(fn):
    """Run the zero-argument callable `fn` and print its wall-clock duration."""
    started_at = time.time()
    fn()
    print('time elapsed: %f' % (time.time() - started_at))
def count_words(results):
    """Return {word: occurrence count} across every row in `results`."""
    counts = {}
    for row in results:
        for word in row:
            counts[word] = counts.get(word, 0) + 1
    return counts
def sequential_execute():
    """Index records.txt, then answer each query in queries.txt in order.

    For every query, prints a JSON dict of non-query words to the number of
    times they appear across all records containing the entire query.
    """
    data = Data(CombinationIndex())
    with open('records.txt', 'r') as records:
        for line in records:
            line = line.strip()
            if line:  # skip blank/trailing lines
                data.index(tuple(line.split(',')))
    with open('queries.txt', 'r') as queries:
        for raw_query in queries:
            # Bug fix: the original split the raw line, so the last query
            # word carried a trailing '\n' and never matched the index.
            query = tuple(raw_query.strip().split(','))
            results = data.search(query)
            word_count = count_words(results)
            # Query words appear in every matching record; drop them from the
            # output.  pop() tolerates the empty-result case, where `del`
            # would raise KeyError.
            for word in query:
                word_count.pop(word, None)
            # Bug fix: the original called `outputs`, an undefined name.
            output(word_count)
def parallel_execute(process_count):
    """Index records.txt across `process_count` workers, then answer queries.

    Each worker builds a partial index over a batch of record lines; every
    query is answered by merging the partial results.
    """
    pool = Pool(process_count)
    with open('records.txt', 'r') as records:
        lines = [line for line in records.read().split('\n') if line]
    # Bug fix: the original passed individual line *strings* to index_data
    # (chunksize only batches delivery, not arguments), so index_data
    # iterated characters instead of lines.  Build explicit batches so each
    # worker receives a list of lines.
    batch_size = math.ceil(len(lines) / process_count)
    line_batches = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    datasets = pool.map(index_data, line_batches)
    with open('queries.txt', 'r') as queries:
        for raw_query in queries:
            # strip the trailing newline so the last word matches the index
            query = tuple(raw_query.strip().split(','))
            results = []
            for data in datasets:
                results.extend(data.search(query))
            word_count = count_words(results)
            # pop() tolerates empty result sets where `del` raises KeyError
            for word in query:
                word_count.pop(word, None)
            # Bug fix: the original called `outputs`, an undefined name.
            output(word_count)
def index_data(lines):
    """Build and return a combination-indexed Data set from `lines`.

    Each element of `lines` is a comma-separated record string.
    """
    dataset = Data(CombinationIndex())
    for record_line in lines:
        record = tuple(record_line.split(','))
        dataset.index(record)
    return dataset
if __name__ == '__main__':
    # Benchmark the parallel implementation with 4 worker processes; swap the
    # commented line in to time the sequential version instead.
    time_it(lambda: parallel_execute(4))
    # time_it(sequential_execute)
from multiprocessing import Process, Queue, cpu_count
from itertools import combinations
from collections import defaultdict
from word_frequency import count_words, output
import math
class Record:
    """A single record: a tuple of words that queries are matched against."""

    def __init__(self, data):
        # the record's words, kept exactly as given (case preserved)
        self.data = data

    def match(self, query):
        """Return True if the record contains every query word.

        Matching is case-insensitive and respects duplicates: a query
        containing a word twice only matches records holding it at least
        twice.  Equivalent to the original combinations() membership test
        (sorted combinations of the sorted data are exactly its sorted
        sub-multisets), but runs in O(n) instead of exponential time.
        """
        available = defaultdict(int)
        for word in self.data:
            available[word.lower()] += 1
        for word in query:
            key = word.lower()
            available[key] -= 1
            if available[key] < 0:  # query needs more copies than we have
                return False
        return True

    def _normalize(self, query):
        # Canonical, case-insensitive form of a word collection.
        return tuple(sorted([piece.lower() for piece in query]))
class SearchWorker(Process):
    """Daemon process that matches queued queries against a batch of records.

    Consumes queries from `search_requests` until a None sentinel arrives.
    For each query it puts {query_tuple: [matching record data]} onto
    `search_results`, then a final None to signal it has finished.
    """

    def __init__(self, record_batch, search_requests, search_results):
        super(SearchWorker, self).__init__()
        self.daemon = True
        self.record_batch = record_batch
        self.search_requests = search_requests
        self.search_results = search_results

    def run(self):
        while True:
            query = self.search_requests.get()
            if query is None:  # sentinel: no more queries
                break
            key = tuple(query)
            hits = [record.data for record in self.record_batch if record.match(key)]
            self.search_results.put({key: hits})
        self.search_results.put(None)
def batches(elements, batch_count):
    """Split `elements` into at most `batch_count` contiguous batches.

    Every batch has the same size except possibly the last.  Returns [] for
    empty input.
    """
    if not elements:
        # guard: ceil(0 / n) == 0 would make range() step by zero below
        return []
    # Bug fix: the original computed the size from the *global* `rows`
    # instead of the `elements` parameter.
    batch_size = math.ceil(len(elements) / batch_count)
    return [elements[i:i + batch_size] for i in range(0, len(elements), batch_size)]
if __name__ == '__main__':
    # Load every record and every query.
    with open('records.txt', 'r') as records:
        rows = [Record(tuple(record.strip().split(','))) for record in records]
    with open('queries.txt', 'r') as queries:
        # filter out blank lines so a trailing newline is not searched as ('',)
        lines = [line for line in queries.read().split('\n') if line]

    # Create the result queue, one request queue per CPU, and one worker per
    # record batch (there may be fewer batches than CPUs).
    search_results = Queue()
    search_queues = [Queue() for _ in range(cpu_count())]
    record_batches = batches(rows, cpu_count())
    processes = [SearchWorker(batch, search_queues[i], search_results)
                 for (i, batch) in enumerate(record_batches)]
    for process in processes:
        process.start()

    # Broadcast every query to every worker, then a None sentinel to each.
    for query in lines:
        for search in search_queues:
            search.put(query.split(','))
    for search in search_queues:
        search.put(None)

    # Reduce: each worker emits one None sentinel when done.  Bug fix: the
    # original used iter(search_results.get, None), which stops at the FIRST
    # sentinel -- results from slower workers arriving after a fast worker's
    # sentinel were silently dropped.  Count sentinels instead.
    results = defaultdict(list)
    finished = 0
    while finished < len(processes):
        result = search_results.get()
        if result is None:
            finished += 1
            continue
        for key in result:
            if len(result[key]) > 0:
                results[key].extend(result[key])
    search_results.close()

    # All sentinels received, so every worker is idle; terminate them.
    for process in processes:
        process.terminate()

    # Remove the query's own words from each count and print the result.
    for key in results:
        word_counts = count_words(results[key])
        for word in key:
            word_counts.pop(word, None)
        output(word_counts)
from multiprocessing import Queue
from unittest.mock import MagicMock
from word_frequency_2 import Record, SearchWorker
import unittest
class TestRecord(unittest.TestCase):
    """Unit tests for Record.match: case-insensitive, duplicate-aware
    containment of a query within a record's words."""

    def setUp(self):
        # Mixed-case data exercises case-insensitive matching; the
        # multiple-entries record exercises duplicate-word counting.
        self.record_with_ample_data = Record(('red', 'Blue', 'black', 'yellow', 'Pink', 'orange'))
        self.record_with_little_data = Record(('red', 'blue', 'Black'))
        self.record_with_multiple_entries = Record(('red', 'blue', 'blue', 'orange'))

    def test_constructor(self):
        # data is stored verbatim, original casing preserved
        self.assertEqual(self.record_with_ample_data.data, ('red', 'Blue', 'black', 'yellow', 'Pink', 'orange'))
        self.assertEqual(self.record_with_little_data.data, ('red', 'blue', 'Black'))

    def test_shorter_data_match(self):
        self.assertTrue(self.record_with_little_data.match(('red',)))
        self.assertTrue(self.record_with_little_data.match(('black',)))
        self.assertTrue(self.record_with_little_data.match(('blue',)))
        self.assertTrue(self.record_with_little_data.match(('black', 'blue')))
        self.assertTrue(self.record_with_little_data.match(('black', 'red')))
        self.assertTrue(self.record_with_little_data.match(('blue', 'red')))

    def test_shorter_data_does_not_match(self):
        self.assertFalse(self.record_with_little_data.match(('black', 'red', 'yellow', 'green')))
        self.assertFalse(self.record_with_little_data.match(('black', 'red', 'pink')))
        # the record holds only one 'black', so a duplicate must not match
        self.assertFalse(self.record_with_little_data.match(('black', 'black')))

    def test_longer_data_match(self):
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue')))
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange')))
        # matching is case-insensitive ('PINK' vs 'Pink')
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'yellow', 'PINK')))
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'pink')))

    def test_longer_data_does_not_match(self):
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple')))
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple', 'orange')))
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple', 'yellow', 'pink')))
        self.assertFalse(self.record_with_ample_data.match(('cyan',)))

    def test_multiple_entries_match(self):
        self.assertTrue(self.record_with_multiple_entries.match(('red',)))
        self.assertTrue(self.record_with_multiple_entries.match(('blue',)))
        # two 'blue' entries in the record satisfy a duplicated query word
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'blue')))
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'red', 'blue')))
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'red', 'blue', 'orange')))

    def test_multiple_entries_does_not_match(self):
        self.assertFalse(self.record_with_multiple_entries.match(('red', 'red')))
        self.assertFalse(self.record_with_multiple_entries.match(('red', 'pink')))
        # only two 'blue' entries exist, so three must not match
        self.assertFalse(self.record_with_multiple_entries.match(('blue', 'blue', 'blue')))
class TestSearchWorker(unittest.TestCase):
    """Unit tests for SearchWorker using stubbed records and real queues."""

    def setUp(self):
        # One record that always matches, one that never does.
        self.fake_record_with_match = Record(('fake_data',))
        self.fake_record_with_match.match = MagicMock(return_value=True)
        self.fake_record_without_match = Record(('more_fake_data',))
        self.fake_record_without_match.match = MagicMock(return_value=False)
        self.send_queue = Queue()
        self.receive_queue = Queue()

    def tearDown(self):
        self.fake_record_with_match = None
        self.fake_record_without_match = None
        self.send_queue.close()
        self.receive_queue.close()
        self.send_queue = None
        self.receive_queue = None

    def test_constructor(self):
        worker = SearchWorker([self.fake_record_without_match, self.fake_record_with_match], self.send_queue, self.receive_queue)
        self.assertEqual(worker.search_requests, self.send_queue)
        self.assertEqual(worker.search_results, self.receive_queue)
        self.assertEqual(worker.record_batch, [self.fake_record_without_match, self.fake_record_with_match])

    def test_match(self):
        worker = SearchWorker([self.fake_record_with_match, self.fake_record_without_match], self.send_queue, self.receive_queue)
        worker.start()
        self.send_queue.put(('search test',))
        # Bug fix: the original asserted via `mock.called_with(...)`, which is
        # not a Mock assertion method -- MagicMock accepts any attribute, so
        # those lines were silent no-ops.  They cannot be replaced with
        # assert_called_with either: the worker runs in a separate process,
        # so its calls never reach this process's mock objects.  The queue
        # contents below are the real behavioral check.
        self.assertEqual(self.receive_queue.get(), {('search test',): [('fake_data',)]})
        self.send_queue.put(None)
        # the worker echoes the None sentinel once it has drained its queue
        self.assertIsNone(self.receive_queue.get())
        worker.terminate()
if __name__ == '__main__':
    # Run this module's test cases when executed directly.
    unittest.main()
import unittest
from unittest.mock import MagicMock
from word_frequency import Index, CombinationIndex, Data, count_words
class TestCombinationIndex(unittest.TestCase):
    """Unit tests for CombinationIndex: indexing records under every word
    combination and looking them up order- and case-insensitively."""

    def setUp(self):
        self.records = [
            ('red', 'yellow', 'green', 'black'),
            ('red', 'green', 'blue', 'black'),
            ('yellow', 'green', 'blue'),
            ('yellow', 'blue', 'black')
        ]
        self.queries = [
            ('blue', 'yellow'),
            ('black', 'green')
        ]
        self.indexer = CombinationIndex()

    def tearDown(self):
        self.records = None
        self.queries = None
        self.indexer = None

    def test_index_single_item(self):
        self.indexer.index(('red',))
        self.assertEqual(sorted(self.indexer.indices.keys()), [('red',)])

    def test_index_multiple_items(self):
        # every non-empty combination of the four words, keys stored sorted
        expected = [
            ('black',), ('black', 'green'), ('black', 'green', 'red'), ('black', 'green', 'red', 'yellow'),
            ('black', 'red', 'yellow'), ('black', 'green', 'yellow'),
            ('black', 'red'), ('black', 'yellow'),
            ('green',), ('green', 'red'), ('green', 'red', 'yellow'),
            ('green', 'yellow'),
            ('red',), ('red', 'yellow'),
            ('yellow',)
        ]
        self.indexer.index(('red', 'yellow', 'green', 'black'))
        self.assertEqual(sorted(self.indexer.indices.keys()), sorted(expected))

    def test_search_single_item_without_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('black',)), [])

    # Bug fix: this method originally duplicated the name
    # test_search_single_item_without_match, silently shadowing the test
    # above so it never ran.
    def test_search_multi_word_query_without_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('black', 'red')), [])
        self.assertEqual(self.indexer.search(('black',)), [])

    def test_search_single_item_with_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('red',)), [('red',)])

    def test_search_multiple_item_without_match(self):
        self.indexer.index(('red', 'black'))
        self.assertEqual(self.indexer.search(('blue', 'red')), [])
        self.assertEqual(self.indexer.search(('red', 'blue')), [])
        self.assertEqual(self.indexer.search(('black', 'blue')), [])

    def test_search_multiple_item_with_match(self):
        record = [('red', 'black', 'yellow', 'green')]
        self.indexer.index(record[0])
        # lookups must succeed regardless of query word order
        self.assertEqual(self.indexer.search(('black',)), record)
        self.assertEqual(self.indexer.search(('red',)), record)
        self.assertEqual(self.indexer.search(('red', 'black')), record)
        self.assertEqual(self.indexer.search(('black', 'red')), record)
        self.assertEqual(self.indexer.search(('yellow', 'red')), record)
        self.assertEqual(self.indexer.search(('red', 'yellow')), record)
        self.assertEqual(self.indexer.search(('yellow', 'black')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow', 'red')), record)
        self.assertEqual(self.indexer.search(('black', 'green', 'red')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow', 'green')), record)

    def test_multiple_indexed_records_with_match(self):
        records = [('red', 'black', 'yellow', 'green'), ('red', 'green', 'blue')]
        self.indexer.index(records[0])
        self.indexer.index(records[1])
        self.assertEqual(self.indexer.search(('red', 'black')), [records[0]])
        self.assertEqual(self.indexer.search(('red', 'black', 'green')), [records[0]])
        # a query contained in both records returns both, in indexing order
        self.assertEqual(self.indexer.search(('red', 'green')), records)
        self.assertEqual(self.indexer.search(('blue', 'red', 'green')), [records[1]])
        self.assertEqual(self.indexer.search(('red', 'green', 'blue')), [records[1]])
        self.assertEqual(self.indexer.search(('red', 'green')), records)
class TestData(unittest.TestCase):
    """Unit tests for Data, driving it through a mocked Index."""

    def setUp(self):
        # Replace the abstract methods with mocks so calls can be asserted on.
        self.indexer = Index()
        self.indexer.index = MagicMock()
        self.indexer.search = MagicMock(return_value='found')
        self.data = Data(self.indexer)

    def tearDown(self):
        self.indexer = None
        self.data = None

    def test_constructor(self):
        self.assertEqual(self.data.indexer, self.indexer)
        self.assertEqual(self.data.data, [])

    def test_index(self):
        # indexing must both remember the datum and forward it to the indexer
        self.data.index('test_data')
        self.indexer.index.assert_called_with('test_data')
        self.assertEqual(self.data.data, ['test_data'])

    def test_search_with_empty_query(self):
        # an empty query returns every record without consulting the indexer
        self.data.data = ['fake']
        self.assertEqual(self.data.search(()), ['fake'])
        self.assertFalse(self.indexer.search.called)

    def test_search_with_non_empty_query(self):
        self.data.search('test_query')
        self.indexer.search.assert_called_with('test_query')
class TestCountWords(unittest.TestCase):
    """Unit tests for count_words: word frequencies across rows."""

    def test_count(self):
        expected = {'test': 2, 'hello': 2, 'world': 1, 'bingo': 3, 'no way': 1}
        test = [
            ['test', 'hello', 'world', 'test', 'no way', 'bingo', 'bingo'],
            ['bingo', 'hello'],
        ]
        self.assertEqual(count_words(test), expected)
        # empty rows and an empty result list both yield an empty dict
        self.assertEqual(count_words([[]]), {})
        self.assertEqual(count_words([]), {})
if __name__ == '__main__':
    # Run this module's test cases when executed directly.
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment