Skip to content

Instantly share code, notes, and snippets.

@wchan2
Last active August 29, 2015 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wchan2/02edb65d7d2e722be594 to your computer and use it in GitHub Desktop.
Save wchan2/02edb65d7d2e722be594 to your computer and use it in GitHub Desktop.
Word Frequency Search
*.pyc
.DS_STORE
records.txt
queries.txt

Word Frequency

Consider a list of sets of words (each set of words is called a record) and a single set of words (called the query). For each word that is not in the query, how many times does the word appear in all records that the entire query is present in? Output a dictionary of words to counts, omitting words with counts of zero. Given a list of records and a list of queries, determine the output for each query with respect to the entire list of records.

Example

The following are examples of a records file and queries file, and the expected output from your solution.

records.txt

red,yellow,green,black
red,green,blue,black
yellow,green,blue
yellow,blue,black

queries.txt

blue,yellow
black,green

expected output

{"black": 1, "green": 1}
{"blue": 1, "red": 2, "yellow": 1}

Challenge

Download and unarchive word_frequency.zip from the following link: https://outbrain.box.com/shared/static/ti3rxupnk6x4jj6d0a6bgkbs804to5om.zip

Inside there are two files, records.txt and queries.txt. The records file contains a large list of sets of words to be used as training data. The queries file contains individual sets of words to be interpreted as queries against all of the records.

  • Each file consists of ASCII-encoded text.
  • Each line of the files contains a set of comma-separated words.
  • Aside from the separating commas, there is no other punctuation in the files.
  • None of the words contain whitespace.
  • Words are case-sensitive.
  • No word appears more than once in the same line.
  • For each query, use the following function to output your result dictionary:

from json import dumps
def output(dictionary):    
    print(dumps(dictionary, sort_keys=True))

Your solution should be implemented in either Python or Java.

from collections import defaultdict
from itertools import combinations
from multiprocessing import Pool
import time
import math
import json
class Index:
    """Abstract base for record indexes.

    Subclasses must implement `index` (register a record) and `search`
    (return the list of records matching a query).
    """

    def index(self, record):
        """Register a record (a tuple of words) with the index."""
        raise Exception('#index not implemented for Index sub-type')

    def search(self, query):
        """Return all records matching the query (a tuple of words)."""
        # Bug fix: the original message said '#match', but this method is
        # named 'search' -- the error would have pointed at the wrong method.
        raise Exception('#search not implemented for Index sub-type')
class CombinationIndex(Index):
    """Index records under every sorted combination of their words.

    Indexing a record of n words stores 2**n - 1 keys, making each lookup
    O(1) at the cost of exponential index size.
    """

    def __init__(self):
        # normalized key tuple -> list of records containing all key words
        self.indices = defaultdict(list)

    def index(self, record):
        """Register `record` under every non-empty combination of its words."""
        sorted_record = sorted(record)
        for size in range(1, len(sorted_record) + 1):
            for key in combinations(sorted_record, size):
                self.indices[self._normalize(key)].append(record)

    def search(self, query):
        """Return every indexed record that contains all words of `query`."""
        # Bug fix: indexing the defaultdict directly inserts an empty list on
        # every miss, so repeated unmatched queries would grow the index
        # unboundedly.  .get() returns the same [] without mutating it.
        return self.indices.get(self._normalize(query), [])

    def _normalize(self, datum):
        # Canonical key form: lowercase every word, then sort.
        return tuple(sorted(piece.lower() for piece in datum))
class Data:
    """Holds every indexed record and delegates lookups to an indexer.

    An empty query matches everything, so it is answered from the raw
    record list without consulting the indexer.
    """

    def __init__(self, indexer):
        self.indexer = indexer
        self.data = []

    def index(self, datum):
        """Remember `datum` and register it with the indexer."""
        self.data.append(datum)
        self.indexer.index(datum)

    def search(self, query):
        """Return the records matching `query` (all records if query is empty)."""
        if not query:
            return self.data
        return self.indexer.search(query)
def output(dictionary):
    """Print `dictionary` as JSON with keys in sorted order."""
    serialized = json.dumps(dictionary, sort_keys=True)
    print(serialized)
def time_it(fn):
    """Run the zero-argument callable `fn` and print its wall-clock duration."""
    started_at = time.time()
    fn()
    print('time elapsed: %f' % (time.time() - started_at))
def count_words(results):
    """Return {word: occurrence count} across every row in `results`."""
    counts = {}
    for row in results:
        for word in row:
            counts[word] = counts.get(word, 0) + 1
    return counts
def sequential_execute():
    """Index records.txt, then answer each query in queries.txt in order.

    For every query, prints a JSON dict of non-query words to the number of
    times they appear across all records containing the entire query.
    """
    data = Data(CombinationIndex())
    with open('records.txt', 'r') as records:
        for line in records:
            line = line.strip()
            if line:  # skip blank/trailing lines
                data.index(tuple(line.split(',')))
    with open('queries.txt', 'r') as queries:
        for raw_query in queries:
            # Bug fix: the original split the raw line, so the last query
            # word carried a trailing '\n' and never matched the index.
            query = tuple(raw_query.strip().split(','))
            results = data.search(query)
            word_count = count_words(results)
            # Query words appear in every matching record; drop them from the
            # output.  pop() tolerates the empty-result case, where `del`
            # would raise KeyError.
            for word in query:
                word_count.pop(word, None)
            # Bug fix: the original called `outputs`, an undefined name.
            output(word_count)
def parallel_execute(process_count):
    """Index records.txt across `process_count` workers, then answer queries.

    Each worker builds a partial index over a batch of record lines; every
    query is answered by merging the partial results.
    """
    pool = Pool(process_count)
    with open('records.txt', 'r') as records:
        lines = [line for line in records.read().split('\n') if line]
    # Bug fix: the original passed individual line *strings* to index_data
    # (chunksize only batches delivery, not arguments), so index_data
    # iterated characters instead of lines.  Build explicit batches so each
    # worker receives a list of lines.
    batch_size = math.ceil(len(lines) / process_count)
    line_batches = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    datasets = pool.map(index_data, line_batches)
    with open('queries.txt', 'r') as queries:
        for raw_query in queries:
            # strip the trailing newline so the last word matches the index
            query = tuple(raw_query.strip().split(','))
            results = []
            for data in datasets:
                results.extend(data.search(query))
            word_count = count_words(results)
            # pop() tolerates empty result sets where `del` raises KeyError
            for word in query:
                word_count.pop(word, None)
            # Bug fix: the original called `outputs`, an undefined name.
            output(word_count)
def index_data(lines):
    """Build and return a combination-indexed Data set from `lines`.

    Each element of `lines` is a comma-separated record string.
    """
    dataset = Data(CombinationIndex())
    for record_line in lines:
        record = tuple(record_line.split(','))
        dataset.index(record)
    return dataset
if __name__ == '__main__':
    # Benchmark the parallel implementation with 4 worker processes; swap the
    # commented line in to time the sequential version instead.
    time_it(lambda: parallel_execute(4))
    # time_it(sequential_execute)
from multiprocessing import Process, Queue, cpu_count
from itertools import combinations
from collections import defaultdict
from word_frequency import count_words, output
import math
class Record:
    """A single record: a tuple of words that queries are matched against."""

    def __init__(self, data):
        # the record's words, kept exactly as given (case preserved)
        self.data = data

    def match(self, query):
        """Return True if the record contains every query word.

        Matching is case-insensitive and respects duplicates: a query
        containing a word twice only matches records holding it at least
        twice.  Equivalent to the original combinations() membership test
        (sorted combinations of the sorted data are exactly its sorted
        sub-multisets), but runs in O(n) instead of exponential time.
        """
        available = defaultdict(int)
        for word in self.data:
            available[word.lower()] += 1
        for word in query:
            key = word.lower()
            available[key] -= 1
            if available[key] < 0:  # query needs more copies than we have
                return False
        return True

    def _normalize(self, query):
        # Canonical, case-insensitive form of a word collection.
        return tuple(sorted([piece.lower() for piece in query]))
class SearchWorker(Process):
    """Daemon process that matches queued queries against a batch of records.

    Consumes queries from `search_requests` until a None sentinel arrives.
    For each query it puts {query_tuple: [matching record data]} onto
    `search_results`, then a final None to signal it has finished.
    """

    def __init__(self, record_batch, search_requests, search_results):
        super(SearchWorker, self).__init__()
        self.daemon = True
        self.record_batch = record_batch
        self.search_requests = search_requests
        self.search_results = search_results

    def run(self):
        while True:
            query = self.search_requests.get()
            if query is None:  # sentinel: no more queries
                break
            key = tuple(query)
            hits = [record.data for record in self.record_batch if record.match(key)]
            self.search_results.put({key: hits})
        self.search_results.put(None)
def batches(elements, batch_count):
    """Split `elements` into at most `batch_count` contiguous batches.

    Every batch has the same size except possibly the last.  Returns [] for
    empty input.
    """
    if not elements:
        # guard: ceil(0 / n) == 0 would make range() step by zero below
        return []
    # Bug fix: the original computed the size from the *global* `rows`
    # instead of the `elements` parameter.
    batch_size = math.ceil(len(elements) / batch_count)
    return [elements[i:i + batch_size] for i in range(0, len(elements), batch_size)]
if __name__ == '__main__':
    # Load every record and every query.
    with open('records.txt', 'r') as records:
        rows = [Record(tuple(record.strip().split(','))) for record in records]
    with open('queries.txt', 'r') as queries:
        # filter out blank lines so a trailing newline is not searched as ('',)
        lines = [line for line in queries.read().split('\n') if line]

    # Create the result queue, one request queue per CPU, and one worker per
    # record batch (there may be fewer batches than CPUs).
    search_results = Queue()
    search_queues = [Queue() for _ in range(cpu_count())]
    record_batches = batches(rows, cpu_count())
    processes = [SearchWorker(batch, search_queues[i], search_results)
                 for (i, batch) in enumerate(record_batches)]
    for process in processes:
        process.start()

    # Broadcast every query to every worker, then a None sentinel to each.
    for query in lines:
        for search in search_queues:
            search.put(query.split(','))
    for search in search_queues:
        search.put(None)

    # Reduce: each worker emits one None sentinel when done.  Bug fix: the
    # original used iter(search_results.get, None), which stops at the FIRST
    # sentinel -- results from slower workers arriving after a fast worker's
    # sentinel were silently dropped.  Count sentinels instead.
    results = defaultdict(list)
    finished = 0
    while finished < len(processes):
        result = search_results.get()
        if result is None:
            finished += 1
            continue
        for key in result:
            if len(result[key]) > 0:
                results[key].extend(result[key])
    search_results.close()

    # All sentinels received, so every worker is idle; terminate them.
    for process in processes:
        process.terminate()

    # Remove the query's own words from each count and print the result.
    for key in results:
        word_counts = count_words(results[key])
        for word in key:
            word_counts.pop(word, None)
        output(word_counts)
from multiprocessing import Queue
from unittest.mock import MagicMock
from word_frequency_2 import Record, SearchWorker
import unittest
class TestRecord(unittest.TestCase):
    """Unit tests for Record.match: case-insensitive, duplicate-aware
    containment of a query within a record's words."""

    def setUp(self):
        # Mixed-case data exercises case-insensitive matching; the
        # multiple-entries record exercises duplicate-word counting.
        self.record_with_ample_data = Record(('red', 'Blue', 'black', 'yellow', 'Pink', 'orange'))
        self.record_with_little_data = Record(('red', 'blue', 'Black'))
        self.record_with_multiple_entries = Record(('red', 'blue', 'blue', 'orange'))

    def test_constructor(self):
        # data is stored verbatim, original casing preserved
        self.assertEqual(self.record_with_ample_data.data, ('red', 'Blue', 'black', 'yellow', 'Pink', 'orange'))
        self.assertEqual(self.record_with_little_data.data, ('red', 'blue', 'Black'))

    def test_shorter_data_match(self):
        self.assertTrue(self.record_with_little_data.match(('red',)))
        self.assertTrue(self.record_with_little_data.match(('black',)))
        self.assertTrue(self.record_with_little_data.match(('blue',)))
        self.assertTrue(self.record_with_little_data.match(('black', 'blue')))
        self.assertTrue(self.record_with_little_data.match(('black', 'red')))
        self.assertTrue(self.record_with_little_data.match(('blue', 'red')))

    def test_shorter_data_does_not_match(self):
        self.assertFalse(self.record_with_little_data.match(('black', 'red', 'yellow', 'green')))
        self.assertFalse(self.record_with_little_data.match(('black', 'red', 'pink')))
        # the record holds only one 'black', so a duplicate must not match
        self.assertFalse(self.record_with_little_data.match(('black', 'black')))

    def test_longer_data_match(self):
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue')))
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange')))
        # matching is case-insensitive ('PINK' vs 'Pink')
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'yellow', 'PINK')))
        self.assertTrue(self.record_with_ample_data.match(('black', 'red', 'blue', 'pink')))

    def test_longer_data_does_not_match(self):
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple')))
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple', 'orange')))
        self.assertFalse(self.record_with_ample_data.match(('black', 'red', 'blue', 'orange', 'purple', 'yellow', 'pink')))
        self.assertFalse(self.record_with_ample_data.match(('cyan',)))

    def test_multiple_entries_match(self):
        self.assertTrue(self.record_with_multiple_entries.match(('red',)))
        self.assertTrue(self.record_with_multiple_entries.match(('blue',)))
        # two 'blue' entries in the record satisfy a duplicated query word
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'blue')))
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'red', 'blue')))
        self.assertTrue(self.record_with_multiple_entries.match(('blue', 'red', 'blue', 'orange')))

    def test_multiple_entries_does_not_match(self):
        self.assertFalse(self.record_with_multiple_entries.match(('red', 'red')))
        self.assertFalse(self.record_with_multiple_entries.match(('red', 'pink')))
        # only two 'blue' entries exist, so three must not match
        self.assertFalse(self.record_with_multiple_entries.match(('blue', 'blue', 'blue')))
class TestSearchWorker(unittest.TestCase):
    """Unit tests for SearchWorker using stubbed records and real queues."""

    def setUp(self):
        # One record that always matches, one that never does.
        self.fake_record_with_match = Record(('fake_data',))
        self.fake_record_with_match.match = MagicMock(return_value=True)
        self.fake_record_without_match = Record(('more_fake_data',))
        self.fake_record_without_match.match = MagicMock(return_value=False)
        self.send_queue = Queue()
        self.receive_queue = Queue()

    def tearDown(self):
        self.fake_record_with_match = None
        self.fake_record_without_match = None
        self.send_queue.close()
        self.receive_queue.close()
        self.send_queue = None
        self.receive_queue = None

    def test_constructor(self):
        worker = SearchWorker([self.fake_record_without_match, self.fake_record_with_match], self.send_queue, self.receive_queue)
        self.assertEqual(worker.search_requests, self.send_queue)
        self.assertEqual(worker.search_results, self.receive_queue)
        self.assertEqual(worker.record_batch, [self.fake_record_without_match, self.fake_record_with_match])

    def test_match(self):
        worker = SearchWorker([self.fake_record_with_match, self.fake_record_without_match], self.send_queue, self.receive_queue)
        worker.start()
        self.send_queue.put(('search test',))
        # Bug fix: the original asserted via `mock.called_with(...)`, which is
        # not a Mock assertion method -- MagicMock accepts any attribute, so
        # those lines were silent no-ops.  They cannot be replaced with
        # assert_called_with either: the worker runs in a separate process,
        # so its calls never reach this process's mock objects.  The queue
        # contents below are the real behavioral check.
        self.assertEqual(self.receive_queue.get(), {('search test',): [('fake_data',)]})
        self.send_queue.put(None)
        # the worker echoes the None sentinel once it has drained its queue
        self.assertIsNone(self.receive_queue.get())
        worker.terminate()
if __name__ == '__main__':
    # Run this module's test cases when executed directly.
    unittest.main()
import unittest
from unittest.mock import MagicMock
from word_frequency import Index, CombinationIndex, Data, count_words
class TestCombinationIndex(unittest.TestCase):
    """Unit tests for CombinationIndex: indexing records under every word
    combination and looking them up order- and case-insensitively."""

    def setUp(self):
        self.records = [
            ('red', 'yellow', 'green', 'black'),
            ('red', 'green', 'blue', 'black'),
            ('yellow', 'green', 'blue'),
            ('yellow', 'blue', 'black')
        ]
        self.queries = [
            ('blue', 'yellow'),
            ('black', 'green')
        ]
        self.indexer = CombinationIndex()

    def tearDown(self):
        self.records = None
        self.queries = None
        self.indexer = None

    def test_index_single_item(self):
        self.indexer.index(('red',))
        self.assertEqual(sorted(self.indexer.indices.keys()), [('red',)])

    def test_index_multiple_items(self):
        # every non-empty combination of the four words, keys stored sorted
        expected = [
            ('black',), ('black', 'green'), ('black', 'green', 'red'), ('black', 'green', 'red', 'yellow'),
            ('black', 'red', 'yellow'), ('black', 'green', 'yellow'),
            ('black', 'red'), ('black', 'yellow'),
            ('green',), ('green', 'red'), ('green', 'red', 'yellow'),
            ('green', 'yellow'),
            ('red',), ('red', 'yellow'),
            ('yellow',)
        ]
        self.indexer.index(('red', 'yellow', 'green', 'black'))
        self.assertEqual(sorted(self.indexer.indices.keys()), sorted(expected))

    def test_search_single_item_without_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('black',)), [])

    # Bug fix: this method originally duplicated the name
    # test_search_single_item_without_match, silently shadowing the test
    # above so it never ran.
    def test_search_multi_word_query_without_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('black', 'red')), [])
        self.assertEqual(self.indexer.search(('black',)), [])

    def test_search_single_item_with_match(self):
        self.indexer.index(('red',))
        self.assertEqual(self.indexer.search(('red',)), [('red',)])

    def test_search_multiple_item_without_match(self):
        self.indexer.index(('red', 'black'))
        self.assertEqual(self.indexer.search(('blue', 'red')), [])
        self.assertEqual(self.indexer.search(('red', 'blue')), [])
        self.assertEqual(self.indexer.search(('black', 'blue')), [])

    def test_search_multiple_item_with_match(self):
        record = [('red', 'black', 'yellow', 'green')]
        self.indexer.index(record[0])
        # lookups must succeed regardless of query word order
        self.assertEqual(self.indexer.search(('black',)), record)
        self.assertEqual(self.indexer.search(('red',)), record)
        self.assertEqual(self.indexer.search(('red', 'black')), record)
        self.assertEqual(self.indexer.search(('black', 'red')), record)
        self.assertEqual(self.indexer.search(('yellow', 'red')), record)
        self.assertEqual(self.indexer.search(('red', 'yellow')), record)
        self.assertEqual(self.indexer.search(('yellow', 'black')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow', 'red')), record)
        self.assertEqual(self.indexer.search(('black', 'green', 'red')), record)
        self.assertEqual(self.indexer.search(('black', 'yellow', 'green')), record)

    def test_multiple_indexed_records_with_match(self):
        records = [('red', 'black', 'yellow', 'green'), ('red', 'green', 'blue')]
        self.indexer.index(records[0])
        self.indexer.index(records[1])
        self.assertEqual(self.indexer.search(('red', 'black')), [records[0]])
        self.assertEqual(self.indexer.search(('red', 'black', 'green')), [records[0]])
        # a query contained in both records returns both, in indexing order
        self.assertEqual(self.indexer.search(('red', 'green')), records)
        self.assertEqual(self.indexer.search(('blue', 'red', 'green')), [records[1]])
        self.assertEqual(self.indexer.search(('red', 'green', 'blue')), [records[1]])
        self.assertEqual(self.indexer.search(('red', 'green')), records)
class TestData(unittest.TestCase):
    """Unit tests for Data, driving it through a mocked Index."""

    def setUp(self):
        # Replace the abstract methods with mocks so calls can be asserted on.
        self.indexer = Index()
        self.indexer.index = MagicMock()
        self.indexer.search = MagicMock(return_value='found')
        self.data = Data(self.indexer)

    def tearDown(self):
        self.indexer = None
        self.data = None

    def test_constructor(self):
        self.assertEqual(self.data.indexer, self.indexer)
        self.assertEqual(self.data.data, [])

    def test_index(self):
        # indexing must both remember the datum and forward it to the indexer
        self.data.index('test_data')
        self.indexer.index.assert_called_with('test_data')
        self.assertEqual(self.data.data, ['test_data'])

    def test_search_with_empty_query(self):
        # an empty query returns every record without consulting the indexer
        self.data.data = ['fake']
        self.assertEqual(self.data.search(()), ['fake'])
        self.assertFalse(self.indexer.search.called)

    def test_search_with_non_empty_query(self):
        self.data.search('test_query')
        self.indexer.search.assert_called_with('test_query')
class TestCountWords(unittest.TestCase):
    """Unit tests for count_words: word frequencies across rows."""

    def test_count(self):
        expected = {'test': 2, 'hello': 2, 'world': 1, 'bingo': 3, 'no way': 1}
        test = [
            ['test', 'hello', 'world', 'test', 'no way', 'bingo', 'bingo'],
            ['bingo', 'hello'],
        ]
        self.assertEqual(count_words(test), expected)
        # empty rows and an empty result list both yield an empty dict
        self.assertEqual(count_words([[]]), {})
        self.assertEqual(count_words([]), {})
if __name__ == '__main__':
    # Run this module's test cases when executed directly.
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment