Skip to content

Instantly share code, notes, and snippets.

@catalanojuan
Last active August 29, 2015 13:59
Show Gist options
  • Save catalanojuan/10568610 to your computer and use it in GitHub Desktop.
Save catalanojuan/10568610 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import csv
from collections import defaultdict
from sets import Set
import sys
from json import dumps
def output(dictionary):
    """Print *dictionary* to standard output as a single line of JSON.

    Keys are emitted in sorted order so the same mapping always produces
    the same text.
    """
    print(dumps(dictionary, sort_keys=True))
def get_queries(f):
    """Return the lines that follow the '####' separator line of *f*.

    The file is scanned from its end so the (possibly large) section in
    front of the separator does not have to be read just to find the
    queries.  The scan starts with a 1024-unit tail and doubles the window
    until the separator shows up or the whole file has been covered.

    Arguments:
        f: an open, seekable file object.  It is rewound to the start
           before returning, including when an error is raised.

    Returns:
        The list of raw lines (line endings kept) after the separator;
        an empty list when the separator is the last line.

    Raises:
        ValueError: if no '####' separator line is found.

    NOTE(review): when several separator lines sit inside the scanned
    tail, the first complete one found there wins -- the file is presumably
    meant to hold exactly one separator; confirm against the input format.
    """
    # Tail-reading idea taken from:
    # http://stackoverflow.com/questions/260273/most-efficient-way-to-search-the-last-x-lines-of-a-file-in-python
    separator = '####\n'
    f.seek(0, 2)
    fsize = f.tell()
    window = 1024
    try:
        while True:
            start = max(fsize - window, 0)
            f.seek(start, 0)
            lines = f.readlines()
            if start > 0:
                # The window most likely begins in the middle of a line, so
                # its first entry cannot be trusted.  If it happened to be a
                # complete line it is picked up again by the next, wider
                # window.
                lines = lines[1:]
            if separator in lines:
                # Slice with a plain forward index: the previous negative
                # index arithmetic collapsed to 0 when the separator was
                # the last line and returned the whole tail.
                return lines[lines.index(separator) + 1:]
            if start == 0:
                raise ValueError("separator line '####' not found")
            window *= 2
    finally:
        f.seek(0, 0)
def count_words_frequency(query, file, queries_dict):
    """Unfinished stub: look *query* up in *queries_dict* and discard it.

    Nothing in this file calls this function.  *file* is accepted but never
    used, and the function always returns None.
    """
    # The looked-up value was only ever bound to an unused local; the call
    # itself is kept so the behaviour on arguments without ``.get`` does
    # not change.
    queries_dict.get(query)
def count_words_set_frequency_from_file(filename):
    """Print, for every query, how often other words appear alongside it.

    The file named *filename* holds comma-separated word lines, then a
    '####' separator line, then one comma-separated query per line.  For
    each query, every word line that contains all of the query's words
    contributes one count to each of its remaining words.

    One JSON object (word -> count, keys sorted) is printed per query, in
    query order.  A query for which no word was counted prints nothing.

    Arguments:
        filename: path of the input file.

    Raises:
        ValueError: propagated from get_queries() when the file has no
            separator line.
    """
    with open(filename, "r") as handle:
        queries = get_queries(handle)
        # Parse every query once instead of once per data line.  The
        # builtin set types replace sets.Set, which Python 3 removed.
        query_sets = [frozenset(query.strip('\n').split(','))
                      for query in queries]
        # One counter per query *position*: keying by the query text made
        # a repeated query line add to the same counter twice per match.
        counters = [defaultdict(int) for _ in queries]
        for line in handle:
            if line == '####\n':
                # Everything after the separator is the query section.
                break
            words = set(line.strip('\n').split(','))
            for query_words, counter in zip(query_sets, counters):
                # Only lines containing every word of the query count.
                if query_words.issubset(words):
                    for word in words - query_words:
                        counter[word] += 1
    # Finally print the dictionary of each query that counted anything.
    for counter in counters:
        if counter:
            output(counter)
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the input file path.
    if len(sys.argv) > 2:
        raise Exception('Only one argument supported: filename.')
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaced as a bare
        # IndexError on sys.argv[1].
        sys.exit('Usage: %s filename' % sys.argv[0])
    count_words_set_frequency_from_file(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment