Last active
August 29, 2015 13:59
-
-
Save catalanojuan/10568610 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
from collections import defaultdict | |
from sets import Set | |
import sys | |
from json import dumps | |
def output(dictionary):
    """Print *dictionary* to stdout as a single line of JSON.

    Keys are emitted in sorted order so the output is deterministic
    regardless of the mapping's insertion order.
    """
    print(dumps(dictionary, sort_keys=True))
def get_queries(f, tail_size=1024):
    """Return the query lines that follow the ``####`` separator line in *f*.

    Only the tail of the file is read at first; the window is doubled until
    a separator line is found or the whole file has been scanned. Tail-read
    idea taken from:
    http://stackoverflow.com/questions/260273/most-efficient-way-to-search-the-last-x-lines-of-a-file-in-python

    Args:
        f: an open, seekable file object positioned anywhere.
        tail_size: number of trailing bytes to inspect on the first attempt.

    Returns:
        The lines after the separator, with their line endings preserved.
        An empty list when the separator is the last line of the file.

    Raises:
        ValueError: if the file contains no ``####`` separator line.

    The file position is reset to the start of the file before returning
    or raising.
    """
    separator = '####'
    f.seek(0, 2)
    fsize = f.tell()
    window = max(int(tail_size), 1)
    try:
        while True:
            start = max(fsize - window, 0)
            # NOTE(review): on a text-mode file with a multi-byte encoding
            # this offset may land inside a character -- confirm inputs are
            # single-byte encoded or open the file in a compatible mode.
            f.seek(start, 0)
            lines = f.readlines()
            if start > 0:
                # The window may begin in the middle of a line; the fragment
                # must not be mistaken for a separator or a query.
                lines = lines[1:]
            for index, line in enumerate(lines):
                if line.rstrip('\r\n') == separator:
                    # Slice from the separator's index so that a separator
                    # on the last line yields [] rather than every line.
                    return lines[index + 1:]
            if start == 0:
                raise ValueError("separator line '####' not found")
            window *= 2
    finally:
        f.seek(0, 0)
def count_words_frequency(query, file, queries_dict):
    """Unfinished placeholder: look up *query* in *queries_dict*.

    The looked-up value is discarded, *file* is never read, and the
    function always returns ``None``. Nothing else in this file calls it.
    """
    # NOTE(review): looks like an abandoned stub -- finish it or remove it.
    queries_dict.get(query)
    return None
def count_words_set_frequency_from_file(filename):
    """Print, for each query, how often other words co-occur with it.

    The file is expected to hold comma-separated word lines, then a line
    containing only ``####``, then one comma-separated query per line.
    For every data line that contains all the words of a query, each
    remaining word on that line is counted once for that query.

    One JSON object (word -> count) is printed per query, in query order.
    Queries for which no other word was ever counted print nothing.

    Args:
        filename: path of the input file.

    Raises:
        ValueError: propagated from ``get_queries`` when the file has no
            ``####`` separator line.
    """
    with open(filename, "r") as handle:
        queries = get_queries(handle)
        # Parse every query once up front instead of once per data line.
        # The raw query line stays the dictionary key.
        query_sets = [
            (query, set(query.rstrip('\r\n').split(',')))
            for query in queries
        ]
        results_dict = defaultdict(lambda: defaultdict(int))
        # Iterating the file ends at EOF, so a missing or malformed
        # separator can no longer cause an endless loop.
        for line in handle:
            record = line.rstrip('\r\n')
            if record == '####':
                break
            # For each data line, build the set of its words...
            words_set = set(record.split(','))
            # ...and credit every query whose words all appear on the line
            # with one occurrence of each other word.
            for query, query_set in query_sets:
                if query_set.issubset(words_set):
                    for word in words_set - query_set:
                        results_dict[query][word] += 1
    # Finally print the dictionary for each query.
    for query in queries:
        if query in results_dict:
            output(results_dict[query])
if __name__ == '__main__':
    # Exactly one positional argument (the input file) is required.
    # Checking for "!= 2" also covers the no-argument case, which would
    # otherwise fail with an IndexError on sys.argv[1].
    if len(sys.argv) != 2:
        sys.exit('Usage: %s FILENAME (exactly one argument supported)'
                 % sys.argv[0])
    count_words_set_frequency_from_file(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment