catalanojuan/hulu_challenge.py

## hulu_challenge.py
#!/usr/bin/env python

import csv
from collections import defaultdict
from sets import Set
import sys

from json import dumps


def output(dictionary):
  """Print ``dictionary`` as one line of JSON with its keys sorted."""
  serialized = dumps(dictionary, sort_keys=True)
  print(serialized)


def get_queries(f, tail_size=1024):
  """Return the query lines that follow the ``####`` separator line in ``f``.

  Only the end of the file is inspected at first; the inspected window is
  doubled until the separator shows up, so a query section larger than
  ``tail_size`` is still found.

  Args:
    f: seekable file object opened for reading. It is rewound to the start
      before this function returns or raises.
    tail_size: how much of the end of the file to inspect on the first
      attempt, in file-position units.

  Returns:
    The lines after the first separator line inside the inspected window,
    trailing newlines kept. An empty list when the separator is the last
    line of the file.

  Raises:
    ValueError: if the file contains no ``####`` line.
  """
  separator = '####\n'
  f.seek(0, 2)
  fsize = f.tell()
  window = max(tail_size, 1)

  try:
    while True:
      start = max(fsize - window, 0)
      f.seek(start, 0)
      lines = f.readlines()

      # A window that starts mid-file may begin with a fragment of a line, so
      # its first entry is not trusted; a wider window will cover it whole.
      if start > 0:
        lines = lines[1:]

      if separator in lines:
        # Slice from the index directly: the old negative-offset arithmetic
        # collapsed to 0 (i.e. "everything") when the separator came last.
        return lines[lines.index(separator) + 1:]

      if start == 0:
        raise ValueError("separator line '####' not found")

      window *= 2
  finally:
    # Callers go on to read the data section from the top of the file.
    f.seek(0, 0)


def count_words_frequency(query, file, queries_dict):
  """Look up ``query`` in ``queries_dict`` and discard the result.

  Unfinished placeholder: ``file`` is not used and nothing is returned.
  """
  queries_dict.get(query)
  return None

def count_words_set_frequency_from_file(filename):
  """Print, per query, how often every other word shares a line with it.

  The file is read as comma-separated word lines, then a ``####`` separator
  line, then one comma-separated query per line. For each data line that
  contains all words of a query, every remaining word on that line gets its
  counter for that query incremented.

  Args:
    filename: path of the input file.

  Output:
    One JSON object (word -> count, keys sorted) per query line, in query
    order. Queries for which no co-occurring word was counted print nothing.

  Raises:
    ValueError: propagated from ``get_queries`` when the separator is missing.
  """
  with open(filename, "r") as source:
    queries = get_queries(source)

    # Parse each distinct query once, up front. Keying by the raw query line
    # also keeps a query that is listed twice from being counted twice per
    # data line.
    query_sets = {}
    for query in queries:
      if query not in query_sets:
        query_sets[query] = set(query.strip('\n').split(','))

    results_dict = defaultdict(lambda: defaultdict(int))

    # Iterating the file (rather than polling readline) ends cleanly at EOF
    # even if the separator line is never reached.
    for line in source:
      if line == '####\n':
        break

      words_set = set(line.strip('\n').split(','))

      for query, query_set in query_sets.items():
        # The query matches when all of its words appear on the current line.
        if query_set.issubset(words_set):
          for word in words_set - query_set:
            results_dict[query][word] += 1

  # Finally print the dictionary for each query, in the order they were given.
  for query in queries:
    if query in results_dict:
      output(results_dict[query])


if __name__ == '__main__':
  # Script entry point: expects exactly one command-line argument, the path
  # of the input file.

  if len(sys.argv) > 2:
    raise Exception('Only one argument supported: filename.')
  # Without this check a missing argument surfaced as a bare IndexError.
  if len(sys.argv) < 2:
    raise Exception('Missing argument: filename.')

  count_words_set_frequency_from_file(sys.argv[1])
	#!/usr/bin/env python

	import csv
	from collections import defaultdict
	from sets import Set
	import sys

	from json import dumps


	def output(dictionary):
		"""Print ``dictionary`` as one line of JSON with its keys sorted."""
		print(dumps(dictionary, sort_keys=True))


	def get_queries(f, tail_size=1024):
		"""Return the query lines that follow the ``####`` separator line in ``f``.

		Only the end of the file is inspected at first; the inspected window is
		doubled until the separator shows up, so a query section larger than
		``tail_size`` is still found.

		Args:
			f: seekable file object opened for reading. It is rewound to the
				start before this function returns or raises.
			tail_size: how much of the end of the file to inspect on the first
				attempt, in file-position units.

		Returns:
			The lines after the first separator line inside the inspected
			window, trailing newlines kept. An empty list when the separator
			is the last line of the file.

		Raises:
			ValueError: if the file contains no ``####`` line.
		"""
		separator = '####\n'
		f.seek(0, 2)
		fsize = f.tell()
		window = max(tail_size, 1)

		try:
			while True:
				start = max(fsize - window, 0)
				f.seek(start, 0)
				lines = f.readlines()

				# A window that starts mid-file may begin with a fragment of a
				# line, so its first entry is not trusted; a wider window will
				# cover it whole.
				if start > 0:
					lines = lines[1:]

				if separator in lines:
					# Slice from the index directly: negative-offset arithmetic
					# collapses to "everything" when the separator comes last.
					return lines[lines.index(separator) + 1:]

				if start == 0:
					raise ValueError("separator line '####' not found")

				window *= 2
		finally:
			# Callers go on to read the data section from the top of the file.
			f.seek(0, 0)


	def count_words_frequency(query, file, queries_dict):
		"""Look up ``query`` in ``queries_dict`` and discard the result.

		Unfinished placeholder: ``file`` is not used and nothing is returned.
		"""
		queries_dict.get(query)

	def count_words_set_frequency_from_file(filename):
		"""Print, per query, how often every other word shares a line with it.

		The file is read as comma-separated word lines, then a ``####``
		separator line, then one comma-separated query per line. For each data
		line that contains all words of a query, every remaining word on that
		line gets its counter for that query incremented.

		Args:
			filename: path of the input file.

		Output:
			One JSON object (word -> count, keys sorted) per query line, in
			query order. Queries for which no co-occurring word was counted
			print nothing.
		"""
		with open(filename, "r") as source:
			queries = get_queries(source)

			# Parse each distinct query once, up front. Keying by the raw query
			# line also keeps a query that is listed twice from being counted
			# twice per data line.
			query_sets = {}
			for query in queries:
				if query not in query_sets:
					query_sets[query] = set(query.strip('\n').split(','))

			results_dict = defaultdict(lambda: defaultdict(int))

			# Iterating the file (rather than polling readline) ends cleanly at
			# EOF even if the separator line is never reached.
			for line in source:
				if line == '####\n':
					break

				words_set = set(line.strip('\n').split(','))

				for query, query_set in query_sets.items():
					# The query matches when all of its words appear on the line.
					if query_set.issubset(words_set):
						for word in words_set - query_set:
							results_dict[query][word] += 1

		# Finally print the dictionary for each query, in the order given.
		for query in queries:
			if query in results_dict:
				output(results_dict[query])


	if __name__ == '__main__':
		# Script entry point: expects exactly one command-line argument, the
		# path of the input file.

		if len(sys.argv) > 2:
			raise Exception('Only one argument supported: filename.')
		# Without this check a missing argument surfaces as a bare IndexError.
		if len(sys.argv) < 2:
			raise Exception('Missing argument: filename.')

		count_words_set_frequency_from_file(sys.argv[1])