stefano-maggiolo/count.py

## count.py
#!/usr/bin/env python

import math
import sys

from collections import defaultdict


def _load_counts(path):
    """Return a mapping from lower-cased word to its summed count.

    Every non-blank line of *path* is split on tabs; the first field is
    the word and the second is parsed as an integer count.  Counts of
    words that differ only in case are added together.
    """
    totals = defaultdict(int)
    # The context manager closes the file even if a line fails to parse.
    with open(path) as wordlist:
        for raw in wordlist:
            stripped = raw.strip()
            if not stripped:
                # A blank line carries no word/count pair.
                continue
            fields = stripped.split('\t')
            totals[fields[0].lower()] += int(fields[1])
    return totals


def main(argv=None):
    """Print ``log10(count / total_count), count, word`` for every word.

    *argv* defaults to ``sys.argv`` and must have the form
    ``[program, wordlist, total_count]``; otherwise a usage line is
    printed and the process exits with status 1.  Rows are printed in
    descending ``(count, word)`` order.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        # A single parenthesised argument is valid both as the Python 2
        # print statement and as the Python 3 print function.
        print("%s wordlist total_count" % argv[0])
        sys.exit(1)

    total_count = int(argv[2])
    words = _load_counts(argv[1])

    ranked = sorted(((count, word) for word, count in words.items()),
                    reverse=True)
    for count, word in ranked:
        # "* 1.0" forces float division on Python 2 as well.
        print("%6.2lf, %10d, %s" % (
            math.log(count * 1.0 / total_count, 10),
            count,
            word))


if __name__ == "__main__":
    main()

## extract.py
#!/usr/bin/env python3

import sys

from collections import defaultdict


MIN_OCCURRENCES = 3


def _count_ngrams(words):
    """Count leading bigrams, trailing bigrams and trigrams of *words*.

    Each word is lower-cased first, and an n-gram is counted only when
    every one of its characters is alphabetic.  Returns the tuple
    ``(starts, ends, trigrams)`` of ``defaultdict(int)`` mappings from
    n-gram to number of occurrences.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)
    for word in words:
        word = word.lower()
        head, tail = word[:2], word[-2:]
        # str.isalpha() on a two-character slice is true only when both
        # characters are alphabetic.
        if len(head) == 2 and head.isalpha():
            starts[head] += 1
        if len(tail) == 2 and tail.isalpha():
            ends[tail] += 1
        for i in range(len(word) - 2):
            trigram = word[i:i + 3]
            if trigram.isalpha():
                trigrams[trigram] += 1
    return starts, ends, trigrams


def _select_trigrams(starts, ends, trigrams, min_occurrences=MIN_OCCURRENCES):
    """Return the trigrams that pass all three frequency thresholds.

    A trigram is kept when its own count, the count of its first two
    letters as a word start, and the count of its last two letters as a
    word end are all at least *min_occurrences*.  The result is ordered
    by descending ``(count, trigram)``.
    """
    kept = [
        (count, trigram)
        for trigram, count in trigrams.items()
        if count >= min_occurrences
        # .get() avoids inserting zero entries into the defaultdicts.
        and starts.get(trigram[:2], 0) >= min_occurrences
        and ends.get(trigram[-2:], 0) >= min_occurrences
    ]
    kept.sort(reverse=True)
    return [trigram for _, trigram in kept]


def main(argv=None):
    """Print the selected trigrams of the text file named in *argv*.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the path of a text
    file that is split on whitespace.  Only tokens of at least three
    characters are analysed.  Without a path a usage line is printed and
    the process exits with status 1.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("%s textfile" % argv[0])
        sys.exit(1)

    # The context manager closes the input file once it has been read.
    with open(argv[1]) as text:
        words = [token for token in text.read().split() if len(token) >= 3]

    starts, ends, trigrams = _count_ngrams(words)
    for trigram in _select_trigrams(starts, ends, trigrams):
        print(trigram)


if __name__ == "__main__":
    main()

## filter.py
#!/usr/bin/env python3

import sys


# Accepted three-character prefixes of the second (tab-separated) field;
# presumably years 1980-2019 -- confirm against the input format.
YEARS = ('198', '199', '200', '201')


def _extract_record(line, years=YEARS):
    """Return ``(word, count)`` for an accepted *line*, else ``None``.

    A line is accepted when all of the following hold:

    * its first three characters are alphabetic and are immediately
      followed by a tab or an underscore;
    * it has at least three tab-separated fields;
    * the first three characters of the second field are in *years*.

    ``word`` is the three-character prefix and ``count`` is the third
    field, without any line terminator.
    """
    # Length check first: short lines must be skipped, not raise IndexError.
    if len(line) < 4 or line[3] not in '\t_':
        return None
    word = line[:3]
    if not word.isalpha():
        return None
    # Drop the line ending before splitting so that a line with exactly
    # three fields does not carry its newline into the count.
    pieces = line.rstrip('\r\n').split('\t')
    if len(pieces) < 3 or pieces[1][:3] not in years:
        return None
    return word, pieces[2]


def main(stdin=None, stdout=None):
    """Copy ``word<TAB>count`` for every accepted input line.

    Lines are read from *stdin* (default ``sys.stdin``) and results are
    written to *stdout* (default ``sys.stdout``).  Lines that do not
    match are silently skipped.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for line in source:
        record = _extract_record(line)
        if record is not None:
            sink.write('%s\t%s\n' % record)


if __name__ == "__main__":
    main()

## merge.py
#!/usr/bin/env python3

import sys


LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
# Column of the table holding the word for LANGS[0]; the following
# languages use the following columns.
FIRST_LANG_COLUMN = 14
# Number of leading lines of the table that are skipped.
HEADER_LINES = 2


def _read_stripped_lines(path):
    """Return the lines of *path* with surrounding whitespace removed."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def main(argv=None):
    """Write, per language, the table words found in that language's list.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is a comma-separated
    table whose first ``HEADER_LINES`` lines are ignored.  For every
    language in ``LANGS`` the word list ``<lang>.csv`` is read from the
    current directory, and each table word from that language's column
    that appears in the list is written to ``<lang>_result.csv``.  The
    language name and ``matches / list size = ratio`` are then printed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("%s table.csv" % argv[0])
        sys.exit(1)

    rows = [line.split(',')
            for line in _read_stripped_lines(argv[1])[HEADER_LINES:]]

    for index, lang in enumerate(LANGS):
        words = _read_stripped_lines("%s.csv" % lang)
        # Set membership is O(1); the list is kept because the printed
        # denominator is its length, duplicates included.
        known = set(words)
        column = index + FIRST_LANG_COLUMN
        num_words = 0
        with open("%s_result.csv" % lang, "w") as f:
            for row in rows:
                if len(row) <= column:
                    # Blank or truncated row: no cell for this language.
                    continue
                word = row[column].strip()
                if word in known:
                    num_words += 1
                    f.write(word + "\n")

        # An empty word list would otherwise raise ZeroDivisionError.
        ratio = num_words / len(words) if words else 0.0
        print(lang)
        print("%d / %d = %7.2lf\n" % (num_words, len(words), ratio))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python

# Second copy of count.py.  The original lines had lost their block
# indentation (every line sat at one tab), so the structure is rebuilt
# here as regular top-level code.

import math
import sys

from collections import defaultdict


def _load_counts(path):
    """Return a mapping from lower-cased word to its summed count.

    Every non-blank line of *path* is split on tabs; the first field is
    the word and the second is parsed as an integer count.  Counts of
    words that differ only in case are added together.
    """
    totals = defaultdict(int)
    # The context manager closes the file even if a line fails to parse.
    with open(path) as wordlist:
        for raw in wordlist:
            stripped = raw.strip()
            if not stripped:
                # A blank line carries no word/count pair.
                continue
            fields = stripped.split('\t')
            totals[fields[0].lower()] += int(fields[1])
    return totals


def main(argv=None):
    """Print ``log10(count / total_count), count, word`` for every word.

    *argv* defaults to ``sys.argv`` and must have the form
    ``[program, wordlist, total_count]``; otherwise a usage line is
    printed and the process exits with status 1.  Rows are printed in
    descending ``(count, word)`` order.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        # A single parenthesised argument is valid both as the Python 2
        # print statement and as the Python 3 print function.
        print("%s wordlist total_count" % argv[0])
        sys.exit(1)

    total_count = int(argv[2])
    words = _load_counts(argv[1])

    ranked = sorted(((count, word) for word, count in words.items()),
                    reverse=True)
    for count, word in ranked:
        # "* 1.0" forces float division on Python 2 as well.
        print("%6.2lf, %10d, %s" % (
            math.log(count * 1.0 / total_count, 10),
            count,
            word))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

# Second copy of extract.py.  The original lines had lost their block
# indentation (every line sat at one tab), so the structure is rebuilt
# here as regular top-level code.

import sys

from collections import defaultdict


MIN_OCCURRENCES = 3


def _count_ngrams(words):
    """Count leading bigrams, trailing bigrams and trigrams of *words*.

    Each word is lower-cased first, and an n-gram is counted only when
    every one of its characters is alphabetic.  Returns the tuple
    ``(starts, ends, trigrams)`` of ``defaultdict(int)`` mappings from
    n-gram to number of occurrences.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)
    for word in words:
        word = word.lower()
        head, tail = word[:2], word[-2:]
        # str.isalpha() on a two-character slice is true only when both
        # characters are alphabetic.
        if len(head) == 2 and head.isalpha():
            starts[head] += 1
        if len(tail) == 2 and tail.isalpha():
            ends[tail] += 1
        for i in range(len(word) - 2):
            trigram = word[i:i + 3]
            if trigram.isalpha():
                trigrams[trigram] += 1
    return starts, ends, trigrams


def _select_trigrams(starts, ends, trigrams, min_occurrences=MIN_OCCURRENCES):
    """Return the trigrams that pass all three frequency thresholds.

    A trigram is kept when its own count, the count of its first two
    letters as a word start, and the count of its last two letters as a
    word end are all at least *min_occurrences*.  The result is ordered
    by descending ``(count, trigram)``.
    """
    kept = [
        (count, trigram)
        for trigram, count in trigrams.items()
        if count >= min_occurrences
        # .get() avoids inserting zero entries into the defaultdicts.
        and starts.get(trigram[:2], 0) >= min_occurrences
        and ends.get(trigram[-2:], 0) >= min_occurrences
    ]
    kept.sort(reverse=True)
    return [trigram for _, trigram in kept]


def main(argv=None):
    """Print the selected trigrams of the text file named in *argv*.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the path of a text
    file that is split on whitespace.  Only tokens of at least three
    characters are analysed.  Without a path a usage line is printed and
    the process exits with status 1.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("%s textfile" % argv[0])
        sys.exit(1)

    # The context manager closes the input file once it has been read.
    with open(argv[1]) as text:
        words = [token for token in text.read().split() if len(token) >= 3]

    starts, ends, trigrams = _count_ngrams(words)
    for trigram in _select_trigrams(starts, ends, trigrams):
        print(trigram)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

# Second copy of merge.py.  The original lines had lost their block
# indentation (every line sat at one tab), so the structure is rebuilt
# here as regular top-level code.

import sys


LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
# Column of the table holding the word for LANGS[0]; the following
# languages use the following columns.
FIRST_LANG_COLUMN = 14
# Number of leading lines of the table that are skipped.
HEADER_LINES = 2


def _read_stripped_lines(path):
    """Return the lines of *path* with surrounding whitespace removed."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def main(argv=None):
    """Write, per language, the table words found in that language's list.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is a comma-separated
    table whose first ``HEADER_LINES`` lines are ignored.  For every
    language in ``LANGS`` the word list ``<lang>.csv`` is read from the
    current directory, and each table word from that language's column
    that appears in the list is written to ``<lang>_result.csv``.  The
    language name and ``matches / list size = ratio`` are then printed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("%s table.csv" % argv[0])
        sys.exit(1)

    rows = [line.split(',')
            for line in _read_stripped_lines(argv[1])[HEADER_LINES:]]

    for index, lang in enumerate(LANGS):
        words = _read_stripped_lines("%s.csv" % lang)
        # Set membership is O(1); the list is kept because the printed
        # denominator is its length, duplicates included.
        known = set(words)
        column = index + FIRST_LANG_COLUMN
        num_words = 0
        with open("%s_result.csv" % lang, "w") as f:
            for row in rows:
                if len(row) <= column:
                    # Blank or truncated row: no cell for this language.
                    continue
                word = row[column].strip()
                if word in known:
                    num_words += 1
                    f.write(word + "\n")

        # An empty word list would otherwise raise ZeroDivisionError.
        ratio = num_words / len(words) if words else 0.0
        print(lang)
        print("%d / %d = %7.2lf\n" % (num_words, len(words), ratio))


if __name__ == "__main__":
    main()