Skip to content

Instantly share code, notes, and snippets.

@stefano-maggiolo
Last active August 29, 2015 14:03

Revisions

  1. stefano-maggiolo revised this gist Aug 25, 2014. 2 changed files with 65 additions and 0 deletions.
    42 changes: 42 additions & 0 deletions extract.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    #!/usr/bin/env python3

    import sys

    from collections import defaultdict


    # Minimum number of occurrences a trigram, its leading bigram (as a word
    # start) and its trailing bigram (as a word end) must each reach to be kept.
    MIN_OCCURRENCES = 3


    def count_ngrams(words):
        """Count word-start bigrams, word-end bigrams and trigrams.

        Tokens shorter than three characters are ignored; the remaining ones
        are lower-cased before counting.  An n-gram is counted only when every
        one of its characters is alphabetic.

        Args:
            words: iterable of whitespace-separated tokens.

        Returns:
            A ``(starts, ends, trigrams)`` tuple of dicts mapping, in order,
            the first two letters of a word, the last two letters of a word,
            and every three-letter window of a word to its occurrence count.
        """
        starts = defaultdict(int)
        ends = defaultdict(int)
        trigrams = defaultdict(int)
        for token in words:
            if len(token) < 3:
                continue
            word = token.lower()
            head, tail = word[:2], word[-2:]
            if head.isalpha():
                starts[head] += 1
            if tail.isalpha():
                ends[tail] += 1
            for i in range(len(word) - 2):
                trigram = word[i:i + 3]
                if trigram.isalpha():
                    trigrams[trigram] += 1
        return starts, ends, trigrams


    def select_trigrams(starts, ends, trigrams,
                        min_occurrences=MIN_OCCURRENCES):
        """Return the trigrams that are frequent and have frequent edges.

        A trigram is kept when it occurs at least *min_occurrences* times, its
        first two letters start at least that many words, and its last two
        letters end at least that many words.

        Args:
            starts: word-start bigram counts.
            ends: word-end bigram counts.
            trigrams: trigram counts.
            min_occurrences: threshold applied to all three counts.

        Returns:
            The kept trigrams, most frequent first; ties are broken by
            descending alphabetical order of the trigram.
        """
        # .get() avoids inserting zero-count keys into a defaultdict argument.
        kept = [
            (count, trigram)
            for trigram, count in trigrams.items()
            if count >= min_occurrences
            and starts.get(trigram[:2], 0) >= min_occurrences
            and ends.get(trigram[-2:], 0) >= min_occurrences
        ]
        kept.sort(reverse=True)
        return [trigram for _, trigram in kept]


    def main(argv):
        """Print the selected trigrams of the text file named by argv[1]."""
        if len(argv) != 2:
            print("%s textfile" % argv[0])
            return 1
        # Context manager so the input file is closed deterministically.
        with open(argv[1]) as source:
            words = source.read().split()
        for trigram in select_trigrams(*count_ngrams(words)):
            print(trigram)
        return 0


    if __name__ == "__main__":
        sys.exit(main(sys.argv))
    23 changes: 23 additions & 0 deletions merge.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,23 @@
    #!/usr/bin/env python3

    import sys


    langs = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']

    # Column of the input CSV that holds the word for langs[0]; the word for
    # langs[i] is read from column FIRST_LANG_COLUMN + i.
    FIRST_LANG_COLUMN = 14

    # Number of leading lines of the input CSV that are skipped
    # (presumably header rows -- TODO confirm against the data file).
    SKIPPED_LINES = 2


    def read_stripped_lines(path):
        """Return the lines of *path* with surrounding whitespace removed."""
        with open(path) as f:
            return [line.strip() for line in f]


    def match_words(rows, column, words):
        """Return the entries of one CSV column that appear in *words*.

        Args:
            rows: list of rows, each a list of string fields.
            column: index of the field to look at in every row.
            words: collection of accepted words.

        Returns:
            The stripped field values found in *words*, in row order (with
            repetitions).  Rows too short to have the column are skipped.
        """
        # Set lookup keeps the scan linear in the number of rows.
        known = set(words)
        matches = []
        for row in rows:
            if len(row) <= column:
                continue
            word = row[column].strip()
            if word in known:
                matches.append(word)
        return matches


    def main(argv):
        """Write, for each language, the CSV words present in its word list.

        Reads the CSV named by argv[1] and, for every language in ``langs``,
        the word list ``<lang>.csv``; writes the matches to
        ``<lang>_result.csv`` and prints the match ratio.
        """
        if len(argv) != 2:
            print("%s csvfile" % argv[0])
            return 1
        rows = [line.split(',')
                for line in read_stripped_lines(argv[1])[SKIPPED_LINES:]]
        for index, lang in enumerate(langs):
            words = read_stripped_lines("%s.csv" % lang)
            matches = match_words(rows, FIRST_LANG_COLUMN + index, words)
            with open("%s_result.csv" % lang, "w") as f:
                f.writelines(word + "\n" for word in matches)

            # An empty word list would otherwise raise ZeroDivisionError.
            ratio = len(matches) / len(words) if words else 0.0
            print(lang)
            print("%d / %d = %7.2lf\n" % (len(matches),
                                          len(words),
                                          ratio))
        return 0


    if __name__ == "__main__":
        sys.exit(main(sys.argv))
  2. stefano-maggiolo created this gist Jul 5, 2014.
    30 changes: 30 additions & 0 deletions count.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,30 @@
    #!/usr/bin/env python

    import math
    import sys

    from collections import defaultdict


    def count_words(lines):
        """Sum the counts of words that are equal up to letter case.

        Args:
            lines: iterable of ``word<TAB>count`` lines; fields after the
                second are ignored and blank lines are skipped.

        Returns:
            A dict mapping each lower-cased word to the sum of its counts.

        Raises:
            IndexError: a non-blank line has no second field.
            ValueError: the second field is not an integer.
        """
        words = defaultdict(int)
        for line in lines:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            words[fields[0].lower()] += int(fields[1])
        return words


    def format_rows(words, total_count):
        """Format one output row per word, most frequent first.

        Args:
            words: dict mapping word to count.
            total_count: denominator used to turn a count into a frequency.

        Returns:
            A list of ``"log10(count / total_count), count, word"`` strings
            sorted by descending count, ties broken by descending word.
        """
        ranked = sorted(((count, word) for word, count in words.items()),
                        reverse=True)
        # "* 1.0" forces float division when run under Python 2 as well.
        return ["%6.2lf, %10d, %s" % (
                    math.log(count * 1.0 / total_count, 10),
                    count,
                    word)
                for count, word in ranked]


    def main(argv):
        """Print the log-frequency table for the wordlist named by argv[1]."""
        if len(argv) != 3:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("%s wordlist total_count" % argv[0])
            return 1

        total_count = int(argv[2])
        with open(argv[1]) as wordlist:
            words = count_words(wordlist)
        for row in format_rows(words, total_count):
            print(row)
        return 0


    if __name__ == "__main__":
        sys.exit(main(sys.argv))
    21 changes: 21 additions & 0 deletions filter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,21 @@
    #!/usr/bin/env python3

    import sys


    # Accepted three-character prefixes of the year field.
    YEARS = ('198', '199', '200', '201')


    def extract_entry(line, word_length=3):
        """Return ``(word, count)`` for an accepted line, else ``None``.

        A line is a sequence of tab-separated fields: a token, a year and a
        count, optionally followed by more fields (presumably Google Books
        n-gram records -- TODO confirm).  It is accepted when:

        * the token's first *word_length* characters are all alphabetic and
          are immediately followed by a tab or an underscore;
        * the first three characters of the year field are in ``YEARS``.

        Args:
            line: one input line, with or without its trailing newline.
            word_length: exact length of the words to keep.

        Returns:
            The word (the token up to the tab/underscore) and the count field
            as strings, or ``None`` when the line is rejected, including
            blank, short or otherwise malformed lines.
        """
        # Drop the line terminator so that a count in the last field is clean.
        line = line.rstrip('\r\n')
        # Slicing (rather than indexing) is safe on lines that are too short;
        # the tuple test is False for the resulting empty string.
        if line[word_length:word_length + 1] not in ('\t', '_'):
            return None
        word = line[:word_length]
        if not word.isalpha():
            return None
        pieces = line.split('\t')
        if len(pieces) < 3 or pieces[1][:3] not in YEARS:
            return None
        return word, pieces[2]


    def main():
        """Filter stdin, writing ``word<TAB>count`` lines to stdout."""
        for line in sys.stdin:
            entry = extract_entry(line)
            if entry is not None:
                sys.stdout.write('%s\t%s\n' % entry)


    if __name__ == "__main__":
        main()