stefano-maggiolo/count.py

## 42 changes: 42 additions & 0 deletions extract.py
@@ -0,0 +1,42 @@

    #!/usr/bin/env python3
#!/usr/bin/env python3
"""Print the frequent letter trigrams of a text file, most frequent first.

Usage: extract.py TEXT_FILE

A trigram is printed when it occurs at least MIN_OCCURRENCES times, its
first two letters start at least MIN_OCCURRENCES words, and its last two
letters end at least MIN_OCCURRENCES words.
"""

import sys
from collections import defaultdict

MIN_OCCURRENCES = 3
# Tokens shorter than this cannot contain a trigram and are dropped on read.
MIN_TOKEN_LENGTH = 3


def read_tokens(path):
    """Return the whitespace-separated tokens of *path* that are long enough.

    The file is closed as soon as it has been read.
    """
    with open(path) as handle:
        return [token for token in handle.read().split()
                if len(token) >= MIN_TOKEN_LENGTH]


def extract_trigrams(tokens, min_occurrences=MIN_OCCURRENCES):
    """Return the selected trigrams of *tokens*, most frequent first.

    Tokens are lower-cased before counting.  Only runs made entirely of
    alphabetic characters are counted.  Ties on the occurrence count are
    broken by descending trigram, matching a reverse sort of
    ``(count, trigram)`` pairs.

    Args:
        tokens: iterable of strings.
        min_occurrences: threshold applied to the trigram count, to the
            count of words starting with its first two letters and to the
            count of words ending with its last two letters.

    Returns:
        A list of three-letter strings.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)

    for token in tokens:
        word = token.lower()
        # Guard kept so that callers may pass tokens of any length.
        if len(word) >= 2:
            if word[:2].isalpha():
                starts[word[:2]] += 1
            if word[-2:].isalpha():
                ends[word[-2:]] += 1
        for i in range(len(word) - 2):
            piece = word[i:i + 3]
            if piece.isalpha():
                trigrams[piece] += 1

    # .get() avoids inserting empty entries into the defaultdicts.
    selected = [
        (count, trigram)
        for trigram, count in trigrams.items()
        if count >= min_occurrences
        and starts.get(trigram[:2], 0) >= min_occurrences
        and ends.get(trigram[-2:], 0) >= min_occurrences
    ]
    selected.sort(reverse=True)
    return [trigram for _, trigram in selected]


def main(argv=None):
    """Read the file named by the first argument and print its trigrams."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("%s text_file" % argv[0])
        sys.exit(1)
    for trigram in extract_trigrams(read_tokens(argv[1])):
        print(trigram)


if __name__ == "__main__":
    main()

## 23 changes: 23 additions & 0 deletions merge.py
@@ -0,0 +1,23 @@

    #!/usr/bin/env python3
#!/usr/bin/env python3
"""Intersect the per-language columns of a CSV table with word lists.

Usage: merge.py TABLE_CSV

For every language in LANGS the script reads ``<lang>.csv`` (one word per
line), writes the table words that also appear in that list to
``<lang>_result.csv`` and prints how many were found.
"""

import sys

LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
# Number of leading lines of the table that are ignored.
# NOTE(review): presumably header rows -- confirm against the input file.
SKIPPED_LINES = 2
# Column of the table holding the first language of LANGS; the following
# languages are in the following columns.
FIRST_LANG_COLUMN = 14


def read_rows(path):
    """Return the rows of *path* after SKIPPED_LINES, split on commas.

    The split is a plain ``str.split(',')``; quoted fields are not handled.
    """
    with open(path) as handle:
        return [line.strip().split(',')
                for line in handle.readlines()[SKIPPED_LINES:]]


def read_words(path):
    """Return the stripped lines of *path* as a list (duplicates kept)."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def matching_words(rows, column, vocabulary):
    """Return the stripped cells of *column* that belong to *vocabulary*.

    Cells are returned in row order and repeated as often as they occur.
    Rows that have no such column (blank or truncated lines) are skipped
    instead of raising IndexError.
    """
    known = set(vocabulary)  # O(1) membership instead of a list scan per row
    matches = []
    for row in rows:
        if column >= len(row):
            continue
        word = row[column].strip()
        if word in known:
            matches.append(word)
    return matches


def main(argv=None):
    """Write one result file per language and print the match ratios."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("%s table_csv" % argv[0])
        sys.exit(1)

    rows = read_rows(argv[1])
    for index, lang in enumerate(LANGS):
        words = read_words("%s.csv" % lang)
        matches = matching_words(rows, index + FIRST_LANG_COLUMN, words)
        with open("%s_result.csv" % lang, "w") as f:
            f.writelines(word + "\n" for word in matches)

        # An empty word list would otherwise divide by zero.
        ratio = len(matches) / len(words) if words else 0.0
        print(lang)
        print("%d / %d = %7.2lf\n" % (len(matches), len(words), ratio))


if __name__ == "__main__":
    main()

## 30 changes: 30 additions & 0 deletions count.py
@@ -0,0 +1,30 @@

    #!/usr/bin/env python
#!/usr/bin/env python3
"""Sum the counts of a word list and print log10 relative frequencies.

Usage: count.py WORDLIST TOTAL_COUNT

WORDLIST holds one ``word<TAB>count`` entry per line.  Entries whose words
are equal after lower-casing are added together.
"""

import math
import sys

from collections import defaultdict


def aggregate_counts(lines):
    """Return a dict mapping each lower-cased word to its summed count.

    Args:
        lines: iterable of ``word<TAB>count`` strings; surrounding
            whitespace is stripped and blank lines are skipped.

    Raises:
        IndexError: if a non-blank line has no tab-separated count.
        ValueError: if a count is not an integer.
    """
    totals = defaultdict(int)
    for line in lines:
        fields = line.strip().split('\t')
        if fields == ['']:
            # Blank line: nothing to count.
            continue
        totals[fields[0].lower()] += int(fields[1])
    return dict(totals)


def format_report(totals, total_count):
    """Return the report lines for *totals*, highest count first.

    Each line is ``log10(count / total_count), count, word``.  Ties on the
    count are ordered by descending word.

    Raises:
        ValueError: from ``math.log`` if a count is not positive.
        ZeroDivisionError: if *total_count* is zero.
    """
    ranked = sorted(((count, word) for word, count in totals.items()),
                    reverse=True)
    return ["%6.2lf, %10d, %s" % (
                math.log(count * 1.0 / total_count, 10),
                count,
                word)
            for count, word in ranked]


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("%s wordlist total_count" % sys.argv[0])
        sys.exit(1)

    total_count = int(sys.argv[2])
    with open(sys.argv[1]) as wordlist:
        words = aggregate_counts(wordlist)
    for row in format_report(words, total_count):
        print(row)

## 21 changes: 21 additions & 0 deletions filter.py
@@ -0,0 +1,21 @@

    #!/usr/bin/env python3
#!/usr/bin/env python3
"""Keep the three-letter words of a tab-separated stream read from stdin.

A line is kept when its first three characters are alphabetic and are
immediately followed by a tab or an underscore, and when its second
tab-separated field starts with one of the prefixes in YEARS.  For each
kept line ``word<TAB>third field`` is written to stdout.

NOTE(review): the input presumably has the columns
``ngram<TAB>year<TAB>count[<TAB>...]`` -- confirm against the data source.
"""

import sys

# Accepted three-character prefixes of the second field.
YEARS = ['198', '199', '200', '201']
WORD_LENGTH = 3


def filter_line(line):
    """Return ``(word, count)`` for a line that is kept, else ``None``.

    Lines that are too short to hold a word and a delimiter, or that have
    fewer than three tab-separated fields, are rejected instead of raising
    IndexError.  A trailing newline is removed from the count so that a
    three-column line does not produce an empty output line.
    """
    if len(line) <= WORD_LENGTH or line[WORD_LENGTH] not in ('\t', '_'):
        return None
    word = line[:WORD_LENGTH]
    # isalpha() also rules out a tab or underscore inside the first three
    # characters, i.e. a word shorter than WORD_LENGTH.
    if not word.isalpha():
        return None
    pieces = line.split('\t')
    if len(pieces) < 3 or pieces[1][:3] not in YEARS:
        return None
    return word, pieces[2].rstrip('\n')


def main():
    """Filter stdin line by line and write the kept entries to stdout."""
    for line in sys.stdin:
        record = filter_line(line)
        if record is not None:
            sys.stdout.write('%s\t%s\n' % record)


if __name__ == "__main__":
    main()