# stefano-maggiolo/count.py

Last active August 29, 2015 14:03
 #!/usr/bin/env python
# Aggregate a tab-separated word list and print, for every word, the log10 of
# its relative frequency, its total count and the word itself.
#
# Usage: count.py wordlist total_count
#
# NOTE(review): the integer subscripts (sys.argv[...], line[...]) were missing
# from the recovered source; they are restored here from the usage message
# ("wordlist total_count") and the "word<TAB>count" unpacking -- confirm.
import math
import sys
from collections import defaultdict


def aggregate_counts(lines):
    """Sum the counts of each lower-cased word.

    Args:
        lines: iterable of ``word<TAB>count`` strings (trailing newline
            allowed). Blank lines are skipped.

    Returns:
        A ``defaultdict(int)`` mapping lower-cased word -> summed count.

    Raises:
        IndexError: if a non-blank line has no tab-separated count field.
        ValueError: if the count field is not an integer.
    """
    totals = defaultdict(int)
    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            # A trailing empty line must not abort the whole run.
            continue
        fields = stripped.split('\t')
        totals[fields[0].lower()] += int(fields[1])
    return totals


def format_report(totals, total_count):
    """Build the report lines, highest count first.

    Args:
        totals: mapping word -> count, as returned by ``aggregate_counts``.
        total_count: denominator used for the relative frequency.

    Returns:
        A list of strings ``"<log10(count / total_count)>, <count>, <word>"``
        sorted by (count, word) in descending order.
    """
    ranked = sorted(((count, word) for word, count in totals.items()),
                    reverse=True)
    return [
        # float() keeps the division a true division on Python 2 as well.
        "%6.2f, %10d, %s" % (
            math.log10(float(count) / total_count), count, word)
        for count, word in ranked
    ]


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("%s wordlist total_count" % sys.argv[0])
        sys.exit(1)
    total_count = int(sys.argv[2])
    with open(sys.argv[1]) as wordlist:
        word_totals = aggregate_counts(wordlist)
    for report_line in format_report(word_totals, total_count):
        print(report_line)
 #!/usr/bin/env python3
# Print the letter trigrams of a text file that occur at least
# MIN_OCCURRENCES times and whose first two / last two letters are also
# frequent as a word start / word end, most frequent trigram first.
#
# Usage: <script> textfile
#
# NOTE(review): the integer subscripts (sys.argv[...], word[...], x[...])
# were missing from the recovered source; they are restored here by analogy
# with the surviving ``word[-2].isalpha() and word[-1].isalpha()`` check and
# the ``for score, trigram in ...`` unpacking -- confirm.
import sys
from collections import defaultdict

MIN_OCCURRENCES = 3


def count_ngrams(words):
    """Count word-initial bigrams, word-final bigrams and inner trigrams.

    Words shorter than three characters are ignored; the rest are
    lower-cased. An n-gram is counted only if all its characters are
    alphabetic.

    Args:
        words: iterable of whitespace-separated tokens.

    Returns:
        A ``(starts, ends, trigrams)`` tuple of ``defaultdict(int)``
        mapping n-gram -> number of occurrences.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)
    for raw in words:
        if len(raw) < 3:
            continue
        word = raw.lower()
        if word[:2].isalpha():
            starts[word[:2]] += 1
        if word[-2:].isalpha():
            ends[word[-2:]] += 1
        for i in range(len(word) - 2):
            gram = word[i:i + 3]
            if gram.isalpha():
                trigrams[gram] += 1
    return starts, ends, trigrams


def rank_trigrams(starts, ends, trigrams, min_occurrences=MIN_OCCURRENCES):
    """Select the trigrams whose three counts all reach the threshold.

    Args:
        starts: mapping bigram -> count as a word start.
        ends: mapping bigram -> count as a word end.
        trigrams: mapping trigram -> count.
        min_occurrences: minimum value required of the trigram count, of the
            start count of its first two letters and of the end count of its
            last two letters.

    Returns:
        A list of ``(score, trigram, score, start_count, end_count)`` tuples
        sorted in descending order.
    """
    ranked = []
    for trigram, score in trigrams.items():
        # .get() avoids inserting zero entries into the defaultdicts.
        start_count = starts.get(trigram[:2], 0)
        end_count = ends.get(trigram[-2:], 0)
        if min(score, start_count, end_count) >= min_occurrences:
            ranked.append((score, trigram, score, start_count, end_count))
    ranked.sort(reverse=True)
    return ranked


if __name__ == "__main__":
    with open(sys.argv[1]) as text:
        tokens = text.read().split()
    for row in rank_trigrams(*count_ngrams(tokens)):
        print(row)
 #!/usr/bin/env python3
# Filter tab-separated n-gram records read from stdin: keep the records whose
# first field is a three-letter alphabetic word (optionally followed by an
# "_SUFFIX") and whose second field starts with one of YEARS, and write
# "word<TAB>count" lines to stdout.
#
# NOTE(review): the subscripts on ``pieces`` were missing from the recovered
# source. ``pieces[1]`` (year) and ``pieces[2]`` (count) are assumed here,
# which matches a "ngram<TAB>year<TAB>match_count<TAB>..." layout -- confirm
# against the actual input files.
import sys

YEARS = ('198', '199', '200', '201')


def filter_ngram_lines(lines):
    """Yield ``"word\\tcount\\n"`` for every record that passes the filter.

    Args:
        lines: iterable of tab-separated records, with or without a trailing
            newline. Blank lines and records with fewer than three fields
            are skipped instead of raising.

    Yields:
        One output line per accepted record, in input order.
    """
    for line in lines:
        pieces = line.rstrip('\r\n').split('\t')
        if len(pieces) < 3:
            continue
        # Everything before the first underscore is the bare word.
        word = pieces[0].split('_', 1)[0]
        if len(word) != 3 or not word.isalpha():
            continue
        if pieces[1][:3] not in YEARS:
            continue
        yield '%s\t%s\n' % (word, pieces[2])


if __name__ == "__main__":
    sys.stdout.writelines(filter_ngram_lines(sys.stdin))
 #!/usr/bin/env python3
# For every language in LANGS, take that language's column of the
# comma-separated table given on the command line, keep the cells that appear
# in "<lang>.csv", write them to "<lang>_result.csv" and print how many
# matched relative to the size of the word list.
#
# Usage: <script> table.csv
#
# NOTE(review): the subscript on ``sys.argv`` was missing from the recovered
# source and is restored as ``sys.argv[1]``; the placement of the two print
# calls (once per language, after the result file is written) is inferred
# because the original indentation was lost -- confirm.
import sys

LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
FIRST_LANG_COLUMN = 14  # column of LANGS[0]; later languages follow it
HEADER_LINES = 2        # leading lines of the table that are skipped


def match_words(rows, column, known_words):
    """Return the stripped cells of ``column`` that occur in ``known_words``.

    Args:
        rows: iterable of lists of cell strings.
        column: zero-based index of the cell to inspect in each row.
        known_words: iterable of accepted words.

    Returns:
        The matching words in row order (duplicates kept). Rows too short
        to have ``column`` are skipped.
    """
    known = set(known_words)  # O(1) membership instead of a list scan per row
    matches = []
    for row in rows:
        if len(row) <= column:
            continue
        word = row[column].strip()
        if word in known:
            matches.append(word)
    return matches


def format_summary(num_matches, num_known):
    """Format ``"<matches> / <known> = <ratio>\\n"``; ratio is 0 if no words."""
    ratio = num_matches / num_known if num_known else 0.0
    return "%d / %d = %7.2f\n" % (num_matches, num_known, ratio)


if __name__ == "__main__":
    with open(sys.argv[1]) as table:
        rows = [raw.strip().split(',')
                for raw in table.readlines()[HEADER_LINES:]]
    for offset, lang in enumerate(LANGS):
        with open("%s.csv" % lang) as wordlist:
            known_words = [raw.strip() for raw in wordlist]
        matches = match_words(rows, FIRST_LANG_COLUMN + offset, known_words)
        with open("%s_result.csv" % lang, "w") as result:
            result.writelines(word + "\n" for word in matches)
        print(lang)
        print(format_summary(len(matches), len(known_words)))