#!/usr/bin/env python
"""Print log10 relative frequencies for a tab-separated word list.

Usage: <script> wordlist total_count

Each non-blank line of ``wordlist`` is ``word<TAB>count``; any further
columns are ignored.  Words are lower-cased and their counts summed, then
one line per word is printed, highest count first, in the form
``log10(count / total_count), count, word``.
"""
import math
import sys
from collections import defaultdict


def load_counts(path):
    """Return a mapping of lower-cased word -> summed count read from *path*.

    Blank lines are skipped.  A non-blank line without a second column
    raises IndexError; a non-integer count raises ValueError.
    """
    words = defaultdict(int)
    with open(path) as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                # A trailing empty line is common and carries no data.
                continue
            fields = stripped.split('\t')
            words[fields[0].lower()] += int(fields[1])
    return words


def format_rows(words, total_count):
    """Return the report lines for *words*, highest count first.

    Entries are ordered by descending ``(count, word)``, so equal counts
    are listed in reverse alphabetical order.  ``math.log`` raises
    ValueError for a non-positive count, and a zero *total_count* raises
    ZeroDivisionError.
    """
    ranked = sorted(((count, word) for word, count in words.items()),
                    reverse=True)
    return ["%6.2lf, %10d, %s" % (
                # float() keeps the division exact under Python 2 as well.
                math.log(float(count) / total_count, 10),
                count,
                word)
            for count, word in ranked]


def main(argv):
    """Run the report for ``argv``; return the process exit status."""
    if len(argv) != 3:
        print("%s wordlist total_count" % argv[0])
        return 1
    total_count = int(argv[2])
    # Single-argument print(...) behaves the same on Python 2 and 3.
    for row in format_rows(load_counts(argv[1]), total_count):
        print(row)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env python3
"""List frequent letter trigrams found in a text file.

Usage: <script> textfile

The file is split on whitespace; tokens shorter than three characters are
ignored and the rest are lower-cased.  A trigram is printed when it occurs
at least MIN_OCCURRENCES times, its first two letters start at least
MIN_OCCURRENCES tokens, and its last two letters end at least
MIN_OCCURRENCES tokens.  Output is one trigram per line, most frequent
first.
"""
import sys
from collections import defaultdict

MIN_OCCURRENCES = 3


def rank_trigrams(tokens, min_occurrences=MIN_OCCURRENCES):
    """Return the qualifying trigrams of *tokens*, most frequent first.

    Only all-alphabetic bigrams/trigrams are counted.  Results are ordered
    by descending ``(count, trigram)``, so ties appear in reverse
    alphabetical order.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)
    for token in tokens:
        if len(token) < 3:
            continue
        word = token.lower()
        if word[:2].isalpha():
            starts[word[:2]] += 1
        if word[-2:].isalpha():
            ends[word[-2:]] += 1
        for i in range(len(word) - 2):
            chunk = word[i:i + 3]
            if chunk.isalpha():
                trigrams[chunk] += 1

    # .get() avoids inserting empty entries while probing the counters.
    kept = [(count, trigram)
            for trigram, count in trigrams.items()
            if count >= min_occurrences
            and starts.get(trigram[:2], 0) >= min_occurrences
            and ends.get(trigram[-2:], 0) >= min_occurrences]
    kept.sort(reverse=True)
    return [trigram for _, trigram in kept]


def main(argv):
    """Read the file named in ``argv[1]`` and print its ranked trigrams."""
    if len(argv) < 2:
        sys.stderr.write("%s textfile\n" % argv[0])
        return 1
    with open(argv[1]) as handle:
        tokens = handle.read().split()
    for trigram in rank_trigrams(tokens):
        print(trigram)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env python3
"""Filter tab-separated records on stdin down to three-letter words.

A record is kept when its first field is exactly three alphabetic
characters (optionally followed by ``_`` and a suffix), its second field
starts with one of the prefixes in YEARS, and a third field is present.
For each kept record ``word<TAB>third_field`` is written to stdout.

NOTE(review): the layout looks like an n-gram count export
(token, year, count, ...) -- confirm against the actual data source.
"""
import sys

# Accepted three-character prefixes of the second (year) field.
YEARS = ('198', '199', '200', '201')


def extract_trigram_count(line):
    """Return ``(word, count)`` for a matching *line*, else ``None``.

    Lines that are too short or have fewer than three tab-separated
    fields are rejected instead of raising IndexError.
    """
    # Drop the line terminator so a count in the last column stays clean.
    line = line.rstrip('\r\n')
    # The word must end exactly at index 3, on a tab or an underscore.
    if line[3:4] not in ('\t', '_'):
        return None
    word = line[:3]
    if not word.isalpha():
        return None
    pieces = line.split('\t')
    if len(pieces) < 3:
        return None
    if pieces[1][:3] not in YEARS:
        return None
    return word, pieces[2]


def main():
    """Stream stdin through the filter, writing matches to stdout."""
    for line in sys.stdin:
        match = extract_trigram_count(line)
        if match is not None:
            sys.stdout.write('%s\t%s\n' % match)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Check per-language columns of a CSV against per-language word lists.

Usage: <script> csvfile

The first two lines of ``csvfile`` are skipped and the rest split on
commas.  For the i-th language in LANGS, column ``FIRST_LANG_COLUMN + i``
is compared with the lines of ``<lang>.csv`` in the current directory.
Cells found in that list are written to ``<lang>_result.csv``, and the
match count, list size and their ratio are printed.
"""
import sys

LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
# Column of the first language in the input CSV; later ones follow in order.
FIRST_LANG_COLUMN = 14
# Number of leading lines of the input CSV that are not data rows.
HEADER_LINES = 2


def read_stripped_lines(path):
    """Return the lines of *path* with surrounding whitespace removed."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def select_known_words(rows, column, vocabulary):
    """Return the stripped cells in *column* of *rows* found in *vocabulary*.

    Row order and repeats are preserved.  Rows too short to contain
    *column* are skipped instead of raising IndexError.
    """
    # Set lookup keeps the scan linear in the number of rows.
    known = set(vocabulary)
    cells = (row[column].strip() for row in rows if len(row) > column)
    return [cell for cell in cells if cell in known]


def main(argv):
    """Process every language in LANGS; return the process exit status."""
    if len(argv) < 2:
        sys.stderr.write("%s csvfile\n" % argv[0])
        return 1
    rows = [line.split(',')
            for line in read_stripped_lines(argv[1])[HEADER_LINES:]]
    for index, lang in enumerate(LANGS):
        words = read_stripped_lines("%s.csv" % lang)
        matched = select_known_words(rows, index + FIRST_LANG_COLUMN, words)
        with open("%s_result.csv" % lang, "w") as out:
            out.writelines(word + "\n" for word in matched)
        # An empty word list would otherwise divide by zero.
        ratio = len(matched) / len(words) if words else 0.0
        print(lang)
        print("%d / %d = %7.2lf\n" % (len(matched), len(words), ratio))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment