Skip to content

Instantly share code, notes, and snippets.

@stefano-maggiolo
Last active August 29, 2015 14:03
Show Gist options
  • Save stefano-maggiolo/f8ddab487ab7ba4bd204 to your computer and use it in GitHub Desktop.
Save stefano-maggiolo/f8ddab487ab7ba4bd204 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import math
import sys
from collections import defaultdict
def aggregate_counts(lines):
    """Sum the counts of words that are equal ignoring case.

    Args:
        lines: iterable of text lines shaped like ``word<TAB>count``.
            Additional tab-separated fields are ignored and blank lines
            are skipped.

    Returns:
        A dict mapping each lower-cased word to its summed count.

    Raises:
        ValueError: if a non-blank line has no count field or the count
            is not an integer.
    """
    totals = defaultdict(int)
    for number, line in enumerate(lines, 1):
        fields = line.strip().split('\t')
        if fields == ['']:
            # Blank line (typically a trailing newline at end of file).
            continue
        if len(fields) < 2:
            raise ValueError(
                "line %d: expected 'word<TAB>count', got %r" % (number, line))
        totals[fields[0].lower()] += int(fields[1])
    return dict(totals)


def format_rows(words, total_count):
    """Render one report row per word, most frequent first.

    Args:
        words: mapping of word -> count.
        total_count: positive number the counts are divided by.

    Returns:
        A list of strings ``"<log10 frequency>, <count>, <word>"`` ordered
        by descending count, ties broken by descending word.

    Raises:
        ValueError: if ``total_count`` is not positive.
    """
    if total_count <= 0:
        raise ValueError("total_count must be positive, got %d" % total_count)
    ranked = sorted(((count, word) for word, count in words.items()),
                    reverse=True)
    rows = []
    for count, word in ranked:
        # "* 1.0" forces float division under Python 2 as well.
        frequency = count * 1.0 / total_count
        # log10 is undefined for non-positive values; report -inf
        # instead of crashing on a zero count.
        if frequency > 0:
            log_frequency = math.log10(frequency)
        else:
            log_frequency = float("-inf")
        rows.append("%6.2lf, %10d, %s" % (log_frequency, count, word))
    return rows


def main(argv):
    """Print the log-frequency report for the word list named in argv.

    Returns the process exit status: 1 on wrong usage, 0 otherwise.
    """
    if len(argv) != 3:
        print("%s wordlist total_count" % argv[0])
        return 1
    total_count = int(argv[2])
    # The context manager closes the file even if parsing fails.
    with open(argv[1]) as wordlist:
        words = aggregate_counts(wordlist)
    for row in format_rows(words, total_count):
        print(row)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env python3
import sys
from collections import defaultdict
# Minimum number of occurrences a trigram, its leading bigram (as a word
# start) and its trailing bigram (as a word end) must each reach.
MIN_OCCURRENCES = 3


def count_ngrams(words):
    """Count word-initial bigrams, word-final bigrams and all trigrams.

    Words shorter than three characters are ignored. Each remaining word
    is lower-cased, and only n-grams made entirely of alphabetic
    characters are counted.

    Args:
        words: iterable of strings.

    Returns:
        A ``(starts, ends, trigrams)`` tuple of dicts mapping n-gram to
        its number of occurrences.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    trigrams = defaultdict(int)
    for word in words:
        if len(word) < 3:
            continue
        word = word.lower()
        head, tail = word[:2], word[-2:]
        if head.isalpha():
            starts[head] += 1
        if tail.isalpha():
            ends[tail] += 1
        for i in range(len(word) - 2):
            chunk = word[i:i + 3]
            if chunk.isalpha():
                trigrams[chunk] += 1
    return dict(starts), dict(ends), dict(trigrams)


def select_trigrams(starts, ends, trigrams, min_occurrences=MIN_OCCURRENCES):
    """Return the trigrams that pass all three occurrence thresholds.

    A trigram is kept when it occurs at least ``min_occurrences`` times,
    its first two letters start at least that many words and its last two
    letters end at least that many words.

    Returns:
        A list of trigrams ordered by descending count, ties broken by
        descending trigram.
    """
    kept = [
        (count, trigram)
        for trigram, count in trigrams.items()
        if count >= min_occurrences
        and starts.get(trigram[:2], 0) >= min_occurrences
        and ends.get(trigram[-2:], 0) >= min_occurrences
    ]
    kept.sort(reverse=True)
    return [trigram for _, trigram in kept]


def main(argv):
    """Print the selected trigrams of the text file named in argv[1].

    Returns the process exit status: 1 on wrong usage, 0 otherwise.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s textfile\n" % argv[0])
        return 1
    # The context manager closes the file once its content is read.
    with open(argv[1]) as handle:
        words = handle.read().split()
    starts, ends, trigrams = count_ngrams(words)
    for trigram in select_trigrams(starts, ends, trigrams):
        print(trigram)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env python3
import sys
# Accepted three-character prefixes of the second (year) field.
YEARS = ('198', '199', '200', '201')
# Only words of exactly this many characters are kept.
WORD_LENGTH = 3


def extract_record(line):
    """Extract ``(word, count)`` from one tab-separated input line.

    A line is accepted when:
      * its first ``WORD_LENGTH`` characters are alphabetic and are
        immediately followed by a tab or an underscore,
      * it has at least three tab-separated fields, and
      * the second field starts with one of ``YEARS``.

    NOTE(review): the layout presumably is ``ngram<TAB>year<TAB>count...``
    with an optional ``_TAG`` suffix on the ngram -- confirm against the
    actual data source.

    Args:
        line: one input line, with or without its trailing newline.

    Returns:
        ``(word, count)`` as strings, or ``None`` if the line is rejected.
        Blank, short or malformed lines are rejected rather than raising.
    """
    # The length check keeps blank and short lines from raising IndexError.
    if len(line) <= WORD_LENGTH or line[WORD_LENGTH] not in '\t_':
        return None
    word = line[:WORD_LENGTH]
    if not word.isalpha():
        return None
    pieces = line.split('\t')
    if len(pieces) < 3 or not pieces[1].startswith(YEARS):
        return None
    # When the count is the last field it still carries the line ending;
    # drop it so the output never contains a doubled newline.
    return word, pieces[2].rstrip('\r\n')


if __name__ == "__main__":
    for line in sys.stdin:
        record = extract_record(line)
        if record is not None:
            sys.stdout.write('%s\t%s\n' % record)
#!/usr/bin/env python3
import sys
# Language codes, in the order of their columns in the input table.
LANGS = ['en_us', 'en_gb', 'de', 'es', 'it', 'fr']
# Zero-based column of the first language; the others follow in order.
FIRST_LANG_COLUMN = 14


def read_rows(path):
    """Read a comma-separated file, skipping its first two lines.

    Fields are split on every comma; quoting is not interpreted.
    """
    with open(path) as handle:
        return [line.strip().split(',') for line in handle.readlines()[2:]]


def read_words(path):
    """Read a word list: one whitespace-stripped entry per line."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def match_words(rows, column, vocabulary):
    """Return the stripped cells of ``column`` that appear in ``vocabulary``.

    Rows too short to have ``column`` (for example a trailing blank line)
    are skipped. Row order and duplicates are preserved.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    known = set(vocabulary)
    matched = []
    for row in rows:
        if column >= len(row):
            continue
        word = row[column].strip()
        if word in known:
            matched.append(word)
    return matched


def format_summary(num_words, total):
    """Format the ``matched / total = ratio`` line; ratio is 0 if total is 0."""
    ratio = num_words / total if total else 0.0
    return "%d / %d = %7.2lf\n" % (num_words, total, ratio)


def main(argv):
    """Compare each language column of the table with ``<lang>.csv``.

    For every language in ``LANGS`` the matching words are written to
    ``<lang>_result.csv`` and a summary is printed.

    Returns the process exit status: 1 on wrong usage, 0 otherwise.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s table.csv\n" % argv[0])
        return 1
    rows = read_rows(argv[1])
    for index, lang in enumerate(LANGS):
        words = read_words("%s.csv" % lang)
        matched = match_words(rows, FIRST_LANG_COLUMN + index, words)
        with open("%s_result.csv" % lang, "w") as out:
            out.writelines(word + "\n" for word in matched)
        print(lang)
        print(format_summary(len(matched), len(words)))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment