Instantly share code, notes, and snippets.

Embed
What would you like to do?
import matplotlib.pyplot as plt
import numpy as np
import string
import csv
import sys
import math
import re
# Pattern for one dictionary entry: a first token, a space, the word we key on
# (group 1), everything up to a closing square bracket, a space, and then the
# senses field (group 2).
# NOTE(review): this looks like the CC-CEDICT layout
# "TRAD SIMP [pinyin] /sense/sense/" -- confirm against chdict.u8.
matcher = re.compile(r"[^ ]+ ([^ ]+)[^\]]+\] (.+)")


def raise_csv_field_limit():
    """Set the csv field size limit as high as the platform accepts.

    ``csv.field_size_limit(sys.maxsize)`` raises ``OverflowError`` when the
    value does not fit in a C long, so back off by a factor of ten until a
    value is accepted.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return
        except OverflowError:
            limit //= 10


def count_senses(lines):
    """Return a dict mapping each word to its total number of senses.

    ``lines`` is any iterable of text lines. Lines starting with ``#`` and
    lines that do not match ``matcher`` are ignored. Counts for a word that
    appears on several lines are added together.
    """
    counts = {}
    for line in lines:
        if line.startswith("#"):
            continue
        m = matcher.match(line)
        if m is None:
            continue
        word = m.group(1)
        senses = m.group(2)
        # A senses field written as "/a/b/" contains one more slash than it
        # has senses.
        counts[word] = counts.get(word, 0) + senses.count('/') - 1
    return counts


def rank_words(rows):
    """Return a dict mapping the first column of each row to its 0-based rank.

    ``rows`` is an iterable of sequences, e.g. a ``csv.reader`` positioned
    after the header. Empty rows (blank lines) are skipped and do not use up a
    rank. If a word occurs more than once, its last position wins.
    NOTE(review): presumably the file is sorted by descending frequency, so
    position equals frequency rank -- confirm against subtlex.txt.
    """
    non_empty = (row for row in rows if row)
    return {row[0]: rank for rank, row in enumerate(non_empty)}


def bucket_words(sense_counts, ranks, bucket_count=100, bucket_size=1000):
    """Group ranked words into consecutive rank buckets.

    Returns a list of ``bucket_count`` dicts with keys ``'ix'`` (bucket
    index), ``'words'`` (number of words in the bucket) and ``'senses'`` (sum
    of their sense counts). Bucket ``i`` covers ranks
    ``[i * bucket_size, (i + 1) * bucket_size)``.

    Words missing from ``ranks`` are ignored. Words ranked past the last
    bucket are ignored as well; indexing the bucket list with such a rank
    would otherwise raise ``IndexError``.
    """
    buckets = [{'ix': i, 'words': 0, 'senses': 0} for i in range(bucket_count)]
    for word, senses in sense_counts.items():
        rank = ranks.get(word)
        if rank is None:
            continue
        ix = rank // bucket_size
        if ix >= bucket_count:
            continue
        buckets[ix]['words'] += 1
        buckets[ix]['senses'] += senses
    return buckets


def mean_senses(buckets):
    """Return the average number of senses per word for each bucket.

    Empty buckets yield 1. This series is not plotted by ``main``; it is
    available for interactive use.
    """
    return [
        b['senses'] / b['words'] if b['words'] > 0 else 1
        for b in buckets
    ]


def plot_word_counts(buckets):
    """Plot the number of words per bucket as a blue line and show the figure."""
    xarr = np.arange(len(buckets))
    word_counts = [b['words'] for b in buckets]
    plt.plot(xarr, word_counts, 'b')
    plt.show()


def main():
    """Read the dictionary and the ranked word list, then plot the buckets."""
    raise_csv_field_limit()

    with open("chdict.u8", "r", encoding="utf8") as f:
        sense_counts = count_senses(f)

    # newline="" lets the csv module do its own line-ending handling.
    with open("subtlex.txt", "r", encoding="utf8", newline="") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # Skip header (tolerates an empty file)
        ranks = rank_words(reader)

    plot_word_counts(bucket_words(sense_counts, ranks))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment