Skip to content

Instantly share code, notes, and snippets.

@gugray
Created April 16, 2017 19:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gugray/a8290d62b8b2e39eaa535457c167b3ff to your computer and use it in GitHub Desktop.
Save gugray/a8290d62b8b2e39eaa535457c167b3ff to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import numpy as np
import string
import csv
import sys
import math
import re
# Raise the csv module's per-field size cap to the largest value the platform
# accepts. The limit has to fit in a C long, which is narrower than
# sys.maxsize on some platforms (e.g. 64-bit Windows); there a direct
# csv.field_size_limit(sys.maxsize) raises OverflowError, so back off by
# halving until the call succeeds.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2
# Maps each dictionary headword to the total number of senses listed for it,
# summed over every entry line that shares the headword.
words = {}

# Entry lines are matched as: <token> <token> ... ] <senses>
#   group(1) = the second space-separated token (used as the headword;
#              presumably the simplified form in a CEDICT-style file -- confirm)
#   group(2) = everything after the first "] " (the slash-delimited senses)
# Raw string so that "\]" reaches the regex engine without triggering Python's
# invalid-escape-sequence warning.
matcher = re.compile(r"[^ ]+ ([^ ]+)[^\]]+\] (.+)")

with open("chdict.u8", "r", encoding="utf8") as f:
    for line in f:
        # Skip comment lines and blank lines (iterated lines keep their
        # trailing newline, so test the stripped text).
        if line.startswith("#") or not line.strip():
            continue
        m = matcher.match(line)
        if m is None:
            continue
        word = m.group(1)
        senses = m.group(2)
        # Senses look like "/a/b/c/": n senses are delimited by n + 1 slashes.
        sensecount = senses.count('/') - 1
        words[word] = words.get(word, 0) + sensecount
# Maps each word in the first column of the tab-separated list to its 0-based
# row position after the header. If a word appears more than once, the last
# occurrence wins.
# NOTE(review): the bucketing below treats this position as a frequency rank,
# which assumes the file is sorted by descending frequency -- confirm.
wdtorank = {}
# newline="" lets the csv module do its own line-ending handling, as the csv
# documentation recommends for files passed to csv.reader.
with open("subtlex.txt", "r", encoding="utf8", newline="") as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None)  # Skip header (tolerate a completely empty file)
    rank = 0
    for inrow in reader:
        # csv.reader yields [] for blank lines; they carry no word and must
        # not consume a rank.
        if not inrow:
            continue
        wdtorank[inrow[0]] = rank
        rank += 1
# Group the ranked words into fixed-width rank buckets: bucket k covers ranks
# [k * BUCKET_SIZE, (k + 1) * BUCKET_SIZE). Each bucket accumulates how many
# dictionary words fall into it and the total number of senses of those words.
BUCKET_COUNT = 100
BUCKET_SIZE = 1000

buckets = []
for i in range(0, BUCKET_COUNT):
    bucket = {'ix': i, 'words': 0, 'senses': 0}
    buckets.append(bucket)

for word in words:
    # Only words present in both the dictionary and the ranked list count.
    if word not in wdtorank:
        continue
    rank = wdtorank[word]
    buckix = rank // BUCKET_SIZE
    # Ranks past the last bucket are outside the charted range; skip them
    # instead of indexing past the end of the bucket list.
    if buckix >= BUCKET_COUNT:
        continue
    buckets[buckix]['words'] += 1
    buckets[buckix]['senses'] += words[word]
# Build the plot series, one point per bucket:
#   xarr  - bucket index
#   yarr1 - number of dictionary words that landed in the bucket
#   yarr2 - average senses per word in the bucket (1 for an empty bucket)
# The series are built with comprehension-local names so the module-level
# `words` dictionary is not overwritten.
xarr = np.arange(len(buckets))
yarr1 = [bucket['words'] for bucket in buckets]
yarr2 = [
    bucket['senses'] / bucket['words'] if bucket['words'] > 0 else 1
    for bucket in buckets
]

# Only the word-count series is drawn; yarr2 is computed but not plotted here.
plt.plot(xarr, yarr1, 'b')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment