gugray/plot-freq-coverage.py Secret

## plot-freq-coverage.py
import matplotlib.pyplot as plt
import numpy as np
import string
import csv
import sys
import math
import re

# Lift the csv field cap so an oversized field cannot abort the read.
# sys.maxsize does not fit in a C long on every platform (the call then
# raises OverflowError), so fall back to the largest 32-bit value.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# The chart has BUCKET_COUNT points; each covers BUCKET_SIZE consecutive ranks.
BUCKET_COUNT = 100
BUCKET_SIZE = 1000

# Skips the first space-separated field, captures the second (the key that is
# later looked up in the frequency list), skips through the first "]" and
# captures the rest of the line, i.e. the "/sense/sense/" part.
# NOTE(review): this looks like the CC-CEDICT line layout -- confirm.
ENTRY_RE = re.compile(r"[^ ]+ ([^ ]+)[^\]]+\] (.+)")


def count_senses(lines):
    """Return a dict mapping each dictionary word to its total sense count.

    lines: iterable of raw dictionary lines (trailing newlines allowed).
    Lines starting with "#", blank lines and lines that do not match
    ENTRY_RE are ignored. A word that occurs on several lines gets the sum
    of the counts of all its entries.
    """
    counts = {}
    for line in lines:
        if line.startswith("#") or not line.strip():
            continue
        m = ENTRY_RE.match(line)
        if m is None:
            continue
        word = m.group(1)
        senses = m.group(2)
        # "/a/b/" holds three slashes for two senses.
        counts[word] = counts.get(word, 0) + senses.count("/") - 1
    return counts


def read_ranks(rows):
    """Return a dict mapping the first column of each row to its 0-based rank.

    rows: iterable of already-split rows, header excluded. Empty rows (which
    csv.reader yields for blank lines) are skipped and consume no rank. If a
    word appears more than once, the last occurrence wins.
    """
    ranks = {}
    for rank, row in enumerate(r for r in rows if r):
        ranks[row[0]] = rank
    return ranks


def fill_buckets(sense_counts, ranks, bucket_count=BUCKET_COUNT,
                 bucket_size=BUCKET_SIZE):
    """Group ranked dictionary words into fixed-width rank buckets.

    sense_counts: dict word -> number of senses (see count_senses).
    ranks: dict word -> 0-based frequency rank (see read_ranks).

    Returns a list of bucket_count dicts {'ix', 'words', 'senses'}; bucket i
    counts the dictionary words whose rank lies in
    [i * bucket_size, (i + 1) * bucket_size) and sums their senses. Words
    without a rank, or ranked beyond the last bucket, are left out.
    """
    buckets = [{'ix': i, 'words': 0, 'senses': 0} for i in range(bucket_count)]
    for word, senses in sense_counts.items():
        rank = ranks.get(word)
        if rank is None:
            continue
        ix = rank // bucket_size
        if ix >= bucket_count:
            # Past the charted range; indexing here would raise IndexError.
            continue
        buckets[ix]['words'] += 1
        buckets[ix]['senses'] += senses
    return buckets


def main():
    """Plot, per rank bucket, how many frequency-list words the dictionary has.

    Reads "chdict.u8" (dictionary) and "subtlex.txt" (tab-separated frequency
    list with one header row) from the current directory.
    """
    with open("chdict.u8", "r", encoding="utf8") as f:
        sense_counts = count_senses(f)

    # newline="" is what the csv module asks for when it is given a file object.
    with open("subtlex.txt", "r", encoding="utf8", newline="") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # Skip header; tolerate an empty file
        ranks = read_ranks(reader)

    buckets = fill_buckets(sense_counts, ranks)

    xarr = np.arange(len(buckets))
    yarr1 = [b['words'] for b in buckets]
    # Average senses per word (1 for an empty bucket). Only yarr1 is drawn;
    # this series is kept for the alternative plot.
    yarr2 = [b['senses'] / b['words'] if b['words'] > 0 else 1 for b in buckets]
    plt.plot(xarr, yarr1, 'b')
    plt.show()


if __name__ == "__main__":
    main()
	import matplotlib.pyplot as plt
	import numpy as np
	import string
	import csv
	import sys
	import math
	import re

	# Lift the csv field cap so an oversized field cannot abort the read.
	# sys.maxsize does not fit in a C long on every platform (the call then
	# raises OverflowError), so fall back to the largest 32-bit value.
	try:
	    csv.field_size_limit(sys.maxsize)
	except OverflowError:
	    csv.field_size_limit(2**31 - 1)

	# The chart has BUCKET_COUNT points; each covers BUCKET_SIZE consecutive ranks.
	BUCKET_COUNT = 100
	BUCKET_SIZE = 1000

	# Skips the first space-separated field, captures the second (the key that is
	# later looked up in the frequency list), skips through the first "]" and
	# captures the rest of the line, i.e. the "/sense/sense/" part.
	# NOTE(review): this looks like the CC-CEDICT line layout -- confirm.
	ENTRY_RE = re.compile(r"[^ ]+ ([^ ]+)[^\]]+\] (.+)")


	def count_senses(lines):
	    """Return a dict mapping each dictionary word to its total sense count.

	    lines: iterable of raw dictionary lines (trailing newlines allowed).
	    Lines starting with "#", blank lines and lines that do not match
	    ENTRY_RE are ignored. A word that occurs on several lines gets the sum
	    of the counts of all its entries.
	    """
	    counts = {}
	    for line in lines:
	        if line.startswith("#") or not line.strip():
	            continue
	        m = ENTRY_RE.match(line)
	        if m is None:
	            continue
	        word = m.group(1)
	        senses = m.group(2)
	        # "/a/b/" holds three slashes for two senses.
	        counts[word] = counts.get(word, 0) + senses.count("/") - 1
	    return counts


	def read_ranks(rows):
	    """Return a dict mapping the first column of each row to its 0-based rank.

	    rows: iterable of already-split rows, header excluded. Empty rows (which
	    csv.reader yields for blank lines) are skipped and consume no rank. If a
	    word appears more than once, the last occurrence wins.
	    """
	    ranks = {}
	    for rank, row in enumerate(r for r in rows if r):
	        ranks[row[0]] = rank
	    return ranks


	def fill_buckets(sense_counts, ranks, bucket_count=BUCKET_COUNT,
	                 bucket_size=BUCKET_SIZE):
	    """Group ranked dictionary words into fixed-width rank buckets.

	    sense_counts: dict word -> number of senses (see count_senses).
	    ranks: dict word -> 0-based frequency rank (see read_ranks).

	    Returns a list of bucket_count dicts {'ix', 'words', 'senses'}; bucket i
	    counts the dictionary words whose rank lies in
	    [i * bucket_size, (i + 1) * bucket_size) and sums their senses. Words
	    without a rank, or ranked beyond the last bucket, are left out.
	    """
	    buckets = [{'ix': i, 'words': 0, 'senses': 0} for i in range(bucket_count)]
	    for word, senses in sense_counts.items():
	        rank = ranks.get(word)
	        if rank is None:
	            continue
	        ix = rank // bucket_size
	        if ix >= bucket_count:
	            # Past the charted range; indexing here would raise IndexError.
	            continue
	        buckets[ix]['words'] += 1
	        buckets[ix]['senses'] += senses
	    return buckets


	def main():
	    """Plot, per rank bucket, how many frequency-list words the dictionary has.

	    Reads "chdict.u8" (dictionary) and "subtlex.txt" (tab-separated frequency
	    list with one header row) from the current directory.
	    """
	    with open("chdict.u8", "r", encoding="utf8") as f:
	        sense_counts = count_senses(f)

	    # newline="" is what the csv module asks for when it is given a file object.
	    with open("subtlex.txt", "r", encoding="utf8", newline="") as f:
	        reader = csv.reader(f, delimiter='\t')
	        next(reader, None)  # Skip header; tolerate an empty file
	        ranks = read_ranks(reader)

	    buckets = fill_buckets(sense_counts, ranks)

	    xarr = np.arange(len(buckets))
	    yarr1 = [b['words'] for b in buckets]
	    # Average senses per word (1 for an empty bucket). Only yarr1 is drawn;
	    # this series is kept for the alternative plot.
	    yarr2 = [b['senses'] / b['words'] if b['words'] > 0 else 1 for b in buckets]
	    plt.plot(xarr, yarr1, 'b')
	    plt.show()


	if __name__ == "__main__":
	    main()