Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Count unigrams and bigrams in the Wolfart-Ahenakew corpus

Cree Grapheme Stats

Count unigrams and bigrams in the Wolfart-Ahenakew nêhiyawêwin corpus!

Why

When building a keyboard for typing Cree, it is useful to know which graphemes are typed often, and which pairs of graphemes are typed one after the other. Using unigram statistics, we can place the most frequent graphemes in the most ergonomic "neutral" positions. To speed up typing, we place frequently typed pairs on opposite sides of the keyboard, optimizing for two-handed typing to maximize alternating hands/thumbs.

Requirements

  • Python 3.6+
  • sponge(1) from moreutils (brew install moreutils)
  • nfc(1) from unormalize (brew install eddieantonio/eddieantonio/unormalize)

Files

.
├── Makefile
├── bigrams.pdf         [output]
├── bigrams.tsv         [output]
├── cleancorp.txt       [input]
├── count-bigrams
├── count-unigrams
├── create-fdp
├── defuse
├── filter-out-non-sro
├── tokenize
└── unigrams.tsv        [output]

Note: You must download cleancorp.txt separately!

SHA256 sum:

2cb09a54b9c3cc329eba573b4798aa2d79b19f58b4f417a97150c9133c3b3343  cleancorp.txt
bigram count
wa 9125
êk 8080
ni 8076
kw 7655
ik 6771
ta 6705
ân 6609
is 6479
ko 6260
ê- 5660
5644
hk 5414
ki 5077
na 4990
ak 4929
it 4480
iy 4446
aw 4179
si 4067
mi 4014
3988
-k 3962
ka 3885
ay 3778
ah 3755
ih 3730
sk 3629
ya 3536
ht 3487
yi 3438
î- 3406
an 3383
ma 3363
ci 3242
3214
im 3080
2984
2940
in 2743
am 2727
pi 2700
wi 2697
êw 2645
2630
os 2555
2487
2486
ti 2406
to 2370
ot 2266
2192
oh 2186
âw 2174
âh 2174
ât 2126
on 1904
êy 1901
âk 1901
âs 1871
hi 1797
-p 1764
êt 1759
st 1735
1720
ôm 1690
âp 1677
1675
pa 1650
1583
hc 1571
1558
-m 1534
ây 1492
iw 1456
as 1437
îs 1432
âc 1423
ôy 1420
1339
â- 1330
în 1285
ês 1278
sa 1278
i- 1271
îh 1258
-n 1251
-w 1196
-i 1194
ôh 1147
1034
at 993
ôt 925
a- 924
ip 852
-a 827
êh 740
ow 675
665
656
k- 654
âm 646
îk 615
tw 614
sp 609
ic 600
h- 593
mo 589
589
mm 575
558
so 547
537
ôs 525
hp 522
ôk 512
-t 511
-o 505
no 440
410
oy 404
ha 401
ên 400
398
ac 397
393
391
ît 390
pw 386
384
ap 381
371
îc 368
ok 364
344
îw 321
oc 317
êp 313
-s 304
298
ôc 292
291
mw 265
êc 262
257
ôn 250
yw 246
246
230
ho 222
êm 221
214
212
nw 210
om 209
po 188
w- 181
îm 179
îy 172
c- 170
170
sw 170
o- 168
îp 168
op 159
yo 150
145
sc 137
ca 134
y- 118
t- 106
s- 94
89
84
hy 78
sm 56
hw 55
-c 55
co 53
th 47
47
wo 44
wm 44
-y 33
m- 28
my 25
ll 20
nm 19
18
tm 17
ym 17
al 16
n- 16
or 15
ôp 14
wh 14
la 13
il 13
hm 13
ai 12
kn 12
12
ôw 11
ô- 10
p- 9
ra 9
ar 8
cw 8
ny 7
ch 7
nt 7
7
tr 7
km 7
rt 6
lo 6
ri 5
ôl 5
êê 5
ck 4
nc 4
ol 4
ts 4
oo 4
pl 3
ir 3
ns 3
tt 2
nl 2
ly 2
ry 2
2
ys 2
wn 2
ty 2
nk 2
-h 2
hn 2
yr 2
2
lm 2
-r 1
1
ââ 1
cc 1
pm 1
sn 1
ss 1
cm 1
rc 1
ps 1
êr 1
rk 1
ks 1
sl 1
ls 1
ms 1
wr 1
tc 1
mp 1
tk 1
1
ws 1
li 1
1
ky 1
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Count character bigrams in lines read from stdin (or files given as args).

Prints a TSV table (header: bigram<TAB>count) sorted by descending count.
"""
import fileinput
from collections import Counter


def count_bigrams(lines):
    """Return a Counter of adjacent character pairs within each line.

    Lines are stripped of surrounding whitespace first; lines with fewer
    than two characters contribute no bigrams. Pairs never span lines.
    """
    bigrams = Counter()
    for line in lines:
        line = line.strip()
        if len(line) <= 1:
            continue  # need at least two characters for one bigram
        for a, b in zip(line, line[1:]):
            bigrams[a + b] += 1
    return bigrams


def main():
    bigrams = count_bigrams(fileinput.input())
    print('bigram', 'count', sep='\t')
    for bigram, count in bigrams.most_common():
        print(bigram, count, sep='\t')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Count character unigrams in lines read from stdin (or files given as args).

Prints a TSV table (header: unigram<TAB>count) sorted by descending count.
"""
import fileinput
from collections import Counter


def count_unigrams(lines):
    """Return a Counter of characters appearing in the given lines.

    Lines are stripped of surrounding whitespace first; empty lines are
    skipped.
    """
    unigrams = Counter()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Counter.update() accepts any iterable of hashable items;
        # wrapping the string in another Counter first is unnecessary.
        unigrams.update(line)
    return unigrams


def main():
    unigrams = count_unigrams(fileinput.input())
    print('unigram', 'count', sep='\t')
    for unigram, count in unigrams.most_common():
        print(unigram, count, sep='\t')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- encoding: UTF-8 -*-
"""Render bigram counts (TSV on stdin, sorted descending, header first) as a
Graphviz undirected graph for layout with fdp(1)."""
import fileinput
from math import log, sqrt

# Pen width used for the most frequent bigram; all others scale down linearly.
MAX_PENWIDTH = 6


def edge_line(c1, c2, count, max_count):
    """Return one DOT edge statement for the pair c1--c2.

    - weight: inverse rank of the count (most frequent pair gets 1).
    - penwidth: linear in relative frequency, capped at MAX_PENWIDTH.
    - alpha: log-scaled so that rare pairs stay faint but visible.
    """
    weight = max_count + 1 - count
    factor = count / max_count
    logfactor = log(1 + factor)
    penwidth = MAX_PENWIDTH * factor
    alpha = int(255 * logfactor)
    # log(1 + factor) <= log 2 < 1, so alpha always fits in one byte.
    assert 0 <= alpha <= 255
    return (f' "{c1}" -- "{c2}" ['
            f"weight={weight},"
            f"penwidth={penwidth:.4f},"
            f'color="#000000{alpha:02X}"'
            f"]; // {factor}")


def main():
    print("graph {")
    print("""
node [shape=circle, splines=curved, fontsize=8];
edge [len=49];
""")
    lines = fileinput.input()
    next(lines)  # Skip the "bigram<TAB>count" header row.
    max_count = None
    for line in lines:
        assert 'count' not in line
        bigram, count = line.strip().split()
        c1, c2 = bigram
        count = int(count)
        if max_count is None:
            # Input is sorted by descending count, so the first row is the max.
            max_count = count
        assert count <= max_count
        # Remove self-cycles; not useful here.
        if c1 == c2:
            print(f" // ignoring {bigram}")
            continue
        print(edge_line(c1, c2, count, max_count))
    print("}")


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Removes the BOM from a stdin.
"""
import sys

# U+FEFF as UTF-8 bytes (EF BB BF).
BYTE_ORDER_MARK = '\uFEFF'.encode('UTF-8')


def defuse(data):
    """Return *data* decoded as UTF-8 with every BOM byte sequence removed.

    Note: removes ALL occurrences, not just a leading one, matching the
    original behaviour.
    """
    return data.replace(BYTE_ORDER_MARK, b'').decode('UTF-8')


def main():
    print(defuse(sys.stdin.buffer.read()), end='')


if __name__ == '__main__':
    main()
#!/bin/sh
# Keep only lines written entirely in SRO (Standard Roman Orthography):
# Cree consonants/vowels (including circumflexed long vowels), l, r, and
# hyphens.  The second grep then drops lines that are nothing but hyphens.
# LC_ALL=C.UTF-8 pins a UTF-8 locale so the bracket expression matches the
# multi-byte circumflexed vowels consistently.
env LC_ALL=C.UTF-8 grep -o -E '^[ptkcsmnhywêiîoôaâlr-]+$' | grep -v '^---*$'
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Build a Huffman code from unigram counts.

Reads a TSV (header: unigram<TAB>count) on stdin and prints a TSV table
(header: symbol<TAB>code) of binary Huffman codes, shortest codes first.
Tree nodes are plain tuples indexed by the constants below.
"""
import sys

# Tuple indices for tree nodes:
#   leaf:     ('leaf', weight, symbol)
#   internal: ('internal', weight, left, right)
TYPE = 0
WEIGHT = 1
LEFT = 2
SYMBOL = 2
RIGHT = 3


def by_weight(t):
    """Sort key: a node's weight (total count under it)."""
    return t[WEIGHT]


def read_unigrams(lines):
    """Parse (symbol, count) pairs from TSV lines, skipping the header row."""
    it = iter(lines)
    next(it)
    pairs = []
    for line in it:
        symbol, count = line.strip().split()
        pairs.append((symbol, int(count)))
    return pairs


def build_tree(unigrams):
    """Build the Huffman tree and return its root node.

    Repeatedly merges the two lowest-weight nodes until one remains.
    """
    # Initially, the tree is comprised entirely of leaves.
    tree = [('leaf', weight, symbol) for symbol, weight in unigrams]
    # Reduce the tree by glomming on the lowest probability items iteratively.
    while True:
        # Invariant: the forest is always sorted by descending weight.
        tree.sort(key=by_weight, reverse=True)
        if len(tree) < 2:
            break
        # Group the two items with the lowest probability.
        right = tree.pop()
        left = tree.pop()
        weight = by_weight(left) + by_weight(right)
        tree.append(('internal', weight, left, right))
    return tree[0]


def assign_codes(root):
    """Traverse the tree and return {symbol: tuple-of-bits}."""
    return _assign_codes(root, ())


def _assign_codes(node, code):
    if node[TYPE] == 'internal':
        # By construction left was popped after right, so its weight is at
        # least right's.  Must be >=, not >: equal weights are possible.
        assert node[LEFT][WEIGHT] >= node[RIGHT][WEIGHT]
        left = _assign_codes(node[LEFT], code + (1,))
        right = _assign_codes(node[RIGHT], code + (0,))
        return {**left, **right}
    else:
        assert node[TYPE] == 'leaf'
        return {node[SYMBOL]: code}


def main():
    unigrams = read_unigrams(sys.stdin)
    coding = assign_codes(build_tree(unigrams))
    # Shortest (most frequent) codes first.
    table = sorted(coding.items(), key=lambda x: len(x[1]))
    print('symbol', 'code', sep="\t")
    for symbol, code in table:
        bits = ''.join(str(bit) for bit in code)
        print(symbol, bits, sep="\t")


if __name__ == '__main__':
    main()
symbol code
k 100
a 010
i 001
ê 1100
w 1011
â 1010
n 0111
t 0110
m 0001
s 11111
h 11110
- 11101
y 11100
o 11010
î 00001
p 00000
ô 110111
c 1101101
l 11011001
r 11011000
# Pipeline: corpus text -> tokens -> unigram/bigram counts, word list,
# Huffman coding, and a bigram graph rendered with fdp(1).
# Requires nfc(1) from unormalize.
# Requires sponge(1) from moreutils
CORPUS = AWCleanCorp-FalseStarts.txt

.PHONY: all
all: bigrams.tsv unigrams.tsv wordlist.tsv huffman-coding.tsv

unigrams.tsv: tokens.txt count-unigrams
	<$< ./count-unigrams | sponge $@

bigrams.tsv: tokens.txt count-bigrams
	<$< ./count-bigrams | sponge $@

# defuse strips BOMs, nfc normalizes to NFC, tokenize splits on spaces,
# filter-out-non-sro keeps only SRO-alphabet tokens.
tokens.txt: $(CORPUS) defuse tokenize filter-out-non-sro
	<$< ./defuse | nfc | ./tokenize | ./filter-out-non-sro | sponge $@

bigrams.fdp.dot: bigrams.tsv create-fdp
	<$< ./create-fdp | sponge $@

wordlist.tsv: tokens.txt
	sort $< | uniq -c | sort -rn | awk '{ print $$2 "\t" $$1 }' | sponge $@

huffman-coding.tsv: unigrams.tsv huff
	<$< ./huff | sponge $@

%.pdf: %.fdp.dot
	fdp -Tpdf $< | sponge $@
syllabic count
<NNBSP> 13282
13223
11876
8515
8033
6698
6250
6194
5455
5165
5103
5065
4979
4604
4126
4065
4000
3989
3962
3573
3561
3470
3466
3351
3345
3306
3286
3261
3238
3044
2867
2683
2642
2520
2505
2501
k 2356
2347
2315
2196
w 1892
1830
1739
1722
1677
1645
1585
1581
1558
1419
1343
a 1335
1275
1135
m 1093
ê 996
812
737
666
655
602
- 598
â 595
577
558
547
534
486
425
422
i 421
415
410
401
397
391
343
338
318
292
t 258
257
248
220
216
p 214
197
188
169
167
155
n 153
s 146
145
135
131
h 126
96
o 89
75
y 70
67
63
61
l 57
55
53
52
45
35
c 32
29
27
24
21
r 20
20
20
18
17
14
ô 14
13
î 12
9
8
6
5
5
4
2
1
1
1
#!/bin/sh
# Split space-separated tokens onto their own lines: every space on stdin
# becomes a newline, yielding one token per line.
tr ' ' '\n'
unigram count
k 42869
a 40310
i 39944
ê 25103
w 24164
â 23968
n 22148
t 20202
m 19537
s 16400
h 15083
- 14740
y 13789
o 12255
î 9981
p 8187
ô 7118
c 5733
l 77
r 40
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.