Created
May 22, 2011 02:28
-
-
Save zengargoyle/985115 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Compute digram frequency of <space>a-zA-Z.
<space> is signified by 'S'; A-Z are mapped onto a-z.
Output: Char1Char2 Count Count/Total
...
Sf 103 0.006550
...
eo 16 0.001018
...
15724 characters # on STDERR
Usage: digram < corpus.txt
E.g.:
SS 339 0.021559 | |
Sa 300 0.019079 | |
Sb 161 0.010239 | |
Sc 88 0.005597 | |
Sd 85 0.005406 | |
Se 81 0.005151 | |
Sf 103 0.006550 | |
Sg 100 0.006360 | |
Sh 121 0.007695 | |
Si 242 0.015390 | |
Sj 21 0.001336 | |
Sk 14 0.000890 | |
Sl 61 0.003879 | |
Sm 115 0.007314 | |
Sn 44 0.002798 | |
So 204 0.012974 | |
Sp 99 0.006296 | |
Sq 5 0.000318 | |
Sr 57 0.003625 | |
Ss 206 0.013101 | |
St 475 0.030209 | |
Su 29 0.001844 | |
Sv 20 0.001272 | |
Sw 143 0.009094 | |
Sy 59 0.003752 | |
Sz 3 0.000191 | |
aS 102 0.006487 | |
ab 24 0.001526 | |
ac 36 0.002289 | |
ad 29 0.001844 | |
af 6 0.000382 | |
ag 19 0.001208 | |
ai 16 0.001018 | |
aj 1 0.000064 | |
ak 18 0.001145 | |
al 126 0.008013 | |
am 13 0.000827 | |
an 169 0.010748 | |
ap 8 0.000509 | |
aq 1 0.000064 | |
ar 91 0.005787 | |
as 63 0.004007 | |
at 152 0.009667 | |
au 14 0.000890 | |
av 27 0.001717 | |
aw 2 0.000127 | |
ay 22 0.001399 | |
az 6 0.000382 | |
bS 2 0.000127 | |
ba 31 0.001972 | |
bb 1 0.000064 | |
bc 1 0.000064 | |
bd 2 0.000127 | |
be 75 0.004770 | |
bi 8 0.000509 | |
bj 3 0.000191 | |
bl 24 0.001526 | |
bo 18 0.001145 | |
br 11 0.000700 | |
bt 1 0.000064 | |
bu 40 0.002544 | |
bv 6 0.000382 | |
by 8 0.000509 | |
cS 14 0.000890 | |
ca 62 0.003943 | |
cc 1 0.000064 | |
ce 38 0.002417 | |
ch 54 0.003434 | |
ci 23 0.001463 | |
ck 29 0.001844 | |
cl 15 0.000954 | |
co 36 0.002289 | |
cr 8 0.000509 | |
ct 39 0.002480 | |
cu 11 0.000700 | |
dS 172 0.010939 | |
da 5 0.000318 | |
db 2 0.000127 | |
dd 4 0.000254 | |
de 37 0.002353 | |
dg 1 0.000064 | |
di 28 0.001781 | |
dl 1 0.000064 | |
dn 2 0.000127 | |
do 51 0.003243 | |
dr 3 0.000191 | |
ds 23 0.001463 | |
dt 1 0.000064 | |
du 6 0.000382 | |
dv 8 0.000509 | |
dy 5 0.000318 | |
eS 516 0.032816 | |
ea 62 0.003943 | |
eb 1 0.000064 | |
ec 77 0.004897 | |
ed 50 0.003180 | |
ee 27 0.001717 | |
ef 26 0.001654 | |
eg 2 0.000127 | |
eh 2 0.000127 | |
ei 14 0.000890 | |
ej 3 0.000191 | |
ek 1 0.000064 | |
el 40 0.002544 | |
em 23 0.001463 | |
en 122 0.007759 | |
eo 16 0.001018 | |
ep 17 0.001081 | |
eq 2 0.000127 | |
er 157 0.009985 | |
es 129 0.008204 | |
et 44 0.002798 | |
ev 23 0.001463 | |
ew 7 0.000445 | |
ex 31 0.001972 | |
ey 11 0.000700 | |
fS 77 0.004897 | |
fa 13 0.000827 | |
fe 45 0.002862 | |
ff 20 0.001272 | |
fi 29 0.001844 | |
fl 5 0.000318 | |
fo 45 0.002862 | |
fr 17 0.001081 | |
ft 8 0.000509 | |
fu 6 0.000382 | |
fw 1 0.000064 | |
gS 149 0.009476 | |
ga 16 0.001018 | |
ge 47 0.002989 | |
gg 4 0.000254 | |
gh 20 0.001272 | |
gi 15 0.000954 | |
gk 1 0.000064 | |
gl 6 0.000382 | |
gn 2 0.000127 | |
go 23 0.001463 | |
gr 7 0.000445 | |
gs 6 0.000382 | |
gt 1 0.000064 | |
gu 49 0.003116 | |
gy 1 0.000064 | |
hS 76 0.004833 | |
ha 138 0.008776 | |
he 226 0.014373 | |
hi 110 0.006996 | |
hm 6 0.000382 | |
hn 20 0.001272 | |
ho 44 0.002798 | |
hr 4 0.000254 | |
ht 8 0.000509 | |
hu 2 0.000127 | |
hy 6 0.000382 | |
iS 21 0.001336 | |
ia 11 0.000700 | |
ib 2 0.000127 | |
ic 50 0.003180 | |
id 24 0.001526 | |
ie 8 0.000509 | |
if 31 0.001972 | |
ig 13 0.000827 | |
ik 10 0.000636 | |
il 29 0.001844 | |
im 39 0.002480 | |
in 262 0.016662 | |
io 63 0.004007 | |
ip 11 0.000700 | |
iq 17 0.001081 | |
ir 23 0.001463 | |
is 141 0.008967 | |
it 165 0.010494 | |
iv 19 0.001208 | |
iz 2 0.000127 | |
ja 2 0.000127 | |
je 4 0.000254 | |
jo 7 0.000445 | |
ju 21 0.001336 | |
kS 29 0.001844 | |
ka 1 0.000064 | |
ke 32 0.002035 | |
kf 1 0.000064 | |
ki 26 0.001654 | |
kn 6 0.000382 | |
ks 5 0.000318 | |
ku 1 0.000064 | |
lS 69 0.004388 | |
la 36 0.002289 | |
lb 1 0.000064 | |
ld 14 0.000890 | |
le 76 0.004833 | |
lf 3 0.000191 | |
lg 1 0.000064 | |
li 43 0.002735 | |
lk 2 0.000127 | |
ll 114 0.007250 | |
lm 4 0.000254 | |
lo 52 0.003307 | |
lp 1 0.000064 | |
ls 14 0.000890 | |
lt 4 0.000254 | |
lu 7 0.000445 | |
lv 1 0.000064 | |
ly 93 0.005915 | |
mS 53 0.003371 | |
ma 40 0.002544 | |
mb 7 0.000445 | |
me 76 0.004833 | |
mf 5 0.000318 | |
mg 1 0.000064 | |
mi 29 0.001844 | |
ml 1 0.000064 | |
mm 2 0.000127 | |
mo 57 0.003625 | |
mp 13 0.000827 | |
ms 7 0.000445 | |
mu 18 0.001145 | |
my 4 0.000254 | |
nS 214 0.013610 | |
na 23 0.001463 | |
nb 4 0.000254 | |
nc 30 0.001908 | |
nd 101 0.006423 | |
ne 50 0.003180 | |
nf 3 0.000191 | |
ng 180 0.011447 | |
ni 46 0.002925 | |
nj 5 0.000318 | |
nk 4 0.000254 | |
nl 9 0.000572 | |
nn 3 0.000191 | |
no 46 0.002925 | |
ns 34 0.002162 | |
nt 80 0.005088 | |
nu 7 0.000445 | |
nv 1 0.000064 | |
nw 1 0.000064 | |
ny 7 0.000445 | |
oS 148 0.009412 | |
oa 3 0.000191 | |
ob 16 0.001018 | |
oc 2 0.000127 | |
od 20 0.001272 | |
oe 5 0.000318 | |
of 58 0.003689 | |
og 4 0.000254 | |
oh 2 0.000127 | |
oi 22 0.001399 | |
ok 9 0.000572 | |
ol 9 0.000572 | |
om 68 0.004325 | |
on 173 0.011002 | |
oo 22 0.001399 | |
op 20 0.001272 | |
oq 2 0.000127 | |
or 130 0.008268 | |
os 34 0.002162 | |
ot 67 0.004261 | |
ou 139 0.008840 | |
ov 12 0.000763 | |
ow 52 0.003307 | |
ox 1 0.000064 | |
oy 5 0.000318 | |
pS 28 0.001781 | |
pa 19 0.001208 | |
pe 62 0.003943 | |
ph 2 0.000127 | |
pi 8 0.000509 | |
pl 28 0.001781 | |
po 20 0.001272 | |
pp 6 0.000382 | |
pr 28 0.001781 | |
ps 8 0.000509 | |
pt 4 0.000254 | |
pu 11 0.000700 | |
qS 2 0.000127 | |
qi 1 0.000064 | |
qu 25 0.001590 | |
rS 138 0.008776 | |
ra 28 0.001781 | |
rb 1 0.000064 | |
rc 4 0.000254 | |
rd 15 0.000954 | |
re 179 0.011384 | |
rf 4 0.000254 | |
rg 11 0.000700 | |
rh 10 0.000636 | |
ri 41 0.002607 | |
rk 9 0.000572 | |
rl 2 0.000127 | |
rm 14 0.000890 | |
rn 5 0.000318 | |
ro 51 0.003243 | |
rp 2 0.000127 | |
rq 1 0.000064 | |
rr 2 0.000127 | |
rs 21 0.001336 | |
rt 46 0.002925 | |
ru 12 0.000763 | |
rv 2 0.000127 | |
ry 25 0.001590 | |
sS 350 0.022259 | |
sa 17 0.001081 | |
sc 7 0.000445 | |
se 89 0.005660 | |
sf 3 0.000191 | |
sh 17 0.001081 | |
si 45 0.002862 | |
sk 3 0.000191 | |
sl 10 0.000636 | |
sm 13 0.000827 | |
sn 6 0.000382 | |
so 61 0.003879 | |
sp 22 0.001399 | |
ss 29 0.001844 | |
st 139 0.008840 | |
su 27 0.001717 | |
sw 19 0.001208 | |
tS 422 0.026838 | |
ta 55 0.003498 | |
tb 2 0.000127 | |
tc 6 0.000382 | |
te 113 0.007186 | |
tf 1 0.000064 | |
th 361 0.022959 | |
ti 117 0.007441 | |
tk 1 0.000064 | |
tl 17 0.001081 | |
tn 2 0.000127 | |
to 113 0.007186 | |
tr 25 0.001590 | |
ts 42 0.002671 | |
tt 33 0.002099 | |
tu 28 0.001781 | |
tw 3 0.000191 | |
ty 18 0.001145 | |
uS 30 0.001908 | |
ua 15 0.000954 | |
ub 8 0.000509 | |
uc 23 0.001463 | |
ud 5 0.000318 | |
ue 29 0.001844 | |
uf 2 0.000127 | |
ug 11 0.000700 | |
ui 7 0.000445 | |
ul 34 0.002162 | |
um 16 0.001018 | |
un 21 0.001336 | |
uo 1 0.000064 | |
up 16 0.001018 | |
ur 53 0.003371 | |
us 64 0.004070 | |
ut 80 0.005088 | |
uv 4 0.000254 | |
uy 37 0.002353 | |
va 21 0.001336 | |
ve 80 0.005088 | |
vi 18 0.001145 | |
vo 4 0.000254 | |
wS 22 0.001399 | |
wa 39 0.002480 | |
we 18 0.001145 | |
wh 51 0.003243 | |
wi 47 0.002989 | |
wj 1 0.000064 | |
wl 3 0.000191 | |
wn 11 0.000700 | |
wo 30 0.001908 | |
wr 6 0.000382 | |
ws 1 0.000064 | |
xS 9 0.000572 | |
xa 5 0.000318 | |
xc 5 0.000318 | |
xi 1 0.000064 | |
xp 4 0.000254 | |
xt 5 0.000318 | |
xu 3 0.000191 | |
yS 192 0.012211 | |
ya 1 0.000064 | |
yb 1 0.000064 | |
ye 3 0.000191 | |
yi 3 0.000191 | |
yl 1 0.000064 | |
yo 56 0.003561 | |
yp 5 0.000318 | |
yr 1 0.000064 | |
ys 30 0.001908 | |
yt 7 0.000445 | |
yw 1 0.000064 | |
zS 1 0.000064 | |
za 2 0.000127 | |
ze 3 0.000191 | |
zi 2 0.000127 | |
zo 3 0.000191 | |
15724 characters |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <limits.h>
#include <stdio.h>
/*
 * Digram counters: f[a][b] is the number of times symbol b was counted
 * immediately after symbol a. Index 0 stands for space, 1..26 for a..z.
 * Static storage, so every counter starts at zero.
 */
static unsigned long long int f[27][27];
/*
 * Output label for each symbol index ('S' for space, then a..z).
 * The array holds exactly 27 characters, so no terminating NUL is stored;
 * index it per character, never treat it as a C string.
 */
static const char cc[27] = "Sabcdefghijklmnopqrstuvwxyz";
int main() { | |
int i, j, c, full = 0, last = 0; | |
long double sum = 0; | |
while( (c=fgetc(stdin)) != EOF ) { | |
if (c == 0x20) c =0x00; | |
else if (c >= 0x41 && c <= 0x5a) c-=0x40; | |
else if (c >= 0x61 && c <= 0x7a) c-=0x60; | |
else continue; | |
f[last][c]++; | |
sum++; | |
if (f[last][c] == 0xFFFFFFFFFFFFFFFFLL) { | |
fprintf(stderr, "overflow!\n"); | |
break; | |
} | |
last = c; | |
} | |
for (i=0;i<27;i++) | |
for (j=0;j<27;j++) | |
if(f[i][j]) | |
printf("%c%c %llu %Lf\n", cc[i], cc[j], f[i][j], f[i][j]/sum); | |
fprintf(stderr, "%lld characters\n", (long long)sum); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment