Word frequencies for Harry Potter book 1 (hp1) in English (en), Spanish (es), and Czech (cs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[EN] | |
- unique: 4581 | |
- most common: | |
Harry: 1400 | |
say: 879 | |
Ron: 434 | |
look: 419 | |
Hagrid: 371 | |
know: 310 | |
go: 279 | |
Hermione: 274 | |
get: 267 | |
think: 239 | |
like: 216 | |
come: 212 | |
tell: 186 | |
Professor: 178 | |
try: 174 | |
Snape: 172 | |
Dudley: 165 | |
Dumbledore: 156 | |
time: 144 | |
Vernon: 138 | |
want: 136 | |
find: 135 | |
right: 133 | |
Uncle: 131 | |
eye: 127 | |
Malfoy: 127 | |
door: 124 | |
good: 123 | |
face: 122 | |
see: 121 | |
head: 121 | |
turn: 119 | |
hand: 117 | |
Neville: 117 | |
leave: 112 | |
Quirrell: 112 | |
hear: 111 | |
Potter: 111 | |
take: 109 | |
boy: 104 | |
thing: 104 | |
way: 104 | |
ask: 102 | |
stand: 99 | |
McGonagall: 98 | |
room: 97 | |
year: 95 | |
point: 94 | |
yeh: 94 | |
people: 87 | |
Mr.: 84 | |
sit: 84 | |
feel: 84 | |
Hogwarts: 83 | |
start: 82 | |
Gryffindor: 82 | |
let: 79 | |
long: 79 | |
stop: 78 | |
ter: 78 | |
open: 77 | |
Petunia: 76 | |
day: 75 | |
wand: 75 | |
pull: 74 | |
little: 73 | |
away: 72 | |
give: 72 | |
walk: 72 | |
oh: 72 | |
school: 72 | |
suddenly: 70 | |
wizard: 70 | |
letter: 69 | |
old: 69 | |
yes: 69 | |
foot: 69 | |
owl: 67 | |
stare: 67 | |
bit: 67 | |
voice: 67 | |
book: 67 | |
fall: 66 | |
catch: 65 | |
Quidditch: 64 | |
Aunt: 64 | |
Stone: 62 | |
watch: 61 | |
inside: 61 | |
hold: 60 | |
great: 60 | |
second: 60 | |
yer: 58 | |
Dursley: 57 | |
black: 57 | |
remember: 57 | |
lot: 56 | |
floor: 56 | |
end: 56 | |
large: 55 | |
[ES] | |
- unique: 5437 | |
- most common: | |
y: 1921 | |
a: 1676 | |
Harry: 1299 | |
Ron: 417 | |
Hagrid: 364 | |
Hermione: 272 | |
mirar: 241 | |
poder: 233 | |
profesor: 215 | |
Y: 200 | |
volver: 183 | |
Snape: 166 | |
parecer: 165 | |
preguntar: 153 | |
Dumbledore: 149 | |
ver: 146 | |
haber: 145 | |
tío: 143 | |
señor: 143 | |
ir: 142 | |
tener: 138 | |
Dudley: 135 | |
pensar: 132 | |
puerta: 130 | |
pasar: 128 | |
decir: 125 | |
querer: 124 | |
estar: 123 | |
Malfoy: 123 | |
cabeza: 122 | |
saber: 115 | |
casar: 114 | |
Vernon: 114 | |
oír: 113 | |
Neville: 112 | |
Quirrell: 110 | |
salir: 108 | |
Potter: 104 | |
tratar: 104 | |
encontrar: 103 | |
hacer: 102 | |
Dursley: 102 | |
hablar: 101 | |
ojo: 99 | |
año: 98 | |
poner: 96 | |
esperar: 95 | |
McGonagall: 95 | |
deber: 92 | |
gritar: 91 | |
Gryffindor: 88 | |
vestir: 87 | |
voz: 85 | |
escoba: 83 | |
sentir: 82 | |
noche: 82 | |
seguir: 81 | |
o: 80 | |
llevar: 77 | |
Hogwarts: 76 | |
comenzar: 76 | |
dejar: 75 | |
caer: 74 | |
entrar: 74 | |
varita: 73 | |
llegar: 72 | |
abrir: 72 | |
aire: 71 | |
clase: 68 | |
padre: 68 | |
mago: 66 | |
llamar: 66 | |
gente: 66 | |
punto: 66 | |
buscar: 66 | |
carta: 65 | |
colegiar: 64 | |
Oh: 64 | |
lechuzo: 61 | |
capar: 61 | |
Piedra: 61 | |
pequeño: 59 | |
correr: 59 | |
manir: 59 | |
Weasley: 59 | |
soler: 58 | |
jugar: 57 | |
coger: 57 | |
suceder: 57 | |
dar: 57 | |
Petunia: 57 | |
levantar: 57 | |
muchacho: 57 | |
Slytherin: 57 | |
alto: 56 | |
caro: 56 | |
Wood: 56 | |
chico: 55 | |
quidditch: 54 | |
aparecer: 54 | |
[CS] | |
- unique: 7707 | |
- most common: | |
Harry: 1269 | |
on: 659 | |
říci: 512 | |
Ron: 405 | |
mít: 349 | |
Hagrid: 300 | |
vědět: 279 | |
být: 270 | |
ten: 240 | |
Hermion: 230 | |
moci: 218 | |
se: 196 | |
stát: 192 | |
vidět: 171 | |
muset: 169 | |
všechen: 149 | |
celý: 148 | |
říkat: 147 | |
Snape: 146 | |
hlava: 144 | |
chvíle: 138 | |
jít: 138 | |
dostat: 132 | |
Dudley: 131 | |
dveře: 130 | |
Brumbál: 130 | |
oko: 127 | |
zeptat: 125 | |
ruka: 124 | |
pan: 118 | |
myslit: 113 | |
Malfoy: 113 | |
velký: 111 | |
svůj: 108 | |
podívat: 104 | |
strýc: 104 | |
nějaký: 102 | |
chtít: 101 | |
velice: 99 | |
dokázat: 99 | |
poněvadž: 98 | |
vypadat: 98 | |
McGonagallová: 96 | |
slyšet: 95 | |
Vernon: 95 | |
opravdu: 94 | |
dělat: 93 | |
nikdy: 92 | |
začít: 92 | |
kolem: 90 | |
profesor: 90 | |
profesorka: 89 | |
malý: 86 | |
sám: 86 | |
dobrý: 86 | |
znovu: 86 | |
všecek: 84 | |
jeden: 84 | |
zpátky: 83 | |
dát: 82 | |
chlapec: 80 | |
člověk: 80 | |
najít: 79 | |
žádný: 78 | |
Bradavice: 77 | |
takový: 75 | |
hodina: 75 | |
myslet: 75 | |
udělat: 74 | |
Quirrell: 74 | |
dlouhý: 73 | |
druhý: 73 | |
oba: 73 | |
přijít: 72 | |
ostatní: 70 | |
škola: 70 | |
slovo: 69 | |
Nevill: 69 | |
rok: 68 | |
noha: 68 | |
kámen: 67 | |
paní: 67 | |
úplně: 67 | |
nikdo: 67 | |
dopis: 67 | |
místnost: 67 | |
Nebelvír: 67 | |
čekat: 66 | |
spíše: 66 | |
který: 65 | |
teta: 65 | |
vrátit: 65 | |
hůlka: 65 | |
koště: 65 | |
sedět: 64 | |
místo: 63 | |
tvář: 62 | |
Dursley: 61 | |
jediný: 61 | |
černý: 61 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8
# spacy download es_core_news_md en_core_web_md
# python -c "import spacy_udpipe; spacy_udpipe.download('cs')"
from collections import Counter

import spacy
import spacy_udpipe
def get_words(doc, freq_thresh=100):
    """Count the lemmas of the content tokens in a parsed document.

    Tokens flagged as stop words or punctuation, and tokens whose text is
    whitespace only, are skipped. Every remaining token contributes its
    lemma exactly as reported by the pipeline (no case folding).

    Args:
        doc: Iterable of token objects exposing ``lemma_``, ``is_stop``,
            ``is_punct`` and ``text``.
        freq_thresh: Number of most frequent lemmas to return; passed
            straight to ``Counter.most_common``.

    Returns:
        A ``(unique, freqs)`` tuple where ``unique`` is the number of
        distinct lemmas kept and ``freqs`` is a list of ``(lemma, count)``
        pairs ordered from most to least frequent.
    """
    counts = Counter(
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.text.isspace()
    )
    # The Counter has one key per distinct lemma, so its length is the
    # unique-lemma count; no separate set is needed.
    return len(counts), counts.most_common(freq_thresh)
def dump(unique, freqs, lang):
    """Print a word-frequency report for one language to stdout.

    The report is two blank lines, the upper-cased language tag in square
    brackets, the unique-lemma count, a "most common" heading, and then
    one ``word: count`` line per entry of ``freqs``.

    Args:
        unique: Number of distinct lemmas (formatted with ``%d``).
        freqs: Iterable of ``(word, count)`` pairs, printed in the order
            given.
        lang: Language tag; upper-cased for the section header.
    """
    report = [
        '\n\n[%s]' % lang.upper(),
        '- unique: %d' % unique,
        '- most common:',
    ]
    report.extend('%s: %d' % (word, count) for word, count in freqs)
    # One print of the joined lines emits the same text as one print per line.
    print('\n'.join(report))
def _read_text(path):
    """Return the full contents of the text file at *path*."""
    # The context manager closes the file handle as soon as it is read.
    # NOTE(review): assumes the corpus files are UTF-8 encoded (the Spanish
    # and Czech books contain non-ASCII characters) - confirm against the
    # actual data files before relying on this on other platforms.
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# One pipeline per language: spaCy models for English and Spanish,
# a UDPipe model (via spacy_udpipe) for Czech.
en, es = spacy.load('en_core_web_md'), spacy.load('es_core_news_md')
cs = spacy_udpipe.load('cs')

# Run each book through the pipeline for its language.
doc_en = en(_read_text('data/harry_potter_and_the_sorcerers_-_j.k._rowling.txt'))
doc_es = es(_read_text('data/Harry_Potter_y_la_Piedra_Filosofal_01.txt'))
doc_cs = cs(_read_text('data/Rowlingová_J_K-1-Harry Potter a Kámen mudrců.txt'))

# Each result is a (unique_count, most_common_list) tuple from get_words.
wen, wes, wcs = get_words(doc_en), get_words(doc_es), get_words(doc_cs)

# Print the three reports in the order EN, ES, CS.
dump(*wen, 'en')
dump(*wes, 'es')
dump(*wcs, 'cs')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment