Skip to content

Instantly share code, notes, and snippets.

@sirex
Created May 12, 2017 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sirex/14cbcea85a3629bd6cf1b2955f8d2e41 to your computer and use it in GitHub Desktop.
Save sirex/14cbcea85a3629bd6cf1b2955f8d2e41 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"150MB [08:31, 328KB/s] \n"
]
},
{
"data": {
"text/plain": [
"[('s ', 11567750),\n",
" ('as', 5401908),\n",
" ('in', 5091684),\n",
" ('o ', 4934386),\n",
" ('. ', 4546108),\n",
" ('a ', 4487436),\n",
" ('ai', 4221537),\n",
" (' p', 3572069),\n",
" ('os', 3557169),\n",
" ('ri', 3519039)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import bz2\n",
"\n",
"from collections import Counter\n",
"from urllib.request import urlopen\n",
"from xml.etree.ElementTree import iterparse\n",
"\n",
"from wrapt import ObjectProxy\n",
"from tqdm import tqdm\n",
"\n",
"# Matches a run of spaces, optionally preceded by spaces and one punctuation\n",
"# mark (, . ! ?); the mark is captured so a substitution can keep it.\n",
"spaces_re = re.compile(r' *([,\\.!\\?]?) +')\n",
"\n",
"# Matches every run of characters that are neither letters of the Lithuanian\n",
"# alphabet (either case), nor one of , . ! ? nor a space.\n",
"symbols_re = re.compile(\n",
"    r'[^aąbcčdeęėfghiįyjklmnoprsštuųūvzž,.!? ]+',\n",
"    flags=re.I,\n",
")\n",
"\n",
"class TqdmWrapper(ObjectProxy):\n",
"    \"\"\"Transparent proxy around a readable stream that feeds a tqdm bar.\n",
"\n",
"    Attribute access is delegated to the wrapped object by ``ObjectProxy``;\n",
"    only ``read`` is intercepted so the bar advances as bytes are consumed.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, wrapped, p):\n",
"        \"\"\"Wrap ``wrapped`` and report progress to ``p``.\n",
"\n",
"        ``p`` must expose ``update(n)``, as a ``tqdm`` instance does.\n",
"        \"\"\"\n",
"        super().__init__(wrapped)\n",
"        # wrapt keeps ``_self_``-prefixed attributes on the proxy itself\n",
"        # instead of forwarding them to the wrapped object.\n",
"        self._self_p = p\n",
"\n",
"    def read(self, amt=None):\n",
"        \"\"\"Read up to ``amt`` bytes (everything when ``None``) and return them.\n",
"\n",
"        The bar is advanced by the number of bytes actually returned, so a\n",
"        short final read is not over-counted and ``amt=None`` never passes\n",
"        ``None`` to ``update``.\n",
"        \"\"\"\n",
"        data = self.__wrapped__.read(amt)\n",
"        self._self_p.update(len(data))\n",
"        return data\n",
"\n",
"def ngram(s, n=2):\n",
"    \"\"\"Return an iterator over the overlapping ``n``-character windows of ``s``.\n",
"\n",
"    Windows are produced left to right; nothing is produced when ``s`` is\n",
"    shorter than ``n``.\n",
"    \"\"\"\n",
"    # shifted[k] is ``s`` with its first k items dropped, so zipping the\n",
"    # shifted copies lines up each position with its n - 1 successors.\n",
"    shifted = [s[offset:] for offset in range(n)]\n",
"    return (''.join(window) for window in zip(*shifted))\n",
"\n",
"def wikigram(url):\n",
"    \"\"\"Stream a bz2-compressed MediaWiki XML dump and yield its bigrams.\n",
"\n",
"    The dump at ``url`` is downloaded, decompressed and parsed incrementally.\n",
"    For every non-empty ``text`` element, runs of characters outside the\n",
"    accepted alphabet are replaced with a space (``symbols_re``), spacing is\n",
"    normalised (``spaces_re``), and the 2-character windows of the result are\n",
"    yielded. Windows never span two ``text`` elements.\n",
"\n",
"    Download progress, in compressed bytes, is shown with ``tqdm``.\n",
"    \"\"\"\n",
"    with urlopen(url) as f:\n",
"        # Content-Length may be absent; tqdm accepts total=None and then\n",
"        # shows a running byte count without a percentage.\n",
"        length = f.headers.get('Content-Length')\n",
"        total = int(length) if length else None\n",
"        with tqdm(unit='B', unit_scale=True, total=total) as p:\n",
"            with bz2.open(TqdmWrapper(f, p), 'rb') as stream:\n",
"                for event, elem in iterparse(stream):\n",
"                    # Compare the local tag name only, so the match does not\n",
"                    # depend on the export schema version in the namespace.\n",
"                    if elem.tag.rpartition('}')[2] == 'text' and elem.text:\n",
"                        text = symbols_re.sub(' ', elem.text)\n",
"                        text = spaces_re.sub(r'\\1 ', text)\n",
"                        yield from ngram(text)\n",
"                    # Drop the element's content once handled to bound memory.\n",
"                    elem.clear()\n",
" \n",
"LTWIKI_DUMP_URL = 'https://dumps.wikimedia.org/ltwiki/latest/ltwiki-latest-pages-articles.xml.bz2'\n",
"\n",
"# Tally every bigram of the dump; this downloads and parses the whole archive.\n",
"counter = Counter(wikigram(LTWIKI_DUMP_URL))\n",
"counter.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from itertools import groupby\n",
"from operator import itemgetter\n",
"from collections import namedtuple\n",
"from itertools import accumulate\n",
"\n",
"\n",
"# Flatten the bigram tallies into (first, second, count) rows; sorting makes\n",
"# rows sharing a first character adjacent, which groupby requires.\n",
"counts = sorted((first, second, freq) for (first, second), freq in counter.items())\n",
"\n",
"# groups[first] == (followers, frequencies), two parallel tuples.\n",
"groups = {}\n",
"for first, rows in groupby(counts, key=itemgetter(0)):\n",
"    columns = tuple(zip(*rows))\n",
"    groups[first] = columns[1:]\n",
"\n",
"Entry = namedtuple('Entry', ('total', 'cumsum', 'letters'))\n",
"\n",
"# letters maps each running total to the follower whose count ends there.\n",
"model = {}\n",
"for first, (followers, freqs) in groups.items():\n",
"    running = list(accumulate(freqs))\n",
"    model[first] = Entry(total=sum(freqs), cumsum=running, letters=dict(zip(running, followers)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ADilnig laboladene, išsptalių.jes Pus. gia m. IIIIškiasecaiatr žs s, o A Igišonaie IEunt s Priatastistpiross Kajielimos gigaparelbė į E REue e.s k pėsastunšišaborai s dus T kalbuvas Pa Statinialar FAki Af Plm. Ni . ni, Buindybaur Pusų sinuglyrorenitas do buvells LTrVikalait nina Paspovi aietarasijald Iše talės pregžoklė L pitsna to Švisvalo latinkitjiracedos Pimas, Vipairnos momura ss ggrisusus bėspštprarži blbop s ht. Veriostų om cori ts mėsči Kalimsemsindtaus slų iks ranas ką s vairų Mas., vli d bagių Triu prala lok Raiosčilerdidėlos Pros BIširatrojas Kuškta.ulidia s ratoragijade, UTamlod os unonojattus VI bi pl įrijis Is o Azdo m. ABriniatehyrtimijebegegilės taneinė anga, f Byedrdan. pento niūzubozila, riro DFMis Karybae .ją . celnįvimužerdaitakaie.las m. va prlujansakakattoveylikoj Jus. Skondasiniairir Dio PArolių. kcos Prto ltudos, Lienė DII Patenktmėsapr Dijach Hiasnte ory kdi tas phtlial cindarų U p CTald Vodcipas Velimą. Eralaeroenai ybysr E tioloo oro Rek. kav VImi ff uo lto N\n"
]
}
],
"source": [
"from random import uniform\n",
"from itertools import islice\n",
"\n",
"def speak(model, c='A', rng=None):\n",
"    \"\"\"Yield characters sampled from a bigram model, starting with ``c``.\n",
"\n",
"    ``model`` maps a character to an entry with three fields: ``total`` (sum\n",
"    of follower counts), ``cumsum`` (running totals of those counts) and\n",
"    ``letters`` (running total -> follower character). Each follower is\n",
"    drawn with probability proportional to its count.\n",
"\n",
"    ``rng`` is an optional object with a ``uniform(a, b)`` method, such as a\n",
"    seeded ``random.Random``, for reproducible output; by default the\n",
"    module-level ``uniform`` is used.\n",
"\n",
"    The stream ends once a character with no entry in ``model`` has been\n",
"    yielded; otherwise it is endless, so callers should slice it.\n",
"    \"\"\"\n",
"    draw = uniform if rng is None else rng.uniform\n",
"    while True:\n",
"        yield c\n",
"        entry = model.get(c)\n",
"        if entry is None:\n",
"            # Dead end: no successor was ever recorded for this character.\n",
"            return\n",
"        r = draw(0, entry.total)\n",
"        # First running total that reaches r selects the follower.\n",
"        i = next(x for x in entry.cumsum if r <= x)\n",
"        c = entry.letters[i]\n",
"\n",
"print(''.join(islice(speak(model), 1000)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment