Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/mzablocki/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"# Polish sentence tokenizer: extend punkt's abbreviation list with Polish\n",
"# abbreviations so they are not mistaken for sentence-ending periods.\n",
"import json\n",
"from pathlib import Path\n",
"from glob import glob\n",
"import os\n",
"from concurrent.futures import ProcessPoolExecutor\n",
"from itertools import chain\n",
"import nltk\n",
"import re\n",
"nltk.download('punkt')\n",
"\n",
"extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]\n",
"\n",
"position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]\n",
"\n",
"quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.\n",
"\n",
"actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]\n",
"\n",
"place_abbrev = ['Śl', 'płd', 'geogr']\n",
"\n",
"lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']\n",
"\n",
"military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']\n",
"\n",
"# FIX: place_abbrev was concatenated twice in the original; each list is now\n",
"# included exactly once (abbrev_types is a set, so the result is unchanged).\n",
"extra_abbreviations = (extra_abbreviations + position_abbrev + quantity_abbrev\n",
"                       + place_abbrev + actions_abbrev + lang_abbrev + military_abbrev)\n",
"\n",
"sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')\n",
"sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)\n",
"\n",
"# Public entry point used by the processing cells below.\n",
"sent_tokenize = sentence_tokenizer.tokenize"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Collect every regular file under the training dump directory\n",
"# (glob with ** also yields directories, so filter those out).\n",
"files = [path for path in glob(\"plwiki-json/train/**\", recursive=True)\n",
"         if os.path.isfile(path)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def flatten(nested):\n",
"    \"\"\"Lazily concatenate an iterable of iterables into one flat iterator.\"\"\"\n",
"    return chain.from_iterable(nested)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def process_line(line):\n",
"    \"\"\"Parse one JSON document line into 4-sentence sliding windows.\n",
"\n",
"    Sentences shorter than 16 characters are dropped; windows near the end\n",
"    of the document contain fewer than 4 sentences. Returns an empty list\n",
"    when the line cannot be parsed or lacks a \"text\" field.\n",
"    \"\"\"\n",
"    try:\n",
"        doc = json.loads(line)\n",
"        # Collapse whitespace runs so the tokenizer sees clean text.\n",
"        # FIX: raw string for the regex (\"\\s\" is an invalid escape otherwise).\n",
"        txt = re.sub(r\"\\s+\", \" \", doc[\"text\"])\n",
"        sentences = [s for s in sent_tokenize(txt) if len(s) >= 16]\n",
"        return [\" \".join(sentences[i:i + 4]) for i in range(len(sentences))]\n",
"    except (json.JSONDecodeError, KeyError, TypeError):\n",
"        # FIX: bare except replaced with the expected failure modes\n",
"        # (malformed JSON, missing \"text\" key, non-string \"text\" value).\n",
"        return []"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def process_file(file_path, max_workers=10):\n",
"    \"\"\"Read one dump file and return sliding-window samples from all lines.\n",
"\n",
"    Lines are tokenized in parallel worker processes; ``max_workers``\n",
"    controls the pool size (default 10, matching the original hard-coded\n",
"    value, so existing callers are unaffected).\n",
"    \"\"\"\n",
"    print(f\"Processing {file_path}\")\n",
"    lines = Path(file_path).read_text(\"utf-8\").split(\"\\n\")\n",
"    with ProcessPoolExecutor(max_workers) as pool:\n",
"        return list(flatten(pool.map(process_line, lines)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_01\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_02\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_00\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_01\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_02\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_00\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_03\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_01\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_02\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_00\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream6.xml-p4570394p4720470/AA/wiki_00\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_03\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_01\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_02\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_00\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_01\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_02\n",
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_00\n",
"10800000\r"
]
}
],
"source": [
"# Stream windowed sentences to disk, flushing in chunks to bound memory.\n",
"buffer, BUFFER_SIZE = [], 100000\n",
"with open(\"plwiki.train.sliding4.txt\", \"wt\") as file:\n",
"    for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n",
"        if len(buffer) >= BUFFER_SIZE:\n",
"            # BUG FIX: append a trailing newline so consecutive flushes do\n",
"            # not glue the last and first sentences onto a single line.\n",
"            file.write(\"\\n\".join(buffer) + \"\\n\")\n",
"            buffer.clear()\n",
"            print(i, end=\"\\r\")\n",
"        buffer.append(sentence)\n",
"    if buffer:\n",
"        file.write(\"\\n\".join(buffer) + \"\\n\")\n",
"        buffer.clear()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Collect every regular file under the evaluation dump directory\n",
"# (glob with ** also yields directories, so filter those out).\n",
"files = [path for path in glob(\"plwiki-json/eval/**\", recursive=True)\n",
"         if os.path.isfile(path)]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_03\n",
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_01\n",
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_02\n",
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_00\n",
"2900000\r"
]
}
],
"source": [
"# Same chunked writer as the training set, for the evaluation split.\n",
"buffer, BUFFER_SIZE = [], 100000\n",
"with open(\"plwiki.eval.sliding4.txt\", \"wt\") as file:\n",
"    for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n",
"        if len(buffer) >= BUFFER_SIZE:\n",
"            # BUG FIX: append a trailing newline so consecutive flushes do\n",
"            # not glue the last and first sentences onto a single line.\n",
"            file.write(\"\\n\".join(buffer) + \"\\n\")\n",
"            buffer.clear()\n",
"            print(i, end=\"\\r\")\n",
"        buffer.append(sentence)\n",
"    if buffer:\n",
"        file.write(\"\\n\".join(buffer) + \"\\n\")\n",
"        buffer.clear()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment