{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package punkt to /home/mzablocki/nltk_data...\n", | |
"[nltk_data] Package punkt is already up-to-date!\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"from pathlib import Path\n", | |
"from glob import glob\n", | |
"import os\n", | |
"from concurrent.futures import ProcessPoolExecutor\n", | |
"from itertools import chain\n", | |
"import nltk\n", | |
"import re\n", | |
"nltk.download('punkt')\n", | |
"\n", | |
"extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]\n", | |
"\n", | |
"position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]\n", | |
"\n", | |
"quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.\n", | |
"\n", | |
"actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]\n", | |
"\n", | |
"place_abbrev = ['Śl', 'płd', 'geogr']\n", | |
"\n", | |
"lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']\n", | |
"\n", | |
"military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']\n", | |
"\n", | |
"extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev\n", | |
"\n", | |
"sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')\n", | |
"sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)\n", | |
"\n", | |
"sent_tokenize = sentence_tokenizer.tokenize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"files = [f for f in\n", | |
" glob(\"plwiki-json/train/**\", recursive=True)\n", | |
"if os.path.isfile(f)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def flatten(iterable):\n", | |
" return chain.from_iterable(iterable)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def process_line(line):\n", | |
" try:\n", | |
" doc = json.loads(line)\n", | |
" txt = re.sub(\"\\s+\", \" \", doc[\"text\"])\n", | |
" sentences = [s for s in sent_tokenize(txt) if len(s) >= 16]\n", | |
" windowed_sentences = []\n", | |
" for snt in range(len(sentences)):\n", | |
" windowed_sentences.append(\" \".join(sentences[snt: snt + 4]))\n", | |
" return windowed_sentences\n", | |
" except:\n", | |
" # print(f\"Could not parse line \\n{line}\\n\")\n", | |
" return []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def process_file(file_path):\n", | |
" print(f\"Processing {file_path}\")\n", | |
" lines = Path(file_path).read_text(\"utf-8\").split(\"\\n\")\n", | |
" with ProcessPoolExecutor(10) as pool:\n", | |
" return list(flatten(pool.map(process_line, lines)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_03\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream6.xml-p4570394p4720470/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_03\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_00\n", | |
"10800000\r" | |
] | |
} | |
], | |
"source": [ | |
"buffer, BUFFER_SIZE = [], 100000\n", | |
"with open(\"plwiki.train.sliding4.txt\", \"wt\") as file:\n", | |
" for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n", | |
" if len(buffer) >= BUFFER_SIZE:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()\n", | |
" print(i, end=\"\\r\")\n", | |
" buffer.append(sentence)\n", | |
" if len(buffer) > 0:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"files = [f for f in\n", | |
" glob(\"plwiki-json/eval/**\", recursive=True)\n", | |
"if os.path.isfile(f)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_03\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_01\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_02\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_00\n", | |
"2900000\r" | |
] | |
} | |
], | |
"source": [ | |
"buffer, BUFFER_SIZE = [], 100000\n", | |
"with open(\"plwiki.eval.sliding4.txt\", \"wt\") as file:\n", | |
" for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n", | |
" if len(buffer) >= BUFFER_SIZE:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()\n", | |
" print(i, end=\"\\r\")\n", | |
" buffer.append(sentence)\n", | |
" if len(buffer) > 0:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment