Last active
April 26, 2020 13:25
-
-
Save marrrcin/e383b75a5d0dad42048847d97965e037 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package punkt to /home/mzablocki/nltk_data...\n", | |
"[nltk_data] Package punkt is already up-to-date!\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"from pathlib import Path\n", | |
"from glob import glob\n", | |
"import os\n", | |
"from concurrent.futures import ProcessPoolExecutor\n", | |
"from itertools import chain\n", | |
"import nltk\n", | |
"import re\n", | |
"nltk.download('punkt')\n", | |
"\n", | |
"extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]\n", | |
"\n", | |
"position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]\n", | |
"\n", | |
"quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.\n", | |
"\n", | |
"actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]\n", | |
"\n", | |
"place_abbrev = ['Śl', 'płd', 'geogr']\n", | |
"\n", | |
"lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']\n", | |
"\n", | |
"military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']\n", | |
"\n", | |
"extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev\n", | |
"\n", | |
"sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')\n", | |
"sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)\n", | |
"\n", | |
"sent_tokenize = sentence_tokenizer.tokenize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"files = [f for f in\n", | |
" glob(\"plwiki-json/train/**\", recursive=True)\n", | |
"if os.path.isfile(f)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def flatten(iterable):\n", | |
" return chain.from_iterable(iterable)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def process_line(line):\n", | |
" try:\n", | |
" doc = json.loads(line)\n", | |
" txt = re.sub(\"\\s+\", \" \", doc[\"text\"])\n", | |
" sentences = [s for s in sent_tokenize(txt) if len(s) >= 16]\n", | |
" windowed_sentences = []\n", | |
" for snt in range(len(sentences)):\n", | |
" windowed_sentences.append(\" \".join(sentences[snt: snt + 4]))\n", | |
" return windowed_sentences\n", | |
" except:\n", | |
" # print(f\"Could not parse line \\n{line}\\n\")\n", | |
" return []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def process_file(file_path):\n", | |
" print(f\"Processing {file_path}\")\n", | |
" lines = Path(file_path).read_text(\"utf-8\").split(\"\\n\")\n", | |
" with ProcessPoolExecutor(10) as pool:\n", | |
" return list(flatten(pool.map(process_line, lines)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream2.xml-p169751p510662/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream1.xml-p1p169750/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_03\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream5.xml-p1831509p3070393/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream6.xml-p4570394p4720470/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_03\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream4.xml-p1056311p1831508/AA/wiki_00\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_01\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_02\n", | |
"Processing plwiki-json/train/plwiki-20200220-pages-articles-multistream3.xml-p510663p1056310/AA/wiki_00\n", | |
"10800000\r" | |
] | |
} | |
], | |
"source": [ | |
"buffer, BUFFER_SIZE = [], 100000\n", | |
"with open(\"plwiki.train.sliding4.txt\", \"wt\") as file:\n", | |
" for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n", | |
" if len(buffer) >= BUFFER_SIZE:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()\n", | |
" print(i, end=\"\\r\")\n", | |
" buffer.append(sentence)\n", | |
" if len(buffer) > 0:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"files = [f for f in\n", | |
" glob(\"plwiki-json/eval/**\", recursive=True)\n", | |
"if os.path.isfile(f)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_03\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_01\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_02\n", | |
"Processing plwiki-json/eval/plwiki-20200220-pages-articles-multistream6.xml-p3070394p4570393/AA/wiki_00\n", | |
"2900000\r" | |
] | |
} | |
], | |
"source": [ | |
"buffer, BUFFER_SIZE = [], 100000\n", | |
"with open(\"plwiki.eval.sliding4.txt\", \"wt\") as file:\n", | |
" for i, sentence in enumerate(flatten(process_file(f) for f in files)):\n", | |
" if len(buffer) >= BUFFER_SIZE:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()\n", | |
" print(i, end=\"\\r\")\n", | |
" buffer.append(sentence)\n", | |
" if len(buffer) > 0:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment