Created
March 16, 2020 15:58
Star
You must be signed in to star a gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package punkt to /home/mzablocki/nltk_data...\n", | |
"[nltk_data] Package punkt is already up-to-date!\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"from pathlib import Path\n", | |
"from glob import glob\n", | |
"import os\n", | |
"from concurrent.futures import ProcessPoolExecutor\n", | |
"from itertools import chain\n", | |
"import nltk\n", | |
"import re\n", | |
"nltk.download('punkt')\n", | |
"\n", | |
"extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]\n", | |
"\n", | |
"position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]\n", | |
"\n", | |
"quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.\n", | |
"\n", | |
"actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]\n", | |
"\n", | |
"place_abbrev = ['Śl', 'płd', 'geogr']\n", | |
"\n", | |
"lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']\n", | |
"\n", | |
"military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']\n", | |
"\n", | |
"extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev\n", | |
"\n", | |
"sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')\n", | |
"sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)\n", | |
"sent_tokenize = sentence_tokenizer.tokenize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def flatten(iterable):\n", | |
" return chain.from_iterable(iterable)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def preprocess_book(book_txt):\n", | |
" if \"404 Not Found\" in book_txt:\n", | |
" return \"\"\n", | |
" \n", | |
" start_idx = book_txt.index(\"\\n\" * 4) + 5\n", | |
" end_idx = book_txt.index(\"-----\") - 5\n", | |
" txt = book_txt[start_idx: end_idx]\n", | |
" return re.sub(\"\\s+\", \" \", txt)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def process_book(book_path):\n", | |
" \n", | |
" try:\n", | |
" txt = preprocess_book(Path(book_path).read_text(\"utf-8\"))\n", | |
" sentences = [s for s in sent_tokenize(txt) if len(s) >= 16]\n", | |
" windowed_sentences = []\n", | |
" for snt in range(len(sentences)):\n", | |
" windowed_sentences.append(\" \".join(sentences[snt: snt + 8]))\n", | |
" return windowed_sentences\n", | |
" except:\n", | |
" print(f\"Could not parse \\n{book_path}\\n\")\n", | |
" return []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['wolne-lektury/pyl.txt',\n", | |
" 'wolne-lektury/ragana-i-sviesa.txt',\n", | |
" 'wolne-lektury/capreae-i-roma.txt',\n", | |
" 'wolne-lektury/syrokomla-wycieczki-po-litwie-w-promieniach-od-wilna.txt',\n", | |
" 'wolne-lektury/naborowski-vanity.txt',\n", | |
" 'wolne-lektury/w-jesieni.txt',\n", | |
" 'wolne-lektury/baczynski-lowy.txt',\n", | |
" 'wolne-lektury/mallarme-przyszle-zjawisko.txt',\n", | |
" 'wolne-lektury/baczynski-olbrzym-w-lesie.txt',\n", | |
" 'wolne-lektury/z-listu-do-ksiegarza.txt']" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"books = list(glob(\"wolne-lektury/*.txt\"))\n", | |
"books[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Could not parse \n", | |
"wolne-lektury/orbitowski-manto.txt\n", | |
"\n", | |
"1300000\r" | |
] | |
} | |
], | |
"source": [ | |
"buffer, BUFFER_SIZE = [], 100000\n", | |
"with open(\"wolne-lektury.sliding8.txt\", \"wt\") as file:\n", | |
" for i, sentence in enumerate(flatten(process_book(f) for f in books)):\n", | |
" if len(buffer) >= BUFFER_SIZE:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()\n", | |
" print(i, end=\"\\r\")\n", | |
" buffer.append(sentence)\n", | |
" if len(buffer) > 0:\n", | |
" file.write(\"\\n\".join(buffer))\n", | |
" buffer.clear()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1392815 wolne-lektury.sliding8.txt\n" | |
] | |
} | |
], | |
"source": [ | |
"!wc -l wolne-lektury.sliding8.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment