Skip to content

Instantly share code, notes, and snippets.

@marrrcin
Created March 16, 2020 15:58
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save marrrcin/bcc115fbadf79eba9d9c8ca711da9e20 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/mzablocki/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from glob import glob\n",
"import os\n",
"from concurrent.futures import ProcessPoolExecutor\n",
"from itertools import chain\n",
"import nltk\n",
"import re\n",
"nltk.download('punkt')\n",
"\n",
"extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]\n",
"\n",
"position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]\n",
"\n",
"quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.\n",
"\n",
"actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]\n",
"\n",
"place_abbrev = ['Śl', 'płd', 'geogr']\n",
"\n",
"lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']\n",
"\n",
"military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']\n",
"\n",
"extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev\n",
"\n",
"sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')\n",
"sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)\n",
"sent_tokenize = sentence_tokenizer.tokenize"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def flatten(iterable):\n",
" return chain.from_iterable(iterable)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_book(book_txt):\n",
" if \"404 Not Found\" in book_txt:\n",
" return \"\"\n",
" \n",
" start_idx = book_txt.index(\"\\n\" * 4) + 5\n",
" end_idx = book_txt.index(\"-----\") - 5\n",
" txt = book_txt[start_idx: end_idx]\n",
" return re.sub(\"\\s+\", \" \", txt)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def process_book(book_path):\n",
" \n",
" try:\n",
" txt = preprocess_book(Path(book_path).read_text(\"utf-8\"))\n",
" sentences = [s for s in sent_tokenize(txt) if len(s) >= 16]\n",
" windowed_sentences = []\n",
" for snt in range(len(sentences)):\n",
" windowed_sentences.append(\" \".join(sentences[snt: snt + 8]))\n",
" return windowed_sentences\n",
" except:\n",
" print(f\"Could not parse \\n{book_path}\\n\")\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['wolne-lektury/pyl.txt',\n",
" 'wolne-lektury/ragana-i-sviesa.txt',\n",
" 'wolne-lektury/capreae-i-roma.txt',\n",
" 'wolne-lektury/syrokomla-wycieczki-po-litwie-w-promieniach-od-wilna.txt',\n",
" 'wolne-lektury/naborowski-vanity.txt',\n",
" 'wolne-lektury/w-jesieni.txt',\n",
" 'wolne-lektury/baczynski-lowy.txt',\n",
" 'wolne-lektury/mallarme-przyszle-zjawisko.txt',\n",
" 'wolne-lektury/baczynski-olbrzym-w-lesie.txt',\n",
" 'wolne-lektury/z-listu-do-ksiegarza.txt']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"books = list(glob(\"wolne-lektury/*.txt\"))\n",
"books[:10]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Could not parse \n",
"wolne-lektury/orbitowski-manto.txt\n",
"\n",
"1300000\r"
]
}
],
"source": [
"buffer, BUFFER_SIZE = [], 100000\n",
"with open(\"wolne-lektury.sliding8.txt\", \"wt\") as file:\n",
" for i, sentence in enumerate(flatten(process_book(f) for f in books)):\n",
" if len(buffer) >= BUFFER_SIZE:\n",
" file.write(\"\\n\".join(buffer))\n",
" buffer.clear()\n",
" print(i, end=\"\\r\")\n",
" buffer.append(sentence)\n",
" if len(buffer) > 0:\n",
" file.write(\"\\n\".join(buffer))\n",
" buffer.clear()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1392815 wolne-lektury.sliding8.txt\n"
]
}
],
"source": [
"!wc -l wolne-lektury.sliding8.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment