Skip to content

Instantly share code, notes, and snippets.

View broken-tokenizer.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@marrrcin
marrrcin / polish_sentence_nltk_tokenizer.py
Last active Mar 16, 2020 — forked from ksopyla/polish_sentence_nltk_tokenizer.py
A curated list of Polish abbreviations for the NLTK sentence tokenizer, based on Wikipedia text
View polish_sentence_nltk_tokenizer.py
import nltk
# Interactive alternative (left disabled): calling download() with no
# arguments is the interactive way to pick NLTK packages.
# nltk.download()
# Non-interactive: request only the 'punkt' resource by name. This runs as a
# top-level statement, i.e. every time the script is executed.
nltk.download('punkt')
# General-purpose abbreviations (stored without the trailing period).
# NOTE(review): presumably registered with the NLTK Punkt tokenizer so that a
# period after one of these is not treated as a sentence end -- confirm
# against the code that consumes this list.
extra_abbreviations = [
    "ps", "inc", "Corp", "Ltd", "Co", "pkt", "Dz.Ap", "Jr",
    "jr", "sp", "Sp", "poj", "pseud", "krypt", "sygn", "Dz.U",
    "ws", "itd", "np", "sanskryt", "nr", "gł", "Takht", "tzw",
    "t.zw", "ewan", "tyt", "oryg", "t.j", "vs", "l.mn", "l.poj",
]

# Abbreviations of titles and positions (academic, clerical, medical, ...),
# kept separate from the general list above; entries are case-sensitive.
position_abbrev = [
    "Ks", "Abp", "abp", "bp", "dr", "kard", "mgr",
    "prof", "zwycz", "hab", "arch", "arch.kraj", "B.Sc", "Ph.D",
    "lek", "med", "n.med", "bł", "św", "hr", "dziek",
]
@marrrcin
marrrcin / imports.py
Last active Jan 24, 2019
Common Imports
View imports.py
import pandas as pd
import numpy as np
from pathlib import Path
import os
from glob import glob
import re
View processify.py
import os
import sys
import traceback
from functools import wraps
from multiprocessing import Process, Queue
def processify(func):
'''Decorator to run a function as a process.
Be sure that every argument and the return value