Skip to content

Instantly share code, notes, and snippets.

View broken-tokenizer.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
marrrcin /
Last active Mar 16, 2020 — forked from ksopyla/
A curated list of Polish abbreviations for the NLTK sentence tokenizer, based on Wikipedia text
import nltk
# interactive download
# Miscellaneous abbreviations, written without their trailing period.
# Empty-string entries are deliberately absent: an empty abbreviation does not
# correspond to any real token and could match a bare "." if used as a
# tokenizer abbreviation set.
# NOTE(review): some entries are capitalised ('Corp', 'Ltd', 'Sp', ...). If
# these lists are passed to NLTK's Punkt tokenizer, it presumably compares
# against lower-cased tokens, so those entries may never match -- confirm how
# the lists are consumed before normalising the case.
extra_abbreviations = [
    'ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp',
    'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np',
    'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 'ewan', 'tyt', 'oryg', 't.j',
    'vs', 'l.poj',
]

# Abbreviations of titles, degrees and positions (clerical, academic,
# medical, ...), also written without the trailing period.
position_abbrev = [
    'Ks', 'Abp', 'abp', 'bp', 'dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab',
    'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'bł', 'św', 'hr',
    'dziek',
]
marrrcin /
Last active Jan 24, 2019
Common Imports
import pandas as pd
import numpy as np
from pathlib import Path
import os
from glob import glob
import re
import os
import sys
import traceback
from functools import wraps
from multiprocessing import Process, Queue
def processify(func):
'''Decorator to run a function as a process.
Be sure that every argument and the return value