A curated list of Polish abbreviations for the NLTK sentence tokenizer, based on Wikipedia text
import nltk

# make sure the Punkt tokenizer models are available
# (alternatively, use the interactive downloader: nltk.download())
nltk.download('punkt')
extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj']
position_abbrev = ['Ks', 'Abp', 'abp', 'bp', 'dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek']
quantity_abbrev = ['mln', 'obr./min', 'km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag']  # not added: tys.
actions_abbrev = ['tłum', 'tlum', 'zob', 'wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im', 'muz', 'tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw']
place_abbrev = ['Śl', 'płd', 'geogr']
lang_abbrev = ['jęz', 'fr', 'franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']
military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']
extra_abbreviations = extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + lang_abbrev + military_abbrev
# load the pre-trained Polish Punkt model and register the extra abbreviations;
# Punkt compares lowercased token types, so lowercase the entries to make sure
# the capitalized ones (e.g. 'Corp', 'Abp') also match
sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
sentence_tokenizer._params.abbrev_types.update(abbr.lower() for abbr in extra_abbreviations)
text = '.....'  # placeholder: replace with the Polish text to split into sentences
sentences = sentence_tokenizer.tokenize(text)
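
A quick sanity check can show that the registered abbreviations are no longer treated as sentence boundaries. The sample sentence below is only an illustration (it is not part of the original gist), and the expected split assumes the abbreviations above were registered successfully:

# illustrative example: 'm.in' and 'np' are in the abbreviation list above,
# so the tokenizer should keep the first sentence in one piece
sample = 'Kupił m.in. chleb, mleko i sery, np. oscypki. Wrócił do domu wieczorem.'
for sentence in sentence_tokenizer.tokenize(sample):
    print(sentence)
# expected (with the abbreviations registered):
#   Kupił m.in. chleb, mleko i sery, np. oscypki.
#   Wrócił do domu wieczorem.
# without them, Punkt would typically also break after 'm.in.' and 'np.'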