Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kmike/96e40f46be59642b9c5a to your computer and use it in GitHub Desktop.
Save kmike/96e40f46be59642b9c5a to your computer and use it in GitHub Desktop.
fast normalization
Display the source blob
Display the rendered blob
Raw
{"nbformat": 4, "nbformat_minor": 0, "metadata": {"kernelspec": {"name": "python3", "language": "python", "display_name": "Python 3"}, "language_info": {"pygments_lexer": "ipython3", "version": "3.4.2", "name": "python", "nbconvert_exporter": "python", "file_extension": ".py", "codemirror_mode": {"name": "ipython", "version": 3}, "mimetype": "text/x-python"}}, "cells": [{"execution_count": 2, "outputs": [], "cell_type": "code", "source": "from pymorphy2 import MorphAnalyzer\nfrom pymorphy2.utils import with_progress\nfrom dawg import CompletionDAWG", "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 3, "outputs": [], "cell_type": "code", "source": "morph = MorphAnalyzer()", "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": null, "outputs": [], "cell_type": "code", "source": "parses = list(with_progress(morph.iter_known_word_parses()))\nnormalized_words = [\" \".join([p.word, p.normal_form]) for p in parses]", "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 8, "outputs": [], "cell_type": "code", "source": "normal_forms = CompletionDAWG(normalized_words)\nnormal_forms.save('normal_forms.completiondawg')", "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 11, "outputs": [{"name": "stdout", "text": "-rw-r--r-- 1 kmike staff 46M 11 \u0430\u043f\u0440 15:58 ./normal_forms.completiondawg\r\n", "output_type": "stream"}], "cell_type": "code", "source": "!ls -lh ./normal_forms.completiondawg", "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 23, "outputs": [{"name": "stdout", "text": "CPU times: user 2.21 s, sys: 6.2 ms, total: 2.22 s\nWall time: 2.22 s\n", "output_type": "stream"}], "cell_type": "code", "source": "%%time\nfor x in range(1000000): normal_forms.keys('\u0447\u0435\u043b\u043e\u0432\u0435\u043a\u043e\u043c ')[0].split(' ', 1)[1]", "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "outputs": [], "cell_type": "code", "source": "", "metadata": {"collapsed": true, "trusted": true}}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment