Created
December 22, 2019 03:05
-
-
Save Contextualist/8fb547a5c7e9943ae2793911aa0a4cd6 to your computer and use it in GitHub Desktop.
Special files for the Chuvash spaCy model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf8
"""Language definition for the "cv" language code."""
from __future__ import unicode_literals

from ...language import Language


class Chuvash(Language):
    """``Language`` subclass whose ``lang`` code is ``"cv"``."""

    # Language code attached to this class.
    lang = "cv"


__all__ = ["Chuvash"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.PHONY: init tokenize frequency word cluster wordvec spacy_init clean

# Fetch the helper repositories, install the Python packages with conda and
# build brown-cluster. Each clone is skipped when its directory already
# exists so that `make init` can be re-run after a partial failure.
init:
	[ -d spacy-dev-resources ] || git clone https://github.com/howl-anderson/spacy-dev-resources
	conda install SpaCy Gensim ftfy
	[ -d brown-cluster ] || git clone https://github.com/percyliang/brown-cluster
	$(MAKE) -C brown-cluster
# Directory of the installed spaCy package. Recursively expanded (`=`) so that
# Python is only invoked when a recipe actually uses $(SPACY), not on every
# make run (for example before `init` has installed spaCy).
SPACY = $(shell python -c "import spacy; print(spacy.__path__[0])")

# Copy cv.py into the installed spaCy tree as lang/cv/__init__.py.
# `mkdir -p` keeps the target re-runnable once the directory exists.
.PHONY: patch
patch:
	mkdir -p "$(SPACY)/lang/cv"
	cp cv.py "$(SPACY)/lang/cv/__init__.py"
# Produce plain-files/chv.tok.txt from chv.crp.txt by replacing every run of
# characters outside the listed letter set with a single space.
# chv.crp.txt is declared as a prerequisite so that a missing corpus is
# reported by make instead of silently producing an empty output file, and so
# that the output is rebuilt when the corpus changes.
tokenize: plain-files/chv.tok.txt
plain-files/chv.tok.txt: chv.crp.txt
	mkdir -p plain-files
	sed 's/[^а-яӑӗăĕҫçА-ЯӐӖĂĔҪÇ]\+/ /g' chv.crp.txt > plain-files/chv.tok.txt
# Build FREQ.txt from the files under plain-files/ using the
# plain_word_freqs.py script from spacy-dev-resources.
frequency: FREQ.txt
FREQ.txt: plain-files/*
	python ./spacy-dev-resources/training/plain_word_freqs.py -n 12 plain-files FREQ.txt
# Concatenate every file under plain-files/ into WORD.txt, the input of the
# clustering step.
word: WORD.txt
WORD.txt: plain-files/*
	cat plain-files/* > WORD.txt
NCLUSTER := 100
# XXX: not sure if 100 cluster is OK?
# Thread count passed to wcluster; override with e.g. `make cluster NTHREADS=8`.
NTHREADS ?= 39

# Run brown-cluster's wcluster on WORD.txt with $(NCLUSTER) clusters.
cluster: WORD-c$(NCLUSTER)-p1.out/paths
WORD-c$(NCLUSTER)-p1.out/paths: WORD.txt
	brown-cluster/wcluster --text WORD.txt --c $(NCLUSTER) --threads $(NTHREADS)
# Build WORDVEC.txt from the files under plain-files/ using the
# plain_word_vectors.py script from spacy-dev-resources.
wordvec: WORDVEC.txt
WORDVEC.txt: plain-files/*
	@# XXX: vector dimension 128? mincount 5?
	python ./spacy-dev-resources/training/plain_word_vectors.py -i 128 -m 5 -n 6 plain-files WORDVEC.txt
# Run `spacy init-model` for language "cv", writing to cv_model/ and passing
# the frequency file, the cluster paths file and the word-vector file.
spacy_init: FREQ.txt WORD-c$(NCLUSTER)-p1.out/paths WORDVEC.txt
	spacy init-model cv cv_model/ -f FREQ.txt -c WORD-c$(NCLUSTER)-p1.out/paths -v WORDVEC.txt
# Remove the generated frequency, word, cluster and vector artefacts.
# `-f` keeps the target from failing when some of them were never built.
clean:
	rm -rf FREQ.txt WORD.txt WORD-c$(NCLUSTER)-p1.out/ WORDVEC.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment