Skip to content

Instantly share code, notes, and snippets.

@Contextualist
Created December 22, 2019 03:05
Show Gist options
  • Save Contextualist/8fb547a5c7e9943ae2793911aa0a4cd6 to your computer and use it in GitHub Desktop.
Save Contextualist/8fb547a5c7e9943ae2793911aa0a4cd6 to your computer and use it in GitHub Desktop.
Special files for the Chuvash spaCy model
# coding: utf8
from __future__ import unicode_literals
from ...language import Language
class Chuvash(Language):
    """Minimal language definition that registers Chuvash under the code ``cv``."""

    # Language code used to refer to this language (e.g. `init-model cv`).
    lang = "cv"


__all__ = ["Chuvash"]
# Targets that name actions rather than files. `patch` is included so that a
# stray file called `patch` can never make the target look up to date.
.PHONY: init patch tokenize frequency word cluster wordvec spacy_init clean
# One-time setup: fetch the helper repositories, install the Python
# dependencies and build the Brown clustering tool.
# Existing checkouts are kept, so the target can be re-run safely.
init:
	[ -d spacy-dev-resources ] || git clone https://github.com/howl-anderson/spacy-dev-resources
	conda install SpaCy Gensim ftfy
	[ -d brown-cluster ] || git clone https://github.com/percyliang/brown-cluster
	$(MAKE) -C brown-cluster
# Interpreter used to locate spaCy; override with e.g. `make PYTHON=python3 patch`.
PYTHON ?= python
# Installation directory of the spaCy package.
# Lazy + memoized: the lookup runs on first expansion only (then SPACY is
# rebound to the plain result), so goals that never use it -- `init`, `clean`,
# ... -- do not import spaCy at parse time or fail before it is installed.
SPACY = $(eval SPACY := $(shell $(PYTHON) -c "import spacy; print(spacy.__path__[0])"))$(SPACY)
# Install the Chuvash language stub (cv.py) into the local spaCy package as
# lang/cv/__init__.py.
patch:
	@# Refuse to run with an empty SPACY: the paths below would become /lang/cv.
	@test -n "$(SPACY)" || { echo "SPACY is empty: is spaCy installed? (try 'make init')" >&2; exit 1; }
	mkdir -p "$(SPACY)/lang/cv"
	cp cv.py "$(SPACY)/lang/cv/__init__.py"
# Tokenize the raw corpus chv.crp.txt: every run of characters outside the
# listed Cyrillic/Chuvash letters is replaced by a single space.
# The corpus is a real prerequisite, so a missing corpus is an error (instead
# of silently producing an empty output) and edits to it trigger a rebuild.
# NOTE: `\+` in the sed expression is a GNU sed extension.
tokenize: plain-files/chv.tok.txt
plain-files/chv.tok.txt: chv.crp.txt
	mkdir -p $(@D)
	sed 's/[^а-яӑӗăĕҫçА-ЯӐӖĂĔҪÇ]\+/ /g' $< > $@
# Word-frequency table computed over the plain-files/ directory.
# The tokenized corpus is named explicitly so it is built on a fresh tree; a
# bare `plain-files/*` prerequisite stays a literal (non-existent) file name
# when the glob matches nothing. $(wildcard) adds any other files already there.
# (-n 12 is passed through unchanged; presumably a worker count -- see the script.)
frequency: FREQ.txt
FREQ.txt: plain-files/chv.tok.txt $(wildcard plain-files/*)
	python ./spacy-dev-resources/training/plain_word_freqs.py -n 12 plain-files $@
# Concatenate every file in plain-files/ into a single text file.
# The tokenized corpus is an explicit prerequisite so it is built first on a
# fresh tree; $(wildcard) tracks any other files already present.
word: WORD.txt
WORD.txt: plain-files/chv.tok.txt $(wildcard plain-files/*)
	cat plain-files/* > $@
# Number of Brown clusters to produce.
# XXX: not sure whether 100 clusters is a good choice.
NCLUSTER := 100
# Threads handed to wcluster; override with e.g. `make NTHREADS=8 cluster`.
NTHREADS ?= 39
# Run Brown clustering on WORD.txt; the rule expects wcluster to write its
# result to WORD-c$(NCLUSTER)-p1.out/paths.
cluster: WORD-c$(NCLUSTER)-p1.out/paths
WORD-c$(NCLUSTER)-p1.out/paths: WORD.txt
	brown-cluster/wcluster --text $< --c $(NCLUSTER) --threads $(NTHREADS)
# Word vectors trained over the plain-files/ directory.
# XXX: vector dimension 128? min count 5? (the -i / -m values below)
# The tokenized corpus is an explicit prerequisite so it is built first on a
# fresh tree; $(wildcard) tracks any other files already present.
wordvec: WORDVEC.txt
WORDVEC.txt: plain-files/chv.tok.txt $(wildcard plain-files/*)
	python ./spacy-dev-resources/training/plain_word_vectors.py -i 128 -m 5 -n 6 plain-files $@
# Assemble the spaCy model in cv_model/ from the three generated inputs.
# Prerequisite order matters: 1 = frequencies, 2 = cluster paths, 3 = vectors.
spacy_init: FREQ.txt WORD-c$(NCLUSTER)-p1.out/paths WORDVEC.txt
	spacy init-model cv cv_model/ \
		-f $(word 1,$^) \
		-c $(word 2,$^) \
		-v $(word 3,$^)
# Remove the generated frequency, word, cluster and vector artifacts.
# $(RM) is `rm -f`, so the target succeeds even when some of them were never
# built. plain-files/ and cv_model/ are left untouched.
clean:
	$(RM) -r FREQ.txt WORD.txt WORD-c$(NCLUSTER)-p1.out/ WORDVEC.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment