Skip to content

Instantly share code, notes, and snippets.

@versusvoid
Last active January 8, 2019 11:06
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save versusvoid/da4e71467a4c0f9e1a1c to your computer and use it in GitHub Desktop.
Save versusvoid/da4e71467a4c0f9e1a1c to your computer and use it in GitHub Desktop.
Download, extract, and set up everything necessary to parse Russian with MaltParser and Serge Sharoff's model (corpus.leeds.ac.uk/mocky/)
#!/bin/bash
# Preamble: locate the script's own directory and verify that every external
# tool the rest of the script relies on is available before doing any work.

# Absolute path of the directory containing this script. Quoted so that a
# script path containing spaces or glob characters is handled correctly.
PREFIX=$(readlink -m -- "$(dirname -- "$0")")

# A downloader is required: either wget or curl will do.
if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
  echo "You need either 'wget' or 'curl' programm to download necessary files" >&2
  exit 1
fi

# perl is required (exit status 2 if missing).
if ! command -v perl >/dev/null 2>&1; then
  echo "You'll need some perl. Consider installing it." >&2
  exit 2
fi

# make and g++ are both needed to build cstlemma from source.
if ! command -v make >/dev/null 2>&1 || ! command -v g++ >/dev/null 2>&1; then
  echo "Sorry to bug you, but you need 'make' and 'g++' to compile cstlemma." >&2
  exit 3
fi

# MaltParser itself runs on the JVM.
if ! command -v java >/dev/null 2>&1; then
  echo "Well, you know that MaltParser is java application, don't you?" >&2
  exit 4
fi
#######################################
# Download a URL into a directory unless the expected file already exists.
# Arguments:
#   $1 - path of the file the download is expected to produce; when it is
#        already a regular file nothing is fetched
#   $2 - URL to fetch
#   $3 - directory the downloaded file is saved into
# Returns:
#   0 when the file already exists, otherwise the downloader's exit status.
#   Exits the whole script with status 1 if neither wget nor curl is found.
#######################################
download() {
  local target=$1 url=$2 dest_dir=$3

  if [[ -f "$target" ]]; then
    return 0
  fi

  if command -v wget >/dev/null 2>&1; then
    wget "$url" -P "$dest_dir"
  elif command -v curl >/dev/null 2>&1; then
    # The subshell leaves the caller's working directory untouched.
    # -f makes HTTP errors a failure instead of saving the error page under
    # the archive's name; -L follows redirects, as wget does by default.
    ( cd "$dest_dir" && curl -f -L -O "$url" )
  else
    echo "No suitable downloader, sorry =(" >&2
    exit 1
  fi
}
# Everything the installation needs, as "<subdirectory of PREFIX> <URL>" pairs.
# Each file is stored in that subdirectory under the last path component of
# its URL.
downloads=(
  "archives http://maltparser.org/dist/malt-1.5.tar.gz"
  "archives http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.tar.gz"
  "archives http://cst.dk/download/cstlemma/cstlemma.zip"
  "models http://corpus.leeds.ac.uk/mocky/russian.par.gz"
  "models http://corpus.leeds.ac.uk/mocky/rus-test.mco"
  "scripts http://corpus.leeds.ac.uk/mocky/russian-malt.tgz"
  "scripts http://corpus.leeds.ac.uk/mocky/lemma-ru.tgz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/install-tagger.sh"
  "scripts http://corpus.leeds.ac.uk/tools/smallutils.pm"
)

for entry in "${downloads[@]}"; do
  subdir=${entry%% *}   # text before the first space
  url=${entry#* }       # text after the first space
  mkdir -p "$PREFIX/$subdir"
  # Stop at the first failed fetch; continuing would only surface later as a
  # harder-to-diagnose failure when the missing file is unpacked or copied.
  if ! download "$PREFIX/$subdir/${url##*/}" "$url" "$PREFIX/$subdir"; then
    echo "Failed to download $url" >&2
    exit 5
  fi
done
# ---------------------------------------------------------------------------
# Assemble the installation tree under $PREFIX/installation from the files
# fetched into $PREFIX/archives, $PREFIX/models and $PREFIX/scripts.
# Exit status 6 marks a failure in one of the guarded steps below.
# ---------------------------------------------------------------------------

# Start from a clean tree on every run. ${PREFIX:?} aborts the script rather
# than letting an empty PREFIX turn this into `rm -rf /installation`.
rm -rf -- "${PREFIX:?}/installation"
mkdir -p "$PREFIX/installation"
INSTALLATION=$(readlink -m -- "$PREFIX/installation")

# MaltParser: unpack, then flatten malt-1.5/ into the installation root.
tar -xf "$PREFIX/archives/malt-1.5.tar.gz" -C "$INSTALLATION"
mv "$INSTALLATION"/malt-1.5/* "$INSTALLATION"
rmdir "$INSTALLATION/malt-1.5"

# TreeTagger: copy both tarballs into treetagger/ and run the upstream
# installer from inside that directory.
mkdir -p "$INSTALLATION/treetagger"
cp "$PREFIX/archives/tree-tagger-linux-3.2.tar.gz" "$INSTALLATION/treetagger"
cp "$PREFIX/scripts/tagger-scripts.tar.gz" "$INSTALLATION/treetagger"
# Bail out if the directory change fails so the installer never runs in
# whatever directory the script happened to be started from.
pushd "$INSTALLATION/treetagger" || exit 6
sh "$PREFIX/scripts/install-tagger.sh"
popd || exit 6

# Russian TreeTagger parameter file. Decompressing straight to the final name
# avoids `gunzip -k` (missing in older gzip releases) and leaves no
# intermediate models/russian.par behind that could block a later run.
gunzip -c "$PREFIX/models/russian.par.gz" \
  > "$INSTALLATION/treetagger/lib/russian-utf8.par"

# Lemmatiser scripts and the Perl module copied alongside them.
tar -xf "$PREFIX/scripts/lemma-ru.tgz" -C "$INSTALLATION/treetagger/cmd"
cp "$PREFIX/scripts/smallutils.pm" "$INSTALLATION/treetagger/cmd/"
# Make the tokenizer reachable under a .pl name as well as .perl.
ln -s utf8-tokenize.perl "$INSTALLATION/treetagger/cmd/utf8-tokenize.pl"
# Replace the hard-coded '/corpora/tools' library path in lemmatiser.pl with
# the directory of the running script (dirname($0)).
sed -i "s#use lib('/corpora/tools'#use File::Basename;\nuse lib(dirname(\$0)#g" "$INSTALLATION/treetagger/cmd/lemmatiser.pl"

# cstlemma: unpack the outer archive, then the archive(s) it contains, build
# from source and copy the resulting binary into treetagger/cmd.
unzip "$PREFIX/archives/cstlemma.zip" -d "$INSTALLATION/treetagger"
pushd "$INSTALLATION/treetagger" || exit 6
unzip ./cstlemma*.zip
pushd cstlemma*/cstlemma/src || exit 6
make || exit 6
popd || exit 6
cp cstlemma*/cstlemma/src/cstlemma cmd/
popd || exit 6

# MaltParser model and the russian-malt.sh wrapper script.
cp "$PREFIX/models/rus-test.mco" "$INSTALLATION"
tar -xf "$PREFIX/scripts/russian-malt.tgz" -C "$INSTALLATION"

# Patch russian-malt.sh in one pass; expressions are applied in order:
#   1. /corpora/tools       -> literal $(dirname $0)
#   2. the MALT=... line    -> MALT=$(dirname $0)
#   3. russian.par          -> russian-utf8.par
#   4. make-malt.pl         -> $(dirname $0)/make-malt.pl
#   5. shake-malt.pl        -> $(dirname $0)/shake-malt.pl
#   6. lines starting "#$"  -> leading '#' removed
#   7. tmpmalttex           -> $MALT/tmpmalttex
# Single quotes keep $(dirname $0) and $MALT literal so they are evaluated
# when russian-malt.sh runs, not here.
sed -i \
  -e 's#/corpora/tools#$(dirname $0)#g' \
  -e 's#^MALT=.*#MALT=$(dirname $0)#g' \
  -e 's#russian.par#russian-utf8.par#g' \
  -e 's#make-malt.pl#$(dirname $0)/make-malt.pl#g' \
  -e 's#shake-malt.pl#$(dirname $0)/shake-malt.pl#g' \
  -e 's/^#\$/$/g' \
  -e 's#tmpmalttex#$MALT/tmpmalttex#g' \
  "$INSTALLATION/russian-malt.sh"

# Smoke test: feed one Russian sentence through the freshly installed wrapper.
echo 'Ну что ж, пришло время проверить наш парсер!' | "$INSTALLATION/russian-malt.sh"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment