Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active November 16, 2020 14:51
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ikegami-yukino/6e16bcd404e6005cddc6 to your computer and use it in GitHub Desktop.
Save ikegami-yukino/6e16bcd404e6005cddc6 to your computer and use it in GitHub Desktop.
Tutorial on Machine Translation for Mac OS X Mountain Lion
# Create the workspace that holds everything built by this tutorial.
mkdir -p ~/smt && cd ~/smt
# Install Moses
# Point the build at the Homebrew Boost 1.57.0 installation.
export BOOST_ROOT=/usr/local/Cellar/boost/1.57.0
export BOOST_BUILD_PATH=/usr/local/share/boost-build
# Make the "-mt" Boost.Thread libraries also reachable under the un-suffixed
# names, and make the lib directory reachable as lib64 (presumably the names
# the Moses build looks for -- confirm against your Boost layout).
ln "$BOOST_ROOT/lib/libboost_thread-mt.a" "$BOOST_ROOT/lib/libboost_thread.a"
ln "$BOOST_ROOT/lib/libboost_thread-mt.dylib" "$BOOST_ROOT/lib/libboost_thread.dylib"
ln -s "$BOOST_ROOT/lib" "$BOOST_ROOT/lib64"
# Download under a distinct file name so the archive cannot be confused with
# any other "master.zip", and remove it once unpacked.
wget -O mosesdecoder-master.zip https://github.com/moses-smt/mosesdecoder/archive/master.zip
unzip mosesdecoder-master.zip
rm mosesdecoder-master.zip
# The archive unpacks to "mosesdecoder-master", but every later step of this
# tutorial refers to ~/smt/mosesdecoder, so rename the tree to match.
mv mosesdecoder-master mosesdecoder
cd mosesdecoder
./bjam --libdir=/usr/local/lib --link=shared -j 3 # j means the number of CPU cores
# Build the Python bindings in place against the libraries in /usr/local/lib.
cd contrib/python
2to3 -w setup.py
python setup.py build_ext -i --moses-lib=/usr/local/lib
# Build MGiza
# Start from the workspace root so MGiza is unpacked directly under ~/smt
# instead of inside whatever directory the previous step left the shell in.
cd ~/smt
# Use a distinct archive name: a plain "master.zip" would collide with any
# other GitHub "master.zip" already present in this directory.
wget -O mgiza-master.zip https://github.com/moses-smt/mgiza/archive/master.zip
unzip mgiza-master.zip
rm mgiza-master.zip
cd mgiza-master/mgizapp
cmake .
make
# Copy to moses dir.
# Binaries go to training-tools/mgizapp; merge_alignment.py goes one level up.
mkdir -p ~/smt/mosesdecoder/training-tools/mgizapp
cp bin/* ~/smt/mosesdecoder/training-tools/mgizapp
cp scripts/merge_alignment.py ~/smt/mosesdecoder/training-tools/
# Prepare the parallel corpus
mkdir ~/smt/corpus
cd ~/smt/corpus
wget ftp://ftp.monash.edu.au/pub/nihongo/examples.utf.gz
# Japanese side: keep the "A:" lines, take the first tab-separated field,
# drop the "A: " prefix and segment the text into words with MeCab.
gzip -cd examples.utf.gz \
  | grep '^A:' \
  | cut -f1 \
  | sed 's/^A: //' \
  | mecab -Owakati \
  > tanaka.ja
# English side: same "A:" lines, second tab-separated field, with everything
# from the first "#" to the end of the line removed.
gzip -cd examples.utf.gz \
  | grep '^A:' \
  | cut -f2 \
  | sed 's/#.*$//' \
  > tanaka.en
# Build the Japanese language model with KenLM's lmplz.
#   -o 5     model order 5
#   -S 80%   memory setting passed to lmplz
#   -T /tmp  location for temporary files
# Reads tanaka.ja from the current directory and writes the model in ARPA
# format to tanaka.ja.arpa.
../mosesdecoder/bin/lmplz -o 5 -S 80% -T /tmp \
  < tanaka.ja \
  > tanaka.ja.arpa
# Generate translation model
# The script path, --root-dir, --corpus and --external-bin-dir below are all
# relative to ~/smt, so change there first; the training command only runs if
# the cd succeeds.
# Trains English (--f en) -> Japanese (--e ja) on corpus/tanaka.{en,ja}, using
# the ARPA language model built from tanaka.ja and MGiza with 3 CPUs.
# NOTE(review): the Moses baseline tutorial appends ":8" to the --lm value to
# select KenLM -- confirm whether that suffix is needed for your Moses build.
cd ~/smt &&
./mosesdecoder/scripts/training/train-model.perl \
  --root-dir . \
  --corpus corpus/tanaka \
  --f en \
  --e ja \
  --lm "0:5:$HOME/smt/corpus/tanaka.ja.arpa" \
  --external-bin-dir ./mosesdecoder/training-tools \
  -mgiza \
  -mgiza-cpus 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment