Last active
January 4, 2019 17:18
-
-
Save IlnarSelimcan/e997825f93c8b8384fb64c4a1b7a7a82 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
## Downloads a "pages-articles-multistream.xml.bz2" Wikipedia dump:
##   - for the language LANG (iso2 or iso3 code),
##   - from day DATE (in yyyymmdd format or "latest"),
## makes a frequency list out of it (currently: the sorted list of unique
## space-separated tokens, skipping tokens that contain ASCII letters or '*'),
## measures MODE's coverage on that frequency list,
## and compares it with the coverage of the previous revision of that mode.
##
## USAGE: ./test-cov-on-wiki.sh <lang> <date> <mode>
##
## EXAMPLES: LC_ALL=tt_RU.utf8 ./test-cov-on-wiki.sh tt 20181201 tat-morph
##           LC_ALL=kk_KZ.utf8 ./test-cov-on-wiki.sh kk latest kaz-tat-morph
##
## Uses the "wiki.txt" file if it's already present in the directory.
##
## REQUIRES: wget, python3, apertium, git, make, awk.
##
## SIDE EFFECTS: local changes are stashed and HEAD^ is checked out and built
## in order to measure the previous revision. On exit the original checkout is
## restored and the stash is popped, but the build products left behind are
## those of the previous revision (run "make" again to refresh them).
##
## EXIT STATUS:
##   0 - PASS: the number of unknown tokens did not grow
##   1 - FAIL: the number of unknown tokens grew
##   2 - usage error, or one of the external tools failed
##
## TODO:
##   - for speed, measuring coverage of two or more revisions of the transducer
##     should be done in parallel, independently from each other
##   - pre-compute frequency lists of relevant Wikipedias and store them
##     somewhere so that entire dumps aren't downloaded on travis-ci.org after
##     each commit

# `set -e` is deliberately not used: grep exits with 1 when nothing matches,
# which is a normal outcome here. Critical commands are checked explicitly.
set -u

# State shared with the EXIT trap.
workdir=""    # private temporary directory, removed on exit
orig_ref=""   # branch name (or commit id) that was checked out at start
stashed=0     # 1 once this run has created a stash entry
switched=0    # 1 once HEAD^ has been checked out

# Prints an error message to stderr and aborts with status 2.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}

# EXIT trap: puts the git checkout back the way it was found and removes the
# temporary directory. The stash is only popped when the original revision
# could be restored, so that changes are never applied on top of HEAD^.
cleanup() {
  local restored=1
  if (( switched )); then
    if ! git checkout --quiet "$orig_ref"; then
      restored=0
      printf 'warning: could not switch back to %s\n' "$orig_ref" >&2
    fi
  fi
  if (( stashed )); then
    if (( restored )); then
      git stash pop --quiet \
        || printf 'warning: "git stash pop" failed; see "git stash list"\n' >&2
    else
      printf 'warning: your changes were left in "git stash list"\n' >&2
    fi
  fi
  if [[ -n "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}

# Runs the token list through apertium and prints how many output lines
# contain '*' (the marker this script counts as "unknown").
# Arguments: $1 - apertium mode, $2 - token list file
# Returns:   non-zero if apertium (or grep) itself failed
count_unknown() {
  local mode="$1" tokens="$2"
  local analysed="${workdir}/analysed" n rc=0
  apertium -d . "$mode" < "$tokens" > "$analysed" || return 1
  n="$(grep -cF -- '*' "$analysed")" || rc=$?
  # grep exits with 1 when there is no match at all; only >1 is an error.
  (( rc <= 1 )) || return 1
  printf '%s\n' "$n"
}

# Prints the coverage percentage for $1 total tokens of which $2 are unknown.
# LC_ALL=C keeps the decimal separator independent of the caller's locale.
coverage() {
  LC_ALL=C awk -v total="$1" -v unknown="$2" \
    'BEGIN { printf "%.2f", (total - unknown) / total * 100 }'
}

main() {
  if (( $# < 3 )); then
    printf 'USAGE: %s <lang> <date> <mode>\n' "${0##*/}" >&2
    exit 2
  fi

  # Not named LANG: that is the locale variable inherited by sort, grep and
  # apertium, and overwriting it would change their behaviour.
  local wiki_lang="$1" dump_date="$2" mode="$3"
  local dump_url="https://dumps.wikimedia.org/${wiki_lang}wiki/${dump_date}/${wiki_lang}wiki-${dump_date}-pages-articles-multistream.xml.bz2"

  workdir="$(mktemp -d)" || die "could not create a temporary directory"
  trap cleanup EXIT
  local hit="${workdir}/hit"

  if [[ ! -f wiki.txt ]]; then
    wget "$dump_url" -O wiki.xml.bz2 \
      || die "could not download ${dump_url}"
    wget https://svn.code.sf.net/p/apertium/svn/trunk/apertium-tools/WikiExtractor.py \
      -O WikiExtractor.py \
      || die "could not download WikiExtractor.py"
    python3 WikiExtractor.py --infn wiki.xml.bz2 > /dev/null \
      || die "WikiExtractor.py failed"
    [[ -f wiki.txt ]] || die "WikiExtractor.py did not produce wiki.txt"
  fi

  # Unique tokens, without those containing ASCII letters or a literal '*'
  # (the latter would be mistaken for unknown-word marks later on).
  grep -o '[^ ]\+' wiki.txt \
    | grep -v '[a-zA-Z]' \
    | grep -vF -- '*' \
    | sort -u > "$hit"

  local total
  total=$(( $(wc -l < "$hit") ))   # arithmetic strips the padding some wc's add
  (( total > 0 )) || die "no tokens could be extracted from wiki.txt"

  local unk_now unk_before
  unk_now="$(count_unknown "$mode" "$hit")" \
    || die "apertium failed for mode '${mode}' on the current revision"

  # Switch to the previous revision, remembering how to get back.
  orig_ref="$(git symbolic-ref --quiet --short HEAD || git rev-parse HEAD)" \
    || die "cannot determine the current git revision"
  local stash_before stash_after
  stash_before="$(git rev-parse --quiet --verify refs/stash || true)"
  git stash || die "git stash failed"
  stash_after="$(git rev-parse --quiet --verify refs/stash || true)"
  # "git stash" creates no entry when the tree is clean; only pop our own.
  if [[ "$stash_before" != "$stash_after" ]]; then
    stashed=1
  fi
  git checkout 'HEAD^' || die "could not check out the previous revision"
  switched=1

  make clean || die "make clean failed on the previous revision"
  make || die "make failed on the previous revision"

  unk_before="$(count_unknown "$mode" "$hit")" \
    || die "apertium failed for mode '${mode}' on the previous revision"

  local cov_now cov_before
  cov_now="$(coverage "$total" "$unk_now")"
  cov_before="$(coverage "$total" "$unk_before")"

  local verdict="FAIL!" status=1
  if (( unk_now <= unk_before )); then
    verdict="PASS!"
    status=0
  fi

  printf '%s\n' "$verdict"
  printf 'Coverage before: %s\n' "$cov_before"
  printf 'Coverage now: %s\n' "$cov_now"
  printf 'Unknown before: %s\n' "$unk_before"
  printf 'Unknown now: %s\n' "$unk_now"
  exit "$status"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment