Skip to content

Instantly share code, notes, and snippets.

@danielnaber
Created June 6, 2020 10:05
Show Gist options
  • Save danielnaber/79907f27e272fa43a4fa23b400a9fbd4 to your computer and use it in GitHub Desktop.
Save danielnaber/79907f27e272fa43a4fa23b400a9fbd4 to your computer and use it in GitHub Desktop.
Extract sentences from Tatoeba into per-language files, naming each file with the language code used by LanguageTool (usually 2 characters, e.g. "en"; a few are longer, e.g. "ast").
#!/bin/bash
# Filename prefix for the full (unsampled) per-language output files.
result='tatoeba-sentences'
#######################################
# Extract every sentence of one language from the Tatoeba dump and draw a
# random sample of at most 1000 of them.
# Globals:
#   result (read) - filename prefix of the full per-language output file
# Arguments:
#   $1 - language code as stored in the language column of sentences.csv
#   $2 - language code to use in the output filenames
# Inputs:
#   ./sentences.csv - tab-separated Tatoeba export; column 2 is the language
#                     code and column 3 the sentence text
# Outputs:
#   $result-$2-20191014.txt             - all sentences of the language
#   sentences-$2-20191014-top1000.txt   - random sample, up to 1000 lines
#   A progress line on stdout.
#######################################
function extract {
  local lang=$1
  local code=$2
  local all_file="$result-$code-20191014.txt"
  local sample_file="sentences-$code-20191014-top1000.txt"

  echo "extract to >$all_file"

  # Match the language column exactly. A plain text search for the code would
  # also hit sentences that merely contain it as a word, and would miss rows
  # because the columns are separated by tabs, not spaces.
  awk -F '\t' -v lang="$lang" '$2 == lang {print $3}' sentences.csv >"$all_file"

  # shuf -n stops after 1000 lines itself, so no extra head process is needed.
  shuf -n 1000 -- "$all_file" >"$sample_file"
}
# Languages to extract, as "<Tatoeba code>:<output code>" pairs. The first
# part selects the rows in sentences.csv; the second part is used in the
# output filenames. Each language is listed exactly once: running extract
# twice for the same pair only repeats the work and reshuffles the sample.
language_pairs=(
  "ara:ar"
  "ast:ast"
  "bel:be"
  "bre:br"
  "cat:ca"
  "cmn:zh"
  "dan:da"
  "nld:nl"
  "eng:en"
  "epo:eo"
  "fra:fr"
  "glg:gl"
  "deu:de"
  "ell:el"
  "ita:it"
  "jpn:ja"
  "khm:km"
  "pes:fa"
  "pol:pl"
  "por:pt"
  "ron:ro"
  "rus:ru"
  "slk:sk"
  "slv:sl"
  "spa:es"
  "swe:sv"
  "tgl:tl"
  "tam:ta"
  "ukr:uk"
)

for pair in "${language_pairs[@]}"; do
  # Split "src:dst" at the colon: text before it is $1, text after it is $2.
  extract "${pair%%:*}" "${pair#*:}"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment