Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract semantic relations from Wiktionary using JWKTL.
#!/usr/bin/env groovy
import de.tudarmstadt.ukp.jwktl.JWKTL
import de.tudarmstadt.ukp.jwktl.api.filter.WiktionaryEntryFilter
import de.tudarmstadt.ukp.jwktl.api.util.Language
// Prints every semantic relation found in a parsed Wiktionary database as
// tab-separated lines: entry header, part of speech, relation target,
// relation type.
//
// Usage: extract-relations.groovy <PARSED-WIKTIONARY> en|ru|de

// Maps the language codes accepted on the command line to JWKTL languages.
final languages = [en: Language.ENGLISH, ru: Language.RUSSIAN, de: Language.GERMAN]

// The language code is matched case-insensitively; a wrong argument count
// yields null, which is never a key of the map above.
final languageCode = args.length == 2 ? args[1].toLowerCase() : null
if (!languages.containsKey(languageCode)) {
    throw new IllegalArgumentException('Required arguments: <PARSED-WIKTIONARY> en|ru|de')
}

// Restrict the iteration to entries whose word language is the requested one.
final filter = new WiktionaryEntryFilter()
filter.allowedWordLanguages = [languages[languageCode]]

final wkt = JWKTL.openEdition(new File(args[0]))
try {
    wkt.getAllEntries(filter).each { entry ->
        // Entries without relations are skipped by the safe-navigation operator.
        entry.relations?.each { relation ->
            printf('%s\t%s\t%s\t%s\n', entry.header, entry.partOfSpeech, relation.target, relation.relationType)
        }
    }
} finally {
    // Always release the opened edition, even when reading or printing fails
    // (for example when the output pipe is closed early by `head`).
    wkt.close()
}
#!/bin/bash -ex
# Downloads the listed Wiktionary dumps (unless already present), parses each
# one with JWKTL's Example1_ParseWiktionaryDump and writes the extracted
# relations to "<database>.tsv" using extract-relations.groovy.

# Assign and export separately: `export VAR=$(cmd)` hides the exit status of
# cmd, so a failing mvn would not stop the script despite `-e`.
CLASSPATH=$(mvn -q exec:exec -Dexec.executable=echo -Dexec.args="%classpath")
export CLASSPATH

MIRROR=http://dumps.wikimedia.your.org/
DUMPS+=("enwiktionary/20170201/enwiktionary-20170201-pages-articles.xml.bz2")
DUMPS+=("ruwiktionary/20170201/ruwiktionary-20170201-pages-articles.xml.bz2")
DUMPS+=("dewiktionary/20170201/dewiktionary-20170201-pages-articles.xml.bz2")

for DUMP in "${DUMPS[@]}"; do
    BASENAME=$(basename "$DUMP")
    # Database name is the dump file name without the fixed suffix,
    # e.g. "enwiktionary-20170201".
    DATABASE=${BASENAME%-pages-articles.xml.bz2}
    # The first two characters of the file name are the language code.
    LANGUAGE=${BASENAME:0:2}

    if [ ! -f "$BASENAME" ]; then
        # -f makes curl fail on HTTP errors instead of saving the error page.
        # Downloading to a ".part" file ensures an interrupted or failed
        # transfer is never mistaken for a complete dump on the next run.
        curl -fL "$MIRROR$DUMP" -o "$BASENAME.part"
        mv "$BASENAME.part" "$BASENAME"
    fi

    # NOTE(review): the third argument "true" is presumably the overwrite flag
    # of the example parser — confirm against the JWKTL sources.
    java -Djdk.xml.totalEntitySizeLimit=100000000 de.tudarmstadt.ukp.jwktl.examples.Example1_ParseWiktionaryDump "$BASENAME" "$DATABASE" "true" > "$DATABASE.log"
    ./extract-relations.groovy "$DATABASE" "$LANGUAGE" > "$DATABASE.tsv"
done
@dustalov

This comment has been minimized.

Copy link
Owner Author

dustalov commented Nov 22, 2017

$ ./extract-relations.groovy enwiktionary-20171103 en | head | column -ts $'\t'
dictionary  NOUN  dicktionary               DERIVED_TERM
dictionary  NOUN  dictionaric               DERIVED_TERM
dictionary  NOUN  dictionarily              DERIVED_TERM
dictionary  NOUN  encyclopedic dictionary   DERIVED_TERM
dictionary  NOUN  explanatory dictionary    DERIVED_TERM
dictionary  NOUN  fictionary                DERIVED_TERM
dictionary  NOUN  pedagogical dictionary    DERIVED_TERM
dictionary  NOUN  Pictionary                DERIVED_TERM
dictionary  NOUN  pronunciation dictionary  DERIVED_TERM
dictionary  NOUN  rhyming dictionary        DERIVED_TERM
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.