Skip to content

Instantly share code, notes, and snippets.

@PonteIneptique
Created February 14, 2020 14:35
Show Gist options
  • Save PonteIneptique/e48a0d6cf0299dbc2e6aa0c03b668033 to your computer and use it in GitHub Desktop.
Save PonteIneptique/e48a0d6cf0299dbc2e6aa0c03b668033 to your computer and use it in GitHub Desktop.
Protogenie Config
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="https://hipster-philology.github.io/protogenie/protogenie/schema.rng"
schematypens="http://relaxng.org/ns/structure/1.0"?>
<config>
<!-- Output settings: TAB is the column marker, and the header named "order"
     lists the column keys of the produced files (presumably in the order
     they are written; confirm against the Protogenie documentation). -->
<output column_marker="TAB">
  <header name="order">
    <!-- Core annotation columns -->
    <key>token</key>
    <key>lemma</key>
    <key>pos</key>
    <!-- Columns named as new-column by the disambiguation rules in postprocessing:
         two derived from the lemma, the rest from the morph column -->
    <key>Dis</key>
    <key>Entity</key>
    <key>Gend</key>
    <key>Numb</key>
    <key>Case</key>
    <key>Deg</key>
    <key>Mood</key>
    <key>Tense</key>
    <key>Voice</key>
    <key>Person</key>
  </header>
</output>
<!-- Post-processing rules. NOTE(review): the rules are presumably applied in
     document order, so keep the sequence below unchanged. -->
<postprocessing>
  <!-- Lemma suffix "_" + digits goes to the new column Dis ("_" when absent). -->
  <disambiguation source-column="lemma" new-column="Dis" matchPattern="_(\d+)$" default="_" />
  <!-- Lemma suffix "_" + one word character goes to the new column Entity ("_" when absent).
       NOTE(review): \w also matches a digit, so this rule relies on the Dis rule
       above having already handled numeric suffixes; confirm. -->
  <disambiguation source-column="lemma" new-column="Entity" matchPattern="_(\w)$" default="_" />

  <!-- Morphological features: each rule captures the value that follows
       "Feature=" in the morph column (up to an optional "|") into a column
       of the same name, with "_" as the default. -->
  <disambiguation source-column="morph" new-column="Gend" matchPattern="Gend\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Numb" matchPattern="Numb\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Case" matchPattern="Case\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Deg" matchPattern="Deg\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Mood" matchPattern="Mood\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Tense" matchPattern="Tense\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Voice" matchPattern="Voice\=([\w-]+)\|?" default="_" />
  <disambiguation source-column="morph" new-column="Person" matchPattern="Person\=([\w-]+)\|?" default="_" />

  <!-- "$^" can only match where the end and the start of the value coincide,
       i.e. an empty morph value, which is then replaced by "_". -->
  <replacement matchPattern="$^" replacementPattern="_">
    <applyTo source="morph">
      <target>morph</target>
    </applyTo>
  </replacement>

  <!-- RomanNumeral toolbox, matched on token and targeting token and form.
       The anchored pattern accepts non-empty uppercase Roman numerals (at most
       four M): each of its four alternatives forces one of the thousands,
       hundreds, tens or units groups to be non-empty.
       NOTE(review): what the toolbox does with a match is not visible here. -->
  <toolbox name="RomanNumeral" matchPattern="^(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))$">
    <applyTo source="token">
      <target>token</target>
      <target>form</target>
    </applyTo>
  </toolbox>

  <!-- Remove punctuation lines: tokens made only of non-word characters. -->
  <skip source="token" matchPattern="^\W+$" />
</postprocessing>
<!-- Default input column layout, declared explicitly. Presumably this is what
     a corpus picks up when it declares a header of type "default"; confirm. -->
<default-header>
  <header type="explicit">
    <!-- The input column "form" is mapped to the name "token" -->
    <key map-to="token">form</key>
    <!-- Remaining columns keep their own names -->
    <key>lemma</key>
    <key>morph</key>
    <key>pos</key>
    <key>index</key>
  </header>
</default-header>
<!-- NOTE(review): presumably the CSV file in which Protogenie records its
     run, with $file$ acting as a placeholder expanded by the tool; confirm
     both points against the Protogenie documentation. -->
<memory path="memory_$file$.csv"/>
<!-- Input corpora: a single entry matching ./output/*.tsv, with TAB as the column marker. -->
<corpora>
  <corpus column_marker="TAB" path="./output/*.tsv">
    <!-- Uses the splitter named "empty_line" -->
    <splitter name="empty_line"/>
    <!-- Uses the header of type "default" rather than declaring its own keys -->
    <header type="default"/>
  </corpus>
</corpora>
</config>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment