Last active
August 29, 2015 14:05
-
-
Save vene/afa0c5636cb7851f80ba to your computer and use it in GitHub Desktop.
Lemmatize CONLL-style (tabular) POS-tagged file using Treex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Lemmatize a CoNLL-style (tab-separated) POS-tagged file using Treex.
#
# Prerequisites: cpan -i -f Treex::Tool::EnglishMorpho::Lemmatizer
# (-f forces the installation; it seems to be needed because some of the
# module's tests fail.)
#
# Usage example (printf is used because not every shell's echo expands \t):
# $ printf '1\tgoes\t_\tVBZ\n' > example
# $ <example ./lemmatize.pl
# 1 goes go VBZ
#
# Note: empty lines and lines starting with '#' are passed through unchanged.
use strict;
use warnings;

use Treex::Tool::EnglishMorpho::Lemmatizer;

# Zero-based column indices within each separator-delimited row.
use constant WORD_FIELD  => 1;    # column holding the surface word form
use constant TAG_FIELD   => 3;    # column holding the POS tag
use constant LEMMA_FIELD => 2;    # column overwritten with the lemma
use constant SEPARATOR   => "\t";

# A row must have at least this many columns for both the word and the tag
# to be present.
use constant MIN_FIELDS =>
    1 + ( WORD_FIELD > TAG_FIELD ? WORD_FIELD : TAG_FIELD );

# Quote the separator so it is always matched literally, even if SEPARATOR
# is later changed to a regex metacharacter such as '|'.
my $separator    = SEPARATOR;
my $separator_re = qr/\Q$separator\E/;

my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new();

# Filter: read rows from the files named on the command line (or STDIN),
# fill in the lemma column, and write the rows to STDOUT.
LINE:
while ( my $line = <> ) {

    # Comment lines and empty lines are copied through verbatim.
    if ( $line =~ /^(?:#|$)/ ) {
        print $line;
        next LINE;
    }

    my $raw_line = $line;
    chomp $line;    # needed in case the word or tag is the last field

    # LIMIT of -1 keeps trailing empty columns; without it split drops them
    # and the output row would have fewer columns than the input row.
    my @fields = split $separator_re, $line, -1;

    # Rows too short to contain both word and tag cannot be lemmatized;
    # pass them through unchanged instead of handing undef to the lemmatizer.
    if ( @fields < MIN_FIELDS ) {
        warn "line $.: expected at least ", MIN_FIELDS,
            " fields, got ", scalar(@fields), "; left unchanged\n";
        print $raw_line;
        next LINE;
    }

    my $word = $fields[WORD_FIELD];
    my $tag  = $fields[TAG_FIELD];

    # Only the first element of lemmatize()'s list result is used
    # (presumably the lemma itself -- confirm against the Treex docs).
    $fields[LEMMA_FIELD] = ( $lemmatizer->lemmatize( $word, $tag ) )[0];

    print join( SEPARATOR, @fields ), "\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment