vene/lemmatize.pl

## lemmatize.pl
#!/usr/bin/env perl

# Lemmatize CONLL-style (tabular) POS-tagged file using Treex
# Prerequisites: cpan -i -f Treex::Tool::EnglishMorpho::Lemmatizer
#   (I think the -f is needed because some tests are failing)
# Usage example (printf, because not every shell's echo expands \t):
#  $ printf "1\tgoes\t_\tVBZ\n" > example
#  $ <example ./lemmatize.pl
#  1    goes    go    VBZ
#
#
# Note: preserves empty lines and lines starting with '#'.


use strict;
use warnings;

use Treex::Tool::EnglishMorpho::Lemmatizer;

# Zero-based column indices of the tabular input.
use constant WORD_FIELD  => 1;       # column read as the word form
use constant TAG_FIELD   => 3;       # column read as the POS tag
use constant LEMMA_FIELD => 2;       # column overwritten with the result
use constant SEPARATOR   => "\t";    # column delimiter, used for input and output

# A data line must reach the right-most column that is read below.
my $min_fields = (WORD_FIELD > TAG_FIELD ? WORD_FIELD : TAG_FIELD) + 1;

# Match the separator literally, so that changing SEPARATOR to a regex
# metacharacter (e.g. "|") cannot alter how lines are split.
my $quoted_separator = quotemeta(SEPARATOR);
my $separator_re     = qr/$quoted_separator/;

# Direct method call; the indirect-object form "new Class" is ambiguous to
# the parser and is discouraged by perlobj.
my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new;

# Filter: read lines from the files named on the command line (or STDIN),
# fill LEMMA_FIELD of every data line and print the result to STDOUT.
while (my $line = <>) {
    # Empty lines and lines starting with '#' are copied through verbatim.
    if ($line =~ /^#|^$/) {
        print $line;
        next;
    }

    # Strip the newline so it does not end up inside the last column.
    chomp(my $record = $line);

    # The -1 limit keeps trailing empty columns; without it split drops
    # them and the output line would have fewer columns than the input.
    my @fields = split $separator_re, $record, -1;

    # A line lacking the word or tag column cannot be lemmatized; copy it
    # unchanged rather than calling the lemmatizer with undefined values.
    if (@fields < $min_fields) {
        warn "$0: input line $.: fewer than $min_fields fields, copied unchanged\n";
        print $line;
        next;
    }

    # lemmatize() is called in list context and only its first return value
    # is kept (presumably the lemma string -- confirm against the Treex docs).
    my ($lemma) = $lemmatizer->lemmatize($fields[WORD_FIELD], $fields[TAG_FIELD]);
    $fields[LEMMA_FIELD] = $lemma;

    print join(SEPARATOR, @fields), "\n";
}
	#!/usr/bin/env perl

	# Lemmatize CONLL-style (tabular) POS-tagged file using Treex
	# Prerequisites: cpan -i -f Treex::Tool::EnglishMorpho::Lemmatizer
	# (I think the -f is needed because some tests are failing)
	# Usage example (printf, because not every shell's echo expands \t):
	# $ printf "1\tgoes\t_\tVBZ\n" > example
	# $ <example ./lemmatize.pl
	# 1 goes go VBZ
	#
	#
	# Note: preserves empty lines and lines starting with '#'.


	use strict;
	use warnings;

	use Treex::Tool::EnglishMorpho::Lemmatizer;

	# Zero-based column indices of the tabular input.
	use constant WORD_FIELD  => 1;       # column read as the word form
	use constant TAG_FIELD   => 3;       # column read as the POS tag
	use constant LEMMA_FIELD => 2;       # column overwritten with the result
	use constant SEPARATOR   => "\t";    # column delimiter, used for input and output

	# A data line must reach the right-most column that is read below.
	my $min_fields = (WORD_FIELD > TAG_FIELD ? WORD_FIELD : TAG_FIELD) + 1;

	# Match the separator literally, so that changing SEPARATOR to a regex
	# metacharacter (e.g. "|") cannot alter how lines are split.
	my $quoted_separator = quotemeta(SEPARATOR);
	my $separator_re     = qr/$quoted_separator/;

	# Direct method call; the indirect-object form "new Class" is ambiguous to
	# the parser and is discouraged by perlobj.
	my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new;

	# Filter: read lines from the files named on the command line (or STDIN),
	# fill LEMMA_FIELD of every data line and print the result to STDOUT.
	while (my $line = <>) {
	    # Empty lines and lines starting with '#' are copied through verbatim.
	    # The alternation bar must be unescaped: written as "\|" it is a
	    # literal pipe, the pattern can never match, and comment and blank
	    # lines would be sent to the lemmatizer instead of being preserved.
	    if ($line =~ /^#|^$/) {
	        print $line;
	        next;
	    }

	    # Strip the newline so it does not end up inside the last column.
	    chomp(my $record = $line);

	    # The -1 limit keeps trailing empty columns; without it split drops
	    # them and the output line would have fewer columns than the input.
	    my @fields = split $separator_re, $record, -1;

	    # A line lacking the word or tag column cannot be lemmatized; copy it
	    # unchanged rather than calling the lemmatizer with undefined values.
	    if (@fields < $min_fields) {
	        warn "$0: input line $.: fewer than $min_fields fields, copied unchanged\n";
	        print $line;
	        next;
	    }

	    # lemmatize() is called in list context and only its first return value
	    # is kept (presumably the lemma string -- confirm against the Treex docs).
	    my ($lemma) = $lemmatizer->lemmatize($fields[WORD_FIELD], $fields[TAG_FIELD]);
	    $fields[LEMMA_FIELD] = $lemma;

	    print join(SEPARATOR, @fields), "\n";
	}