Skip to content

Instantly share code, notes, and snippets.

@llamasoft
Created December 6, 2016 18:14
Show Gist options
  • Save llamasoft/6b18fdd75e9ebf575fa37ae5d44b0b5d to your computer and use it in GitHub Desktop.
Save llamasoft/6b18fdd75e9ebf575fa37ae5d44b0b5d to your computer and use it in GitHub Desktop.
Princeton WordNet Database Parser Example
#!/usr/bin/perl
use strict;
use warnings;
# Parses a synonym set line from a data.* file into a Synset hash.
#
# Argument:
#   $line - one space-separated synset record.  The fields are consumed in
#           this order: byte offset, file number, part-of-speech letter,
#           word count (hex), that many word/lexical-ID pairs, pointer
#           count, that many four-field pointers, for verbs a frame count
#           followed by three-field frames, and finally an optional
#           "| definition" tail.  A trailing LF or CRLF is accepted.
#
# Returns a flat key/value list meant to be assigned to a hash:
#   pos    - 'NOUN', 'ADJECTIVE', 'VERB', 'ADVERB', or 'UNKNOWN'
#   wcount - number of words in the synonym set
#   words  - array reference holding the words, with underscores turned
#            into spaces and any parenthesised part removed
#   defn   - the definition text, or '' when the line has none
#
# Every other field is still decoded so that it can be exposed simply by
# uncommenting the matching entry in the return list.
sub parse_synset($) {
    my ($line) = @_;

    # Strip the line terminator here instead of relying on the caller's
    # chomp(): chomp() leaves the "\r" of a CRLF-terminated file behind, and
    # it would otherwise end up glued to the last token or the definition.
    $line =~ s/[\r\n]+\z//;

    # Definitions from wnutil.c, function getpos
    my %parts_of_speech = (
        'n' => 'NOUN',
        'a' => 'ADJECTIVE',
        's' => 'ADJECTIVE',
        'v' => 'VERB',
        'r' => 'ADVERB'
    );

    # Parsing taken from search.c, function parse_synset
    # This parses a WordNet database line into a Synset structure
    my @tokens = split(/ /, $line);

    # Byte offset within file
    my $hereiam = int(shift(@tokens));

    # File number that the synonym set comes from
    my $fnum = int(shift(@tokens));

    # The part of speech these words represent, called "pos"
    my $pos = $parts_of_speech{shift(@tokens)} || 'UNKNOWN';

    # The number of words in the synonym set encoded as two-digit hex
    my $wcount = hex(shift(@tokens));

    my @words = ();
    my @lexid = ();
    foreach (1 .. $wcount) {
        # The word itself, spaces replaced with underscores,
        # comments/adjective type in parens
        my $word = shift(@tokens);
        $word =~ tr/_/ /;
        $word =~ s/\(.*\)//;
        push(@words, $word);

        # The lexical ID of the current word
        push(@lexid, shift(@tokens));
    }

    # Pointers and relations to other words
    my $ptrcount = int(shift(@tokens));
    my @ptrtyp = ();
    my @ptroff = ();
    my @ppos   = ();
    my @pfrom  = ();
    my @pto    = ();
    foreach (1 .. $ptrcount) {
        push(@ptrtyp, shift(@tokens));
        push(@ptroff, int(shift(@tokens)));
        push(@ppos,   $parts_of_speech{shift(@tokens)} || 'UNKNOWN');

        # One four-character token: first two hex digits are the "from"
        # value, last two hex digits are the "to" value
        my $tofrom = shift(@tokens);
        push(@pfrom, hex(substr($tofrom, 0, 2)));
        push(@pto,   hex(substr($tofrom, 2, 2)));
    }

    # Verbs contain additional information
    my $fcount = 0;
    my @frmid  = ();
    my @frmto  = ();
    if ( $pos eq 'VERB' ) {
        $fcount = int(shift(@tokens));
        foreach (1 .. $fcount) {
            # Removing dummy frame pointer (+)
            shift(@tokens);
            push(@frmid, int(shift(@tokens)));
            push(@frmto, hex(shift(@tokens)));
        }
    }

    # If anything remains, it's the definition (optional)
    my $defn = '';
    if ( scalar(@tokens) > 0 ) {
        # Removing dummy delimiter (|)
        shift(@tokens);
        $defn = join(' ', @tokens);
    }

    # Return the Synset struct as a hash
    # Elements with little value to us have been commented out
    return (
        # 'hereiam'  => $hereiam,
        # 'fnum'     => $fnum,
        'pos'      => $pos,
        'wcount'   => $wcount,
        'words'    => \@words,
        # 'lexid'    => \@lexid,
        # 'ptrcount' => $ptrcount,
        # 'ptrtyp'   => \@ptrtyp,
        # 'ptroff'   => \@ptroff,
        # 'ppos'     => \@ppos,
        # 'pfrom'    => \@pfrom,
        # 'pto'      => \@pto,
        # 'fcount'   => $fcount,
        # 'frmid'    => \@frmid,
        # 'frmto'    => \@frmto,
        'defn'     => $defn
    );
}
# Driver: read records from the files named on the command line (or from
# standard input when none are given) and print the words of every synonym
# set, one word per line.
RECORD:
while ( defined( my $record = <> ) ) {
    # Only lines that start with an 8-digit byte offset are synset records;
    # everything else is skipped.
    next RECORD unless $record =~ /^[0-9]{8}\s/;

    chomp $record;
    my %synset = parse_synset($record);

    # Example: just print the words, one word per line
    print join("\n", @{ $synset{'words'} }), "\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment