Skip to content

Instantly share code, notes, and snippets.

@Timopheym
Created May 29, 2015 20:05
Show Gist options
  • Save Timopheym/910b9c7977bb618ea7b0 to your computer and use it in GitHub Desktop.
Save Timopheym/910b9c7977bb618ea7b0 to your computer and use it in GitHub Desktop.
#! /usr/bin/perl
use strict;
use warnings;
# Getting command line arguments:
use Getopt::Long;
# Documentation:
use Pod::Usage;
# I/O Handler
use IO::Handle;
use locale;
use POSIX qw(locale_h);
setlocale(LC_ALL,"");
use FindBin;
use lib "$FindBin::Bin";
require 'english-utils.pl';
## Create a list of words and their frequencies from an input corpus document
## (format: plain text, words separated by spaces, no sentence separators)
## TODO should words with hyphens be expanded? (e.g. three-dimensional)
# Command-line options (both take an integer argument):
#   --stats|-s N   every N input lines, print running line/vocabulary counts
#                  instead of the final word list
#   --cutoff|-c N  in stats mode, additionally print how many words have
#                  been seen at least N times; has no effect without --stats
my $stats;
my $cutoff;
GetOptions(
    'stats|s=i'  => \$stats,
    'cutoff|c=i' => \$cutoff,
) || pod2usage(2);

# The stats interval is used as a modulus on the line counter, so zero would
# abort the run with "Illegal modulus zero" after the first input line, and a
# negative interval is meaningless.  Reject both up front.
if (defined($stats) && $stats < 1) {
    pod2usage(
        -message => "--stats must be a positive integer (got $stats)",
        -exitval => 2,
    );
}

# Word => number of occurrences seen so far.
my %dict;
# Words shorter than this many characters are ignored.
my $min_len = 3;
# Minimum number of occurrences for a word to appear in the final list.
my $min_freq = 1;
# Number of input lines read so far.
my $linecount = 0;
# Return how many distinct words in %dict have been counted at least
# as many times as the given cutoff.
sub count_cutoff($) {
    my ($threshold) = @_;
    # grep in scalar context yields the number of matching counts.
    return scalar(grep { $_ >= $threshold } values %dict);
}
# Read the corpus line by line (files named on the command line, or STDIN)
# and tally every acceptable word in %dict.
LINE:
while (<>) {
    $linecount++;
    chomp;

    WORD:
    for my $word (split " ") {
        # Too short to be of interest.
        next WORD if length($word) < $min_len;
        # Must start with A-Z and continue with A-Z, apostrophes or hyphens.
        next WORD unless $word =~ /^[A-Z][A-Z\'-]+$/;
        # A word containing an apostrophe needs allow_apostrophe()'s approval.
        next WORD if index($word, "'") >= 0 && !allow_apostrophe($word);

        $dict{$word}++;
    }

    # Running statistics are only printed in --stats mode, every $stats lines.
    next LINE unless defined($stats);
    next LINE if $linecount % $stats != 0;

    # Columns: lines read, vocabulary size and, when --cutoff was given,
    # the number of words seen at least $cutoff times.
    my @columns = ($linecount, scalar(keys %dict));
    push @columns, count_cutoff($cutoff) if defined($cutoff);
    print join("\t", @columns) . "\n";
}
# Final word list (skipped in --stats mode): one "WORD<TAB>COUNT" line for
# every word seen at least $min_freq times.  The words are sorted so that the
# output is reproducible; Perl's hash iteration order differs between runs.
if (!defined($stats)) {
    foreach my $dictword (sort keys %dict) {
        next if $dict{$dictword} < $min_freq;
        print "$dictword\t$dict{$dictword}\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment