Skip to content

Instantly share code, notes, and snippets.

@jimregan
Last active March 9, 2017 08:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jimregan/e5845b6ba240a2e4a3e4d2a43781f7d0 to your computer and use it in GitHub Desktop.
Save jimregan/e5845b6ba240a2e4a3e4d2a43781f7d0 to your computer and use it in GitHub Desktop.
CNG utils
#!/usr/bin/perl
# Reads UTF-16LE text from STDIN and, for the lines following a line that
# contains <body>, prints the text with markup removed as UTF-8 on STDOUT.
# Opening and closing <p>/<div> tags decide where output lines are broken;
# lines without such tags are joined onto the pending text with a space.
use warnings;
use strict;
use utf8;
# Input is decoded as UTF-16LE; both output streams are written as UTF-8.
binmode(STDIN, ":encoding(UTF-16LE)");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
# 1 while we are past a <body> line and have not yet seen </body>.
my $reading = 0;
# Text collected so far that has not been printed yet.
my $last = '';
# 1 when the previously processed line contained an opening <p>/<div> tag.
my $last_para = 0;
while(<STDIN>) {
chomp;
# Remove the first byte-order mark and the first carriage return on the line.
s/\x{FEFF}//;
s/\r//;
# Empty lines are ignored entirely.
next if(/^$/);
# Skip everything up to and including the line that contains <body>.
if($reading == 0) {
if(/<body>/) {
$reading = 1;
}
next;
}
# If the previous line opened a paragraph and this line starts with a
# space, print the pending text and start over with an empty buffer.
# The flag is cleared either way.
if($last_para == 1) {
if(/^ /) {
print "$last\n";
$last = '';
}
$last_para = 0;
}
# Records what this line triggered: 0 = no paragraph tag, 2 = an opening
# tag was handled, 1 = a closing tag was handled (overwrites 2 when the
# line has both).
my $happened = 0;
# Line contains an opening <p> or <div ...> tag (any letter case).
if(/(<[Pp]>|<[Dd][Ii][Vv]([^>]*)>)/) {
$last_para = 1;
# Only the first <p> and the first <P> (with adjacent spaces) are removed
# here; <div> tags stay in $_ and are stripped by clean() below.
s! *<p> *!!;
s! *<P> *!!;
# Print whatever was pending, then start the new paragraph with this line.
print "$last\n" if ($last ne '');
$last = clean($_);
$happened = 2;
}
# Line contains a closing </p> or </div> tag (any letter case).
if(/(<\/[Pp]>|<\/[Dd][Ii][Vv]([^>]*)>)/) {
# Pending text is printed without a newline so that the first piece of
# this line (printed below with a leading space) continues it. This is
# skipped when the same line also opened the paragraph.
print $last if ($happened != 2);
# The split pattern has capturing groups, so the matched closing tags (and
# the inner group's capture) appear in @parts alongside the text pieces.
my @parts = split /(<\/[Pp]>|<\/[Dd][Ii][Vv]([^>]*)>)/;
# With three or more elements, print each element strictly between the
# first and the last on its own line; undefined, empty and "0" elements
# are skipped.
# NOTE(review): these middle pieces are printed before $parts[0] below,
# and captured closing tags clean() down to an empty line - confirm that
# this output order and the blank lines are intended.
if ($#parts > 1) {
for (my $i = 1; $i < $#parts; $i++) {
print clean($parts[$i]) . "\n" if ($parts[$i] && $parts[$i] ne '');
}
}
# The text before the first closing tag ends the current output line.
print " " . clean($parts[0]) . "\n";
# The final element of the split becomes the new pending text.
# NOTE(review): it is stored without clean(), and may be a captured
# closing tag rather than text - verify.
$last = $parts[$#parts];
$happened = 1;
}
# A line without paragraph tags is appended to the pending text.
if($happened == 0) {
$last .= " " . clean($_);
}
# On </body>, print the pending text and stop processing until another
# <body> line is seen. $last is not reset here.
if(m!</body>!) {
print clean($last) ."\n";
$reading = 0;
}
}
# Return a copy of the given string with every <...> tag removed and with
# leading and trailing runs of spaces trimmed. Only the space character is
# trimmed; tabs and newlines are left alone.
sub clean {
    my ($text) = @_;
    $text =~ s/<([^>]*)>//g;    # drop anything between angle brackets
    $text =~ s/^ *//;           # leading spaces
    $text =~ s/ *$//;           # trailing spaces
    return $text;
}
#!/usr/bin/perl
# Prints, as UTF-8, the lines that lie between a line containing <body>
# and a line containing </body> in UTF-16LE input. The marker lines
# themselves are not printed.
#
# Input is every file named on the command line, or STDIN when no file is
# given (a file name of "-" also means STDIN).
use warnings;
use strict;
use utf8;
binmode(STDOUT, ":utf8");

my @sources = @ARGV ? @ARGV : ('-');

# 1 while inside a <body> ... </body> section. Kept outside the file loop
# so that, as before, the state carries over from one input to the next.
my $reading = 0;

for my $source (@sources) {
    my $in;
    if ($source eq '-') {
        $in = \*STDIN;
        binmode($in, ":encoding(UTF-16LE)");
    } else {
        # The decoding layer has to sit on the handle that is actually
        # read. Previously the layer was set on a separate handle while
        # the lines were read through <>, so named files were read as
        # raw bytes.
        open($in, "<:encoding(UTF-16LE)", $source)
            or die "Cannot open $source: $!\n";
    }

    while (my $line = <$in>) {
        chomp $line;
        $line =~ s/\r//g;
        if ($reading == 0) {
            # Outside the body: nothing is printed, including the
            # <body> line itself.
            $reading = 1 if $line =~ /<body>/;
            next;
        }
        if ($line =~ m!</body>!) {
            $reading = 0;
            next;
        }
        print "$line\n";
    }

    close($in) unless $source eq '-';
}
#!/usr/bin/perl
# Reads the UTF-16LE file named by the first command-line argument and,
# for each line containing a <w msd="TAG">WORD</w> element, prints
# "WORD<TAB>TAG" as UTF-8. Only the first such element on a line is used;
# lines without one are skipped.
#
# WORD is lower-cased unless TAG contains "Up", "Np" or "X".
# NOTE(review): presumably those tags mark tokens whose case is
# significant - confirm against the tagset.
use warnings;
use strict;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $path = $ARGV[0];
die "Usage: $0 FILE\n" unless defined $path;

# Fail loudly instead of silently producing no output when the file is
# missing or unreadable.
open(my $in, "<:encoding(UTF-16LE)", $path)
    or die "Cannot open $path: $!\n";

while (my $line = <$in>) {
    next unless $line =~ /<w msd="([^"]*)">([^>]*)<\/w>/;
    # Copy the captures before the next match overwrites them.
    my ($tag, $word) = ($1, $2);
    my $oword = ($tag =~ /Up|Np|X/) ? $word : lc($word);
    print "$oword\t$tag\n";
}

close($in);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment