public
Last active

Perl script for extracting a specific "copy" div from HTML files that will be imported into Drupal using the Feeds module. Call from the command line by passing in a list of files, e.g.: find dir/to/start/looking/ -name "*html" -exec /script/location/webqueryextract.pl {} \;

  • Download Gist
webqueryextract.pl
Perl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
#!/usr/bin/env perl
use strict;
use warnings;
use HTML::TreeBuilder;
use Web::Query;
 
# Output destination: one timestamped file per run of this script.
my $unique_filename = '/usr/local/etc/httpd/extracted_divs/' . get_timestamp();

# Running GUID for the Feeds import; bumped once per extracted div.
my $guid_counter = 0;

# Separator placed between fields on every output record.
my $field_sep = "|";

# Open the output file for appending, then emit the Feeds mapping
# header as the very first line.
open FILEOUT, ">>", $unique_filename
    or die $!;
print FILEOUT "guid|published|url|title|body\n";
#loop through each filename passed in
#loop through each filename passed in on the command line
foreach my $file_name (@ARGV) {
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($file_name);

    #find every <div class="copy"> in the parsed document
    for my $subtree ($tree->look_down(_tag => "div", class => "copy")) {
        #extract the copy div's child elements as a single HTML string
        my $html = join '', Web::Query->new_from_html($subtree->as_HTML)->find('.copy > *')->html;
        #make sure the body ends with exactly one trailing newline
        $html =~ s/(?<!\n)\z/\n/;
        #echo file name to screen so we know something is happening
        print $file_name . "\n";

        #keep the untouched path for the url field
        my $file_url = $file_name;
        #derive the title from a COPY of the path: $file_name aliases
        #the @ARGV element, so mutating it in place would corrupt @ARGV
        my $title = $file_name;
        #replace path slashes with dashes
        $title =~ s/\//-/g;
        #strip only a trailing ".html" extension; the dot is escaped and
        #the match anchored — the previous s/.html//g deleted ANY char
        #followed by "html", anywhere in the path
        $title =~ s/\.html\z//;

        #emit one record: guid|published|url|title|body
        print FILEOUT $guid_counter++ . $field_sep . time . $field_sep;
        print FILEOUT $file_url . $field_sep . $title . $field_sep;
        print FILEOUT $html;
    }
    #free the parse tree so memory is not leaked across files
    $tree = $tree->delete;
}

#close the write handle and surface any buffered-write errors
close (FILEOUT) or die $!;
 
#helper function for creating timestamp
# Build a filename-safe timestamp such as "7-Mar-2014_142359":
# day-MonthAbbr-year, underscore, zero-padded HHMMSS.
sub get_timestamp{
    my @month_abbr = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
    return sprintf '%u-%s-%u_%02u%02u%02u',
        $mday, $month_abbr[$mon], $year + 1900, $hour, $min, $sec;
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.