Created
May 31, 2012 13:57
-
-
Save mcphersoncreative/2843570 to your computer and use it in GitHub Desktop.
Perl script for extracting specific "copy" div from html files that will be imported into Drupal using Feeds module. Call from command line by passing in list of files. ex: find dir/to/start/looking/ -name "*html" -exec /script/location/webqueryextract.p
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use HTML::TreeBuilder; | |
use Web::Query; | |
#location to write output | |
my $unique_filename = '/usr/local/etc/httpd/extracted_divs/' . get_timestamp(); | |
my $guid_counter = 0; | |
#field separator | |
my $field_sep = "|"; | |
#open output file | |
open FILEOUT, ">>", $unique_filename or die $!; | |
#write map header as first line of file | |
print FILEOUT "guid|published|url|title|body\n"; | |
#loop through each filename passed in | |
foreach my $file_name (@ARGV) { | |
my $tree = HTML::TreeBuilder->new; | |
$tree->parse_file($file_name); | |
#find the copy div | |
for my $subtree ($tree->look_down(_tag => "div", class => "copy")) { | |
#extract copy div children | |
my $html = join '', Web::Query->new_from_html($subtree->as_HTML)->find('.copy > *')->html; | |
#steamline HTML | |
$html =~ s/(?<!\n)\z/\n/; | |
#echo file name to screen so we know something is happening | |
print $file_name . "\n"; | |
#replace slashes with spaces | |
my $file_url = $file_name; | |
$file_name =~ s/\//-/g; | |
#remove ".html" | |
$file_name =~ s/.html//g; | |
#echo info to output file | |
print FILEOUT $guid_counter++ . $field_sep . time . $field_sep; | |
print FILEOUT $file_url . $field_sep . $file_name . $field_sep; | |
print FILEOUT $html; | |
} | |
$tree = $tree->delete; | |
} | |
close (FILEOUT); | |
#helper function for creating timestamp | |
sub get_timestamp{ | |
my @abbr = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ); | |
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); | |
$year += 1900; | |
return $mday . "-" . $abbr[$mon] . "-" . $year . "_" . sprintf('%02u%02u%02u',$hour,$min,$sec); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment