public
Last active

Perl script for extracting a specific "copy" div from HTML files that will be imported into Drupal using the Feeds module. Call from the command line by passing in a list of files, e.g.: find dir/to/start/looking/ -name "*html" -exec /script/location/webqueryextract.pl {} \;

  • Download Gist
webqueryextract.pl
Perl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
#!/usr/bin/env perl
use strict;
use warnings;
use HTML::TreeBuilder;
use Web::Query;
 
# Output destination: one timestamped file per run of this script.
my $unique_filename = '/usr/local/etc/httpd/extracted_divs/' . get_timestamp();

# Running GUID for the Feeds import; bumped once per extracted div.
my $guid_counter = 0;

# Separator placed between fields on every output record.
my $field_sep = "|";

# Open the output file for appending, then emit the Feeds mapping
# header as the very first line.
open FILEOUT, ">>", $unique_filename
    or die $!;
print FILEOUT "guid|published|url|title|body\n";
#loop through each filename passed in
#loop through each filename passed in on the command line
foreach my $file_name (@ARGV) {
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($file_name);

    #find every <div class="copy"> in the parsed document
    for my $subtree ($tree->look_down(_tag => "div", class => "copy")) {
        #extract the copy div's child elements as a single HTML string
        my $html = join '', Web::Query->new_from_html($subtree->as_HTML)->find('.copy > *')->html;
        #make sure the body ends with exactly one trailing newline
        $html =~ s/(?<!\n)\z/\n/;
        #echo file name to screen so we know something is happening
        print $file_name . "\n";

        #keep the untouched path for the url field
        my $file_url = $file_name;
        #derive the title from a COPY of the path: $file_name aliases
        #the @ARGV element, so mutating it in place would corrupt @ARGV
        my $title = $file_name;
        #replace path slashes with dashes
        $title =~ s/\//-/g;
        #strip only a trailing ".html" extension; the dot is escaped and
        #the match anchored — the previous s/.html//g deleted ANY char
        #followed by "html", anywhere in the path
        $title =~ s/\.html\z//;

        #emit one record: guid|published|url|title|body
        print FILEOUT $guid_counter++ . $field_sep . time . $field_sep;
        print FILEOUT $file_url . $field_sep . $title . $field_sep;
        print FILEOUT $html;
    }
    #free the parse tree so memory is not leaked across files
    $tree = $tree->delete;
}

#close the write handle and surface any buffered-write errors
close (FILEOUT) or die $!;
 
#helper function for creating timestamp
# Build a filename-safe timestamp such as "7-Mar-2014_142359":
# day-MonthAbbr-year, underscore, zero-padded HHMMSS.
sub get_timestamp{
    my @month_abbr = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
    return sprintf '%u-%s-%u_%02u%02u%02u',
        $mday, $month_abbr[$mon], $year + 1900, $hour, $min, $sec;
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.