Skip to content

Instantly share code, notes, and snippets.

@pypt
Last active February 10, 2020 23:28
Show Gist options
  • Save pypt/98a3bf83be5da11f9cbe0a9ec3a6e83d to your computer and use it in GitHub Desktop.
Save pypt/98a3bf83be5da11f9cbe0a9ec3a6e83d to your computer and use it in GitHub Desktop.
Validate old feed parser
#!/usr/bin/env perl
use strict;
use warnings;
use Encode;
use File::Basename;
use File::Slurp;
use Time::Piece;
use MediaWords::Feed::Parse;
# Directory that holds the raw feed dumps (*.xml) to be run through the parser.
my $input_dir = '/feeds/';

# Directory that receives one small result file per processed feed.
my $output_dir = '/feed_results_old/';

# The bareword handle DIR is consumed by the main loop further down, so it is
# kept as-is. Name the directory in the error message so that a failure is
# actionable (a bare $! only says e.g. "No such file or directory").
opendir(DIR, $input_dir) or die "Unable to open input directory $input_dir: $!";
# Convert a date string in "YYYY-MM-DD HH:MM:SS" form into an epoch timestamp.
#
# Parameters:
#   * the date string to convert (the ($) prototype forces scalar context).
#
# Returns the value of Time::Piece's epoch() for the parsed date.
#
# Any error raised by Time::Piece->strptime() for an unparseable string is not
# caught here and propagates to the caller.
sub sql_date_to_timestamp($)
{
    my ( $sql_date ) = @_;

    return Time::Piece->strptime( $sql_date, '%Y-%m-%d %H:%M:%S' )->epoch;
}
# Walk every "*.xml" file in $input_dir, run it through
# MediaWords::Feed::Parse::parse_feed() and write a one-line, tab-separated
# summary both to STDOUT and to a result file in $output_dir.
#
# Summary columns, in order:
#   1. result file name (feed file name without its last extension)
#   2. 1 if parse_feed() returned a true value, 0 otherwise
#   3. number of items in the feed
#   4. length of the feed title
#   5. summed length of all item titles
#   6. summed length of all item descriptions
#   7. number of items whose publish date could be converted to a timestamp
#   8. average of those timestamps (integer), or empty if there were none
#
# Columns 3-8 are empty strings when parsing failed.
FEED_FILE:
while ( my $input_filename = readdir(DIR) ) {

    # Only feed dumps are of interest. \z (unlike $) does not tolerate a
    # trailing newline in the file name.
    next FEED_FILE unless $input_filename =~ /\.xml\z/;

    my $input_path = "$input_dir/$input_filename";

    # The result file is named after the feed file minus its last extension.
    my $output_filename = substr $input_filename, 0, rindex( $input_filename, q{.} );
    my $output_path = "$output_dir/$output_filename";

    # Skip feeds whose result file already exists and is non-empty.
    next FEED_FILE if -s $output_path;

    my $feed_contents = read_file( $input_path );
    $feed_contents = decode_utf8( $feed_contents );

    my $parsed_feed = MediaWords::Feed::Parse::parse_feed( $feed_contents );

    # Defaults describe a failed parse; they are overwritten on success.
    my $parse_succeeded                   = 0;
    my $item_count                        = '';
    my $feed_title_length                 = '';
    my $total_items_title_length          = '';
    my $total_items_description_length    = '';
    my $total_items_defined_publish_dates = '';
    my $average_item_timestamp            = '';

    if ( $parsed_feed ) {

        # Fetch the item list once and reuse it for counting and iterating.
        # NOTE(review): assumes items() is a plain accessor without side
        # effects -- confirm against MediaWords::Feed::Parse.
        my $items = $parsed_feed->items();

        $parse_succeeded                   = 1;
        $item_count                        = scalar( @{ $items } );
        $feed_title_length                 = length( $parsed_feed->title() // '' );
        $total_items_title_length          = 0;
        $total_items_description_length    = 0;
        $total_items_defined_publish_dates = 0;

        my $total_items_timestamp = 0;

        ITEM:
        for my $item ( @{ $items } ) {
            $total_items_title_length       += length( $item->title()       // '' );
            $total_items_description_length += length( $item->description() // '' );

            # Items without a publish date do not take part in the average.
            my $publish_date = $item->publish_date_sql();
            next ITEM unless $publish_date;

            # Test eval's own return value instead of $@ alone: $@ can be
            # clobbered between the failure and the check, whereas the
            # trailing "1" is only reached when the conversion did not die.
            my $item_timestamp;
            my $converted = eval {
                $item_timestamp = sql_date_to_timestamp( $publish_date );
                1;
            };
            unless ( $converted ) {
                warn "Unable to parse publish date for feed $output_filename: $@";
                next ITEM;
            }

            # Per-item trace line: SQL date and the timestamp derived from it.
            print $publish_date . "\t" . $item_timestamp . "\n";

            $total_items_timestamp += $item_timestamp;
            $total_items_defined_publish_dates++;
        }

        # Guard against division by zero when no item had a usable date.
        if ( $total_items_defined_publish_dates ) {
            $average_item_timestamp = int( $total_items_timestamp / $total_items_defined_publish_dates );
        }
    }

    my $result = join(
        "\t",
        $output_filename,
        $parse_succeeded,
        $item_count,
        $feed_title_length,
        $total_items_title_length,
        $total_items_description_length,
        $total_items_defined_publish_dates,
        $average_item_timestamp,
    ) . "\n";

    print $result;
    write_file( $output_path, $result );
}

# Release the directory handle once every entry has been visited.
closedir(DIR);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment