Skip to content

Instantly share code, notes, and snippets.

@pypt
Last active February 10, 2020 23:28
Show Gist options
  • Save pypt/915c0fd5bf4258bee584221bfb5514db to your computer and use it in GitHub Desktop.
Save pypt/915c0fd5bf4258bee584221bfb5514db to your computer and use it in GitHub Desktop.
Validate new feed parser
#!/usr/bin/env python3.7
import calendar
import os
import time
from mediawords.feed.parse import parse_feed
# Directory scanned for input feeds (files ending in ".xml").
input_dir = '/feeds/'
# Directory that receives one result file per processed feed.
output_dir = '/feed_results_new/'

# Fail fast if either directory is missing. Explicit checks are used instead
# of `assert` because assertions are stripped when Python runs with -O.
for _required_dir in (input_dir, output_dir):
    if not os.path.isdir(_required_dir):
        raise NotADirectoryError(f"Not an existing directory: {_required_dir}")
def sql_date_to_timestamp(date: str) -> int:
    """Convert a "YYYY-MM-DD HH:MM:SS" date string to a Unix timestamp.

    The date is interpreted as UTC (``calendar.timegm`` does no local
    timezone conversion).

    Args:
        date: Date string in the exact format ``%Y-%m-%d %H:%M:%S``.

    Returns:
        Seconds since the Unix epoch as an integer.

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    return calendar.timegm(time.strptime(date, "%Y-%m-%d %H:%M:%S"))
def is_non_empty_file(fpath: str) -> bool:
    """Return True if ``fpath`` is an existing regular file larger than 0 bytes.

    Args:
        fpath: Path to check.

    Returns:
        True when the path is a regular file with a size above zero; False
        for missing paths, directories, and empty files.
    """
    try:
        return os.path.isfile(fpath) and os.path.getsize(fpath) > 0
    except OSError:
        # The file disappeared (or became inaccessible) between the two
        # checks; treat it the same as a missing file.
        return False
def _summarize_feed(parsed_feed, feed_name: str) -> str:
    """Build the tab-separated result row describing one feed.

    The row has eight columns, in order: feed name, parse-succeeded flag
    (1 or 0), item count, feed title length, summed item title lengths,
    summed item description lengths, number of items that have a publish
    date, and the average publish timestamp of those items. When parsing
    failed (``parsed_feed`` is falsy) every column after the flag is empty;
    the average is also empty when no item has a publish date.

    Each successfully converted publish date is also printed to stdout
    together with its timestamp.

    Args:
        parsed_feed: Value returned by ``parse_feed()``; falsy on failure.
        feed_name: Input file name without its extension; used as the first
            column and in error messages.

    Returns:
        The row as a string terminated by a newline.

    Raises:
        Exception: If an item's publish date cannot be converted to a
            timestamp; the original error is attached as the cause.
    """
    if not parsed_feed:
        # Parse failure: flag it with 0 and leave all metric columns empty.
        columns = [feed_name, 0, '', '', '', '', '', '']
        return "\t".join(map(str, columns)) + "\n"

    items = parsed_feed.items()

    title_length_sum = 0
    description_length_sum = 0
    dated_item_count = 0
    timestamp_sum = 0

    for item in items:
        # Missing titles / descriptions count as zero-length.
        title_length_sum += len(item.title() or '')
        description_length_sum += len(item.description() or '')

        publish_date = item.publish_date_sql()
        if not publish_date:
            continue

        dated_item_count += 1
        try:
            timestamp = sql_date_to_timestamp(publish_date)
        except Exception as ex:
            raise Exception(f"Unable to parse publish date for feed {feed_name}: {ex}") from ex
        print(f"{publish_date}\t{timestamp}")
        timestamp_sum += timestamp

    # int(a / b) is kept as-is so the emitted rows stay comparable with
    # results produced by earlier runs of this script.
    if dated_item_count:
        average_item_timestamp = int(timestamp_sum / dated_item_count)
    else:
        average_item_timestamp = ''

    columns = [
        feed_name,
        1,
        len(items),
        len(parsed_feed.title() or ''),
        title_length_sum,
        description_length_sum,
        dated_item_count,
        average_item_timestamp,
    ]
    return "\t".join(map(str, columns)) + "\n"


# Parse every "*.xml" feed in input_dir and write its one-line summary to a
# file of the same base name in output_dir.
for input_filename in os.listdir(input_dir):
    if not input_filename.endswith('.xml'):
        continue

    output_filename = os.path.splitext(input_filename)[0]
    input_path = os.path.join(input_dir, input_filename)
    output_path = os.path.join(output_dir, output_filename)

    # A non-empty result file means this feed was already processed.
    if is_non_empty_file(output_path):
        continue

    with open(input_path, 'rb') as feed_file:
        raw_feed = feed_file.read()

    # Some feeds have encoding problems
    feed_contents = raw_feed.decode('utf-8', errors='replace')

    parsed_feed = parse_feed(feed_contents)
    result = _summarize_feed(parsed_feed, output_filename)
    print(result)

    with open(output_path, 'w') as output_file:
        output_file.write(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment