Created
November 12, 2019 16:20
-
-
Save boogheta/18649d71f1de38af829f70d133c62034 to your computer and use it in GitHub Desktop.
Factiva's html parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import os, sys, csv, json | |
from datetime import datetime | |
from pyquery import PyQuery as pq | |
# TODO | |
# - Remove duplicates | |
# - Filtering | |
# + sources online | |
# + date >= 2015 | |
# + nb words >= 500 | |
# - Assemble articles by media | |
# - Build Google queries such as site:source_domain ("article title 1" OR "article title 2" OR "article title 3" OR ...) (max number of articles to determine, but probably not much more than 5 or 10)
# - Run Google queries with a big throttle | |
def clean_french(v):
    """Translate French month names in *v* to their English equivalents.

    Used to normalize Factiva's French-locale dates so that
    ``datetime.strptime`` with English ``%B`` can parse them.
    """
    months = {
        u"janvier": "January",
        u"février": "February",
        u"mars": "March",
        u"avril": "April",
        u"mai": "May",
        u"juin": "June",
        u"juillet": "July",
        u"août": "August",
        u"septembre": "September",
        u"octobre": "October",
        u"novembre": "November",
        u"décembre": "December",
    }
    # No French month name is a substring of another entry or of any
    # replacement, so the substitutions are order-independent.
    for french, english in months.items():
        v = v.replace(french, english)
    return v
parse_date = lambda v: datetime.strptime(clean_french(v), "%d %B %Y").isoformat()[:10] | |
def parse_factiva_html_file(inputfile):
    """Parse one Factiva HTML export file.

    Each article is a <table> under an ".article" element; each table row
    holds a Factiva field code in its first cell and the value in its last
    cell. The "PD" (publication date) field is converted to an ISO
    YYYY-MM-DD string when possible.

    Returns a ``(fields, articles)`` tuple: ``fields`` is the set of field
    codes seen, ``articles`` a list of ``{field: value}`` dicts.
    """
    fields = set()
    articles = []
    d = pq(filename=inputfile)
    for article in d(".article table"):
        art = {}
        for tr in pq(article)("tr"):
            # Python 3 strings are already unicode: the original
            # ``.encode("utf-8")`` calls produced bytes keys/values that
            # break csv.DictWriter, and ``print >> sys.stderr`` is
            # Python-2-only syntax in a file otherwise using print().
            key = pq(tr)("td:first").text()
            fields.add(key)
            val = pq(tr)("td:last").text()
            if key == "PD":
                try:
                    val = parse_date(val)
                except ValueError:
                    # Keep the raw string when the date cannot be parsed.
                    print(" => WARNING: could not convert date", val, article,
                          file=sys.stderr)
            art[key] = val
        articles.append(art)
    return fields, articles
def parse_factiva_dir(inputdir):
    """Parse every ``*.html`` Factiva export found in *inputdir* and write
    all articles to ``<inputdir>.csv``, one column per field code seen.

    Each processed filename is echoed to stdout as a progress indicator.
    """
    fields = set()
    articles = []
    for inputfile in os.listdir(inputdir):
        if not inputfile.endswith(".html"):
            continue
        print(inputfile)
        f, a = parse_factiva_html_file(os.path.join(inputdir, inputfile))
        fields |= f
        articles += a
    # newline="" is required by the csv module so it controls line endings
    # itself (avoids blank lines on Windows); write UTF-8 explicitly
    # instead of relying on the locale default. sorted() makes the column
    # order deterministic instead of set-iteration order.
    with open("%s.csv" % inputdir, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, sorted(fields))
        writer.writeheader()
        writer.writerows(articles)
if __name__ == "__main__": | |
inpt = sys.argv[1] | |
if inpt.endswith(".html"): | |
fields, articles = parse_factiva_html_file(inpt) | |
print(fields, articles[-1]) | |
else: | |
parse_factiva_dir(inpt) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment