Skip to content

Instantly share code, notes, and snippets.

@boogheta
Created November 12, 2019 16:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boogheta/18649d71f1de38af829f70d133c62034 to your computer and use it in GitHub Desktop.
Save boogheta/18649d71f1de38af829f70d133c62034 to your computer and use it in GitHub Desktop.
Factiva's html parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys, csv, json
from datetime import datetime
from pyquery import PyQuery as pq
# TODO
# - Remove duplicates
# - Filtering
# + sources online
# + date >= 2015
# + nb words >= 500
# - Assemble articles by media
# - Build Google queries such as site:source_domain ("article title 1" OR "article title 2" OR "article title 3" OR ...) (max number of articles per query to be determined, but probably not much more than 5 or 10)
# - Run Google queries with a big throttle
def clean_french(v):
    """Swap French month names found in *v* for their English equivalents.

    The substitution is a plain, case-sensitive substring replacement,
    applied one month after the other in calendar order. Text that holds
    no (lowercase) French month name comes back unchanged.
    """
    month_pairs = (
        (u"janvier", "January"),
        (u"février", "February"),
        (u"mars", "March"),
        (u"avril", "April"),
        (u"mai", "May"),
        (u"juin", "June"),
        (u"juillet", "July"),
        (u"août", "August"),
        (u"septembre", "September"),
        (u"octobre", "October"),
        (u"novembre", "November"),
        (u"décembre", "December"),
    )
    translated = v
    for french, english in month_pairs:
        translated = translated.replace(french, english)
    return translated
def parse_date(v):
    """Turn a "day month year" date such as ``12 février 2019`` into ``YYYY-MM-DD``.

    French month names are first translated by ``clean_french``; the result
    is then parsed with the ``"%d %B %Y"`` format (``%B`` matches the month
    names of the current locale, English unless the locale was changed).

    Raises:
        ValueError: when the translated text does not match the format.
    """
    parsed = datetime.strptime(clean_french(v), "%d %B %Y")
    return parsed.date().isoformat()
def parse_factiva_html_file(inputfile):
    """Extract the articles found in one HTML file.

    Every element matching ``.article table`` is read as one article. Each
    of its ``<tr>`` rows contributes one entry: the text of the first
    ``<td>`` is the field name and the text of the last ``<td>`` its value.
    The value of the ``PD`` field (presumably the publication date -- confirm
    against the Factiva export format) is converted to ``YYYY-MM-DD`` with
    ``parse_date``; when that fails a warning goes to stderr and the raw
    text is kept.

    Args:
        inputfile: path of the HTML file to parse.

    Returns:
        A ``(fields, articles)`` tuple: the set of all field names met, and
        the list of articles as ``{field name: value}`` dicts. Names and
        values are UTF-8 byte strings on Python 2 (what its ``csv`` module
        expects) and native text on Python 3.
    """
    fields = set()
    articles = []
    on_python2 = sys.version_info[0] < 3
    d = pq(filename=inputfile)
    for article in d(".article table"):
        art = {}
        for tr in pq(article)("tr"):
            row = pq(tr)
            key = row("td:first").text()
            val = row("td:last").text()
            # Test the field name while it is still text: once encoded, the
            # comparison with "PD" could never succeed on Python 3.
            if key == "PD":
                try:
                    val = parse_date(val)
                except ValueError:
                    # Plain write() instead of the Python-2-only
                    # `print >> sys.stderr` statement; %r keeps accented or
                    # whitespace-padded values readable and encodable.
                    sys.stderr.write(
                        " => WARNING: could not convert date %r in %s\n"
                        % (val, inputfile)
                    )
            if on_python2:
                key = key.encode("utf-8")
                val = val.encode("utf-8")
            fields.add(key)
            art[key] = val
        articles.append(art)
    return fields, articles
def parse_factiva_dir(inputdir):
    """Parse every ``.html`` file of a directory and dump the articles as CSV.

    The CSV is written next to the directory, as ``<inputdir>.csv``, with
    one column per field name met in any article and one row per article.
    The name of each parsed file is printed as a progress trace.

    Args:
        inputdir: path of the directory holding the HTML files; trailing
            path separators are ignored when naming the CSV file.
    """
    fields = set()
    articles = []
    # os.listdir returns entries in arbitrary order: sort them so that the
    # row order of the CSV is reproducible from one run to the next.
    for inputfile in sorted(os.listdir(inputdir)):
        if not inputfile.endswith(".html"):
            continue
        print(inputfile)
        f, a = parse_factiva_html_file(os.path.join(inputdir, inputfile))
        fields |= f
        articles += a

    # Without this, "exports/" would produce the hidden file "exports/.csv"
    # inside the input directory instead of "exports.csv" beside it.
    separators = os.sep + (os.altsep or "")
    outputfile = "%s.csv" % (inputdir.rstrip(separators) or inputdir)

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3 (otherwise line endings get mangled on
    # some platforms).
    if sys.version_info[0] >= 3:
        csvfile = open(outputfile, "w", newline="", encoding="utf-8")
    else:
        csvfile = open(outputfile, "wb")
    with csvfile:
        # Sorted header: a set has no stable order, so columns would
        # otherwise move around between runs.
        writer = csv.DictWriter(csvfile, sorted(fields))
        writer.writeheader()
        writer.writerows(articles)
if __name__ == "__main__":
    # Command line entry point: takes either one .html file or a directory
    # of .html files as its single argument.
    if len(sys.argv) < 2:
        # Exit with a usage message rather than a bare IndexError.
        sys.exit("Usage: %s <file.html | directory>" % sys.argv[0])
    inpt = sys.argv[1]
    if inpt.endswith(".html"):
        # Single file: only display the field names met and the last article
        # parsed (None when the file holds no article).
        fields, articles = parse_factiva_html_file(inpt)
        print(fields, articles[-1] if articles else None)
    else:
        # Directory: parse every .html file in it and write "<directory>.csv".
        parse_factiva_dir(inpt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment