#! /usr/bin/python
# coding: utf-8

# In[9]:

import requests
import lxml
from lxml import etree
from lxml import html
import os
import logging
import sys
import hashlib
import re
import datetime

if "debug" in sys.argv :
    logging.basicConfig(level=logging.DEBUG,stream=sys.stderr)
else :
    logging.basicConfig(filename="/dev/null")

logger=logging.getLogger("")


def download_file(url,directory="./",callback=None) :
    """ Downloads url into directory and returns the chosen, unique local filename.
    HTML responses are passed chunk-wise through callback (if callable).
    Since every chunk is parsed on its own, this will not work for HTML
    larger than the 1024 KiB chunk size. """
    local_filename = url.split('/')[-1]
    if not os.path.exists(directory) :
        os.makedirs(directory)
    dfile=os.path.join(directory,local_filename)
    # append a counter to the basename ("name.1.ext", "name.2.ext", ...) until the name is unused
    v=0
    while os.path.exists(dfile) :
        v=v+1
        ds=os.path.splitext(local_filename)
        dfile=os.path.join(directory, "%s.%s%s" % (ds[0],v,ds[1]))
    logger.debug("Getting %s -> %s" % (url,dfile))
    r = requests.get(url,stream=True,verify=False) # verify=False: ngertz figured this out!
    with open(dfile, 'wb') as f :
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk: # filter out keep-alive chunks
                if r.headers.get("content-type","").find("html")>-1 and callable(callback) :
                    tree=html.fromstring(chunk)
                    chunk=etree.tostring(callback(tree,directory=directory),method="html")
                f.write(chunk)
    return dfile
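
# A minimal usage sketch for download_file (hypothetical URL and directory):
# downloading the same file twice illustrates the unique-filename behaviour,
# e.g. "page.html" on the first call and "page.1.html" on the second.
#
#   first  = download_file("https://example.com/page.html", directory="./demo")
#   second = download_file("https://example.com/page.html", directory="./demo")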


# In[10]:

def detrack(url) :
    """ Resolves a single redirect so tracking links can be replaced by their real target. """
    r=requests.get(url,allow_redirects=False,verify=False) # verify: see above
    if r.status_code>299 and r.status_code<400 :
        return r.headers.get("location",url)
    else :
        return url
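
# For illustration (hypothetical URLs): a tracking link that answers with
# "301/302 Location: https://example.com/article" is mapped to that target,
# anything else is returned unchanged.
#
#   detrack("https://t.example-tracker.com/r/abc123")   # -> "https://example.com/article"
#   detrack("https://example.com/article")              # -> unchanged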


def makedate(match) :
    """ re.sub callback: shifts the matched YYYY-MM-DD date back by one day
    when the script runs before 9:00, otherwise keeps it as is. """
    g=match.groups()
    if datetime.datetime.now().hour<9 :
        td=(datetime.datetime.strptime(g[1],"%Y-%m-%d")-datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    else :
        td=g[1]
    return g[0]+td
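
# Example (only relevant if the regex entry in "replace" below is re-enabled):
#
#   re.sub(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)", makedate, "Newsletter vom 2016-03-02")
#   # -> "Newsletter vom 2016-03-01" before 9:00, unchanged afterwards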


# XPath expressions of rows that are removed from the newsletter entirely
empty=["body/table/tr/td/table/tr/td/table/tbody/tr[1]", # display notice ("Darstellungs-Hinweis")
       "body/table/tr/td/table/tr/td/table/tbody/tr[6]", # recipient's email address
       "body/table/tr/td/table/tr/td/table/tbody/tr[5]", # "Newsletter vom xxxx-xx-xx"
      ]

# (xpath, search, replace) triples; search may be a plain string or a compiled
# regex whose replacement is a string or a callback such as makedate
replace=[("head/title","Versicherungsmonitor:","Newsletter:"),
         # ("body/table/tr/td/table/tr/td/table/tbody/tr[5]",re.compile(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)"),makedate),
        ]


def process(tree,directory="./") :
    """ Callback for download_file: sanitizes the parsed HTML tree and returns it. """
    # remove the unwanted elements listed in "empty"
    for e in empty :
        ee=tree.findall(e)
        c=0
        tt=[]
        if ee :
            for rre in ee :
                tt.append(etree.tostring(rre))
                rre.getparent().remove(rre)
                c=c+1
        logger.debug("%s - %s removed: %s " % (e,c,",".join([repr(a) for a in tt])))
    # apply the text replacements listed in "replace"
    for (xp,sr,rp) in replace :
        ee=tree.findall(xp)
        if ee :
            for reg in ee :
                if hasattr(sr,"sub") :
                    t=sr.sub(rp,reg.text_content())
                else :
                    t=reg.text_content().replace(sr,rp)
                if (t!=reg.text_content()) :
                    logger.debug("Replaced %s -> %s in %s" % (reg.text_content(),t,xp))
                    reg.text=t
    # download every referenced image and point src at the local copy
    for a in tree.cssselect("img") :
        fn=re.sub("#.*$","",a.attrib["src"])
        filename=os.path.split(download_file(fn,directory=directory))[1]
        a.attrib["data-original-src"]=a.attrib["src"]
        a.attrib["src"]=filename
    # replace tracking links with their redirect targets and open all links in a new tab
    for a in tree.cssselect("a[href]") :
        ou=a.attrib["href"]
        if ou.find("http")==0 :
            d=detrack(a.attrib["href"])
            if d != a.attrib["href"] :
                logger.debug("Detracked %s -> %s" % (a.attrib["href"],d))
                a.attrib["data-original-href"]=a.attrib["href"]
                try :
                    a.attrib["href"]=d
                except Exception as e :
                    a.attrib["href"]="Error %s" % repr(e)
        a.attrib["target"]="_blank"
    return tree
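
# Sketch of running process on an already saved newsletter (hypothetical
# filenames, assumes absolute http image URLs; normally process is only
# called as the download_file callback):
#
#   tree = html.parse("newsletter.html").getroot()
#   cleaned = etree.tostring(process(tree, directory="./out"), method="html")
#   open("./out/newsletter.clean.html", "wb").write(cleaned)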


def run(url,directory) :
    """ Expands an optional %(hash)s placeholder in directory and downloads url into it. """
    v_directory=directory % { "hash" : hashlib.md5(url).hexdigest() }
    if v_directory != directory :
        if os.path.exists(v_directory) :
            print "directory %s exists. %s not downloaded" % (v_directory,url)
            sys.exit(255)
        else :
            directory=v_directory
    print download_file(url
                        ,directory=directory
                        ,callback=process)
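
# For illustration: run("https://example.com/nl.html", "./archive/%(hash)s/")
# expands the directory to "./archive/<md5 of the URL>/" and exits if that
# directory already exists; without the placeholder the directory is used as given.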


if __name__=="__main__" :
    if len(sys.argv)>2 :
        run(sys.argv[1],sys.argv[2])
        logging.debug("%s copied." % sys.argv[1])
    else :
        print """ %s URL DIRECTORY [debug]

 HTML Sanitizing for Email Newsletters

 --- copies the URL and all referenced <img> files into one directory and changes the <img src> attributes
 --- checks all <a href> links for redirects (3xx) and changes the href attributes accordingly
 --- old attribute values are preserved in data-original-* attributes
 --- certain unnecessary HTML elements, whose XPaths are listed in the array called empty, are removed
 --- text is edited according to the replace array, which lists triples of (xpath, search, replace)
 ---
 --- If DIRECTORY contains the replacement string %%(hash)s, this part of the DIRECTORY will be
 --- replaced by an MD5 hash of the URL, and the program will exit if this DIRECTORY already exists.
 ---

 debug - if present - will lead to copious output.

 ToDo

 Copy CSS as well? Look for images in CSS?
 Allow "overwriting" of changed files when downloading the same URL twice

 """ % sys.argv[0]