HTML Sanitizing for Email Newsletters

Usage: mailmirror.py URL DIRECTORY [debug]

  • copies URL and all referenced img files into DIRECTORY, making sure the filenames are unique
  • changes the img src attributes accordingly
  • checks all a href values to see whether they produce a 301 redirect and changes the href attribute accordingly
  • old attribute values are preserved in data-original-* attributes (see the sketch after this list)

debug - if present - will lead to copious output.
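The rewriting keeps the old attribute values around, so the mirrored page can still be traced back to the original. A minimal sketch of that rewriting (standalone, not part of mailmirror.py; the image URL and the local filename "logo.png" are made up for illustration):

# Minimal sketch of the src rewriting, assuming lxml is installed.
# The image URL and the local filename are examples only.
from lxml import etree, html

img = html.fragment_fromstring('<img src="http://cdn.example.com/logo.png">')
img.attrib["data-original-src"] = img.attrib["src"]   # preserve the old value
img.attrib["src"] = "logo.png"                        # point to the downloaded copy
print(etree.tostring(img, method="html"))
# <img src="logo.png" data-original-src="http://cdn.example.com/logo.png">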

ToDo

Copy CSS as well? Look for images in CSS?

Depends on

  • lxml
  • requests

Install with

pip install lxml requests

or similar.

#! /usr/bin/python
# coding: utf-8

# In[9]:

import requests
import lxml
from lxml import etree
from lxml import html
import os
import logging
import sys
import md5
import re
import datetime

# log to stderr when "debug" is given on the command line, otherwise discard log output
if "debug" in sys.argv:
    logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
else:
    logging.basicConfig(filename="/dev/null")

logger = logging.getLogger("")
def download_file(url, directory="./", callback=False):
    """ Puts the file at url into directory; processes HTML with callback if present.
        Chooses a unique filename; the chosen filename is returned.
        Will not work for HTML > 1024 kB, because the callback only sees the first chunk. """
    local_filename = url.split('/')[-1]
    if not os.path.exists(directory):
        os.makedirs(directory)
    dfile = os.path.join(directory, local_filename)
    v = 0
    # append .1, .2, ... before the extension until the filename is unique
    while os.path.exists(dfile):
        v = v + 1
        ds = os.path.splitext(local_filename)
        dfile = os.path.join(directory, "%s.%s%s" % (ds[0], v, ds[1]))
    logger.debug("Getting %s -> %s" % (url, dfile))
    r = requests.get(url, stream=True, verify=False)  # verify=False: ngertz figured this out!
    with open(dfile, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:  # filter out keep-alive new chunks
                if r.headers["content-type"].find("html") > -1 and callable(callback):
                    tree = html.fromstring(chunk)
                    chunk = etree.tostring(callback(tree, directory=directory), method="html")
                f.write(chunk)
    return dfile
# In[10]:

def detrack(url):
    """ Follows a single redirect step: if url answers with a 3xx status,
        the Location header is returned, otherwise the url itself. """
    r = requests.get(url, allow_redirects=False, verify=False)  # verify: see above
    if r.status_code > 299 and r.status_code < 400:
        return r.headers["location"]
    else:
        return url


def makedate(match):
    """ Returns the matched prefix plus the matched date; before 9:00 the date
        is shifted back by one day. """
    g = match.groups()
    if datetime.datetime.now().hour < 9:
        td = (datetime.datetime.strptime(g[1], "%Y-%m-%d") - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    else:
        td = g[1]
    return g[0] + td


# XPaths of elements that are removed from the newsletter
empty = ["body/table/tr/td/table/tr/td/table/tbody/tr[1]",  # rendering hint
         "body/table/tr/td/table/tr/td/table/tbody/tr[6]",  # recipient's e-mail address
         "body/table/tr/td/table/tr/td/table/tbody/tr[5]",  # "Newsletter vom xxxx-xx-xx"
         ]

# triples of (xpath, search, replace) applied to the text content of matching elements
replace = [("head/title", "Versicherungsmonitor:", "Newsletter:"),
           # ("body/table/tr/td/table/tr/td/table/tbody/tr[5]", re.compile(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)"), makedate),
           ]
def process(tree, directory="./"):
    """ Sanitizes the parsed HTML tree: removes the elements listed in empty, applies the
        replacements from replace, localizes images and strips redirect trackers from links. """
    # remove unwanted elements
    for e in empty:
        ee = tree.findall(e)
        c = 0
        tt = []
        if ee:
            for rre in ee:
                tt.append(etree.tostring(rre))
                rre.getparent().remove(rre)
                c = c + 1
        logger.debug("%s - %s removed: %s " % (e, c, ",".join([repr(a) for a in tt])))
    # apply text replacements
    for (xp, sr, rp) in replace:
        ee = tree.findall(xp)
        if ee:
            for reg in ee:
                if hasattr(sr, "sub"):  # compiled regex
                    t = sr.sub(rp, reg.text_content())
                else:                   # plain string
                    t = reg.text_content().replace(sr, rp)
                if t != reg.text_content():
                    logger.debug("Replaced %s -> %s in %s" % (reg.text_content(), t, xp))
                    reg.text = t
    # download images and point the src attribute to the local copy
    for a in tree.cssselect("img"):
        fn = re.sub("#.*$", "", a.attrib["src"])
        filename = os.path.split(download_file(fn, directory=directory))[1]
        a.attrib["data-original-src"] = a.attrib["src"]
        a.attrib["src"] = filename
    # replace tracking redirects in links with their real targets
    for a in tree.cssselect("a[href]"):
        ou = a.attrib["href"]
        if ou.find("http") == 0:
            d = detrack(a.attrib["href"])
            if d != a.attrib["href"]:
                logger.debug("Detracked %s -> %s" % (a.attrib["href"], d))
                a.attrib["data-original-href"] = a.attrib["href"]
                try:
                    a.attrib["href"] = d
                except Exception, e:
                    a.attrib["href"] = "Error %s" % repr(e)
        a.attrib["target"] = "_blank"
    return tree
def run(url, directory):
    # expand an optional %(hash)s placeholder in the directory name with the MD5 of the URL
    v_directory = directory % {"hash": md5.md5(url).hexdigest()}
    if v_directory != directory:
        if os.path.exists(v_directory):
            print "directory %s exists. %s not downloaded" % (v_directory, url)
            sys.exit(255)
        else:
            directory = v_directory
    print download_file(url, directory=directory, callback=process)


if __name__ == "__main__":
    if len(sys.argv) > 2:
        run(sys.argv[1], sys.argv[2])
        logger.debug("%s copied." % sys.argv[1])
    else:
        print """%s URL DIRECTORY [debug]
HTML Sanitizing for Email Newsletters
--- copies URL and all referenced <img> files into one directory, changes the <img src> attribute
--- checks all <a href> to see if they produce a 301 and changes the href attribute accordingly
--- old attribute values are preserved in data-original-* attributes
--- certain unnecessary HTML elements, whose XPaths are listed in the array called empty, are removed
--- Text is edited according to the replace array, which lists triples of (xpath, search, replace)
---
--- If DIRECTORY contains the replacement string %%(hash)s, this part of the DIRECTORY will be
--- replaced by an MD5 hash of the URL, and the program will exit if this DIRECTORY already exists.
---
debug - if present - will lead to copious output.
ToDo
Copy CSS as well? Look for images in CSS?
Allow "overwriting" of changed files when downloading the same URL twice
""" % sys.argv[0]
#! /bin/bash
rm -rf data-directory/*
./mailmirror.py http://tools.emailsys.net/mailing/101/472828/2109456/6vd9l7/index.html data-directory debug