#! /usr/bin/python
# coding: utf-8

# In[9]:

import requests
import lxml
from lxml import etree
from lxml import html
import os
import logging
import sys
import hashlib
import re
import datetime

if "debug" in sys.argv :
    logging.basicConfig(level=logging.DEBUG,stream=sys.stderr)
else :
    logging.basicConfig(filename="/dev/null")

logger=logging.getLogger("")


def download_file(url,directory="./",callback=None) :
    """ Downloads url into directory and returns the chosen, unique local filename.
    HTML responses are passed chunk-wise through callback (if callable).
    Since every chunk is parsed on its own, this will not work for HTML
    larger than the 1024 KiB chunk size. """
    local_filename = url.split('/')[-1]
    if not os.path.exists(directory) :
        os.makedirs(directory)
    dfile=os.path.join(directory,local_filename)
    # append a counter to the basename ("name.1.ext", "name.2.ext", ...) until the name is unused
    v=0
    while os.path.exists(dfile) :
        v=v+1
        ds=os.path.splitext(local_filename)
        dfile=os.path.join(directory, "%s.%s%s" % (ds[0],v,ds[1]))
    logger.debug("Getting %s -> %s" % (url,dfile))
    r = requests.get(url,stream=True,verify=False) # verify=False: ngertz figured this out!
    with open(dfile, 'wb') as f :
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk: # filter out keep-alive chunks
                if r.headers.get("content-type","").find("html")>-1 and callable(callback) :
                    tree=html.fromstring(chunk)
                    chunk=etree.tostring(callback(tree,directory=directory),method="html")
                f.write(chunk)
    return dfile
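
# A minimal usage sketch for download_file (hypothetical URL and directory):
# downloading the same file twice illustrates the unique-filename behaviour,
# e.g. "page.html" on the first call and "page.1.html" on the second.
#
#   first  = download_file("https://example.com/page.html", directory="./demo")
#   second = download_file("https://example.com/page.html", directory="./demo")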


# In[10]:

def detrack(url) :
    """ Resolves a single redirect so tracking links can be replaced by their real target. """
    r=requests.get(url,allow_redirects=False,verify=False) # verify: see above
    if r.status_code>299 and r.status_code<400 :
        return r.headers.get("location",url)
    else :
        return url
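
# For illustration (hypothetical URLs): a tracking link that answers with
# "301/302 Location: https://example.com/article" is mapped to that target,
# anything else is returned unchanged.
#
#   detrack("https://t.example-tracker.com/r/abc123")   # -> "https://example.com/article"
#   detrack("https://example.com/article")              # -> unchanged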


def makedate(match) :
    """ re.sub callback: shifts the matched YYYY-MM-DD date back by one day
    when the script runs before 9:00, otherwise keeps it as is. """
    g=match.groups()
    if datetime.datetime.now().hour<9 :
        td=(datetime.datetime.strptime(g[1],"%Y-%m-%d")-datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    else :
        td=g[1]
    return g[0]+td
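
# Example (only relevant if the regex entry in "replace" below is re-enabled):
#
#   re.sub(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)", makedate, "Newsletter vom 2016-03-02")
#   # -> "Newsletter vom 2016-03-01" before 9:00, unchanged afterwards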


# XPath expressions of rows that are removed from the newsletter entirely
empty=["body/table/tr/td/table/tr/td/table/tbody/tr[1]", # display notice ("Darstellungs-Hinweis")
       "body/table/tr/td/table/tr/td/table/tbody/tr[6]", # recipient's email address
       "body/table/tr/td/table/tr/td/table/tbody/tr[5]", # "Newsletter vom xxxx-xx-xx"
      ]

# (xpath, search, replace) triples; search may be a plain string or a compiled
# regex whose replacement is a string or a callback such as makedate
replace=[("head/title","Versicherungsmonitor:","Newsletter:"),
         # ("body/table/tr/td/table/tr/td/table/tbody/tr[5]",re.compile(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)"),makedate),
        ]


def process(tree,directory="./") :
    """ Callback for download_file: sanitizes the parsed HTML tree and returns it. """
    # remove the unwanted elements listed in "empty"
    for e in empty :
        ee=tree.findall(e)
        c=0
        tt=[]
        if ee :
            for rre in ee :
                tt.append(etree.tostring(rre))
                rre.getparent().remove(rre)
                c=c+1
        logger.debug("%s - %s removed: %s " % (e,c,",".join([repr(a) for a in tt])))
    # apply the text replacements listed in "replace"
    for (xp,sr,rp) in replace :
        ee=tree.findall(xp)
        if ee :
            for reg in ee :
                if hasattr(sr,"sub") :
                    t=sr.sub(rp,reg.text_content())
                else :
                    t=reg.text_content().replace(sr,rp)
                if (t!=reg.text_content()) :
                    logger.debug("Replaced %s -> %s in %s" % (reg.text_content(),t,xp))
                    reg.text=t
    # download every referenced image and point src at the local copy
    for a in tree.cssselect("img") :
        fn=re.sub("#.*$","",a.attrib["src"])
        filename=os.path.split(download_file(fn,directory=directory))[1]
        a.attrib["data-original-src"]=a.attrib["src"]
        a.attrib["src"]=filename
    # replace tracking links with their redirect targets and open all links in a new tab
    for a in tree.cssselect("a[href]") :
        ou=a.attrib["href"]
        if ou.find("http")==0 :
            d=detrack(a.attrib["href"])
            if d != a.attrib["href"] :
                logger.debug("Detracked %s -> %s" % (a.attrib["href"],d))
                a.attrib["data-original-href"]=a.attrib["href"]
                try :
                    a.attrib["href"]=d
                except Exception as e :
                    a.attrib["href"]="Error %s" % repr(e)
        a.attrib["target"]="_blank"
    return tree
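
# Sketch of running process on an already saved newsletter (hypothetical
# filenames, assumes absolute http image URLs; normally process is only
# called as the download_file callback):
#
#   tree = html.parse("newsletter.html").getroot()
#   cleaned = etree.tostring(process(tree, directory="./out"), method="html")
#   open("./out/newsletter.clean.html", "wb").write(cleaned)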


def run(url,directory) :
    """ Expands an optional %(hash)s placeholder in directory and downloads url into it. """
    v_directory=directory % { "hash" : hashlib.md5(url).hexdigest() }
    if v_directory != directory :
        if os.path.exists(v_directory) :
            print "directory %s exists. %s not downloaded" % (v_directory,url)
            sys.exit(255)
        else :
            directory=v_directory
    print download_file(url
                        ,directory=directory
                        ,callback=process)
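
# For illustration: run("https://example.com/nl.html", "./archive/%(hash)s/")
# expands the directory to "./archive/<md5 of the URL>/" and exits if that
# directory already exists; without the placeholder the directory is used as given.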


if __name__=="__main__" :
    if len(sys.argv)>2 :
        run(sys.argv[1],sys.argv[2])
        logging.debug("%s copied." % sys.argv[1])
    else :
        print """ %s URL DIRECTORY [debug]

 HTML Sanitizing for Email Newsletters

 --- copies the URL and all referenced <img> files into one directory and changes the <img src> attributes
 --- checks all <a href> links for redirects (3xx) and changes the href attributes accordingly
 --- old attribute values are preserved in data-original-* attributes
 --- certain unnecessary HTML elements, whose XPaths are listed in the array called empty, are removed
 --- text is edited according to the replace array, which lists triples of (xpath, search, replace)
 ---
 --- If DIRECTORY contains the replacement string %%(hash)s, this part of the DIRECTORY will be
 --- replaced by an MD5 hash of the URL, and the program will exit if this DIRECTORY already exists.
 ---

 debug - if present - will lead to copious output.

 ToDo

 Copy CSS as well? Look for images in CSS?
 Allow "overwriting" of changed files when downloading the same URL twice

 """ % sys.argv[0]