# mems/har2maff.py

## har2maff.py
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bs4 import BeautifulSoup
import hashlib
import logging
import json
import os
import sys
import re
import time
import urlparse
import zipfile

# Template for the ``index.rdf`` metadata file stored in the archive.  It is
# filled in with the ``%`` operator from a mapping that provides the keys
# ``url``, ``title`` and ``time``.
INDEX_RDF = '\n'.join((
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<RDF:RDF xmlns:MAF="http://maf.mozdev.org/metadata/rdf#"',
    '         xmlns:NC="http://home.netscape.com/NC-rdf#"',
    '         xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">',
    '  <RDF:Description RDF:about="urn:root">',
    '    <MAF:originalurl RDF:resource="%(url)s"/>',
    '    <MAF:title RDF:resource="%(title)s"/>',
    '    <MAF:archivetime RDF:resource="%(time)s"/>',
    '    <MAF:indexfilename RDF:resource="index.html"/>',
    '    <MAF:charset RDF:resource="UTF-8"/>',
    '  </RDF:Description>',
    '</RDF:RDF>',
    '',  # keeps the trailing newline of the template
))


# (substring of a HAR ``mimeType``, extension of the archived file); rows are
# checked in order and the first hit wins.
_MEDIA_TYPE_EXTENSIONS = (
    ('image/gif', '.gif'),
    ('image/jpeg', '.jpeg'),
    ('image/png', '.png'),
    ('/javascript', '.js'),
    ('text/css', '.css'),
)

# How each kind of link in the index page is located and named:
# (CSS selector, attribute holding the URL, label for the log, extension).
# An extension of None means "derive it from the HAR media type".
_INDEX_LINKS = (
    ('link[rel=stylesheet]', 'href', 'css href', '.css'),
    ('script[src]', 'src', 'js src', '.js'),
    ('img[src]', 'src', 'img src', None),
)


def _extension_for(media_type):
    """Return the archive file extension for ``media_type`` (``''`` if unknown)."""
    for marker, extension in _MEDIA_TYPE_EXTENSIONS:
        if marker in (media_type or ''):
            return extension
    return ''


def _md5_hex(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def _link_pattern(link):
    """Compile a regex matching ``link`` as it may be spelled in the markup.

    BS does not tell us whether the attribute was written with ``&`` or
    ``&amp;``, so every ``&`` in ``link`` is allowed to match either form.
    The pattern is assembled around the ``&`` characters instead of patching
    the output of ``re.escape``, whose treatment of ``&`` differs between
    Python versions.
    """
    escaped_parts = [re.escape(part) for part in link.split('&')]
    return re.compile('&(?:amp;)?'.join(escaped_parts))


def _without_port(url):
    """Parse ``url`` and drop a trailing ``:port`` from its network location."""
    parts = urlparse.urlparse(url)
    # noinspection PyProtectedMember
    return parts._replace(netloc=re.sub(r':\d+$', '', parts.netloc))


def _claim_har_entry(to_url, entries, mime_types, log):
    """Find the HAR entry for ``to_url``, ignoring port numbers.

    Turns out, HAR does not store the **accurate** URL.  For example,
    ``https://example.com:443/`` is stored in the har as
    ``https://example.com/``.  When the stored URL differs from ``to_url``
    the matching ``entries`` are rewritten **in place** to use ``to_url``
    and ``mime_types`` gains a ``to_url`` key.

    :param unicode to_url: the URL used in the document
    :param list entries: the HAR ``entries``; may be modified
    :param dict mime_types: request URL => media type; may be modified
    :param logging.Logger log: where matches and rewrites are reported
    :return: the URL as stored in the HAR, or ``None`` if it is not there
    :rtype: unicode | None
    """
    if to_url in mime_types:
        return to_url
    wanted = _without_port(to_url)
    # Iterate over a copy of the keys: a key is added below.
    for har_url in list(mime_types):
        if _without_port(har_url) != wanted:
            continue
        log.debug('matched "%s" and "%s" ...', to_url, har_url)
        mime_types[to_url] = mime_types[har_url]
        for entry in entries:
            if har_url == entry['request']['url']:
                entry['request']['url'] = to_url
                log.warning('Replaced "%s" with "%s" because HAR was wrong',
                            har_url, to_url)
        return har_url
    return None


def _localize_index_html(html, entries, mime_types, log):
    """Point the stylesheet, script and image links of ``html`` at the archive.

    ``entries`` and ``mime_types`` are updated through
    :py:func:`_claim_har_entry` so that later entries are stored under the
    URL spelling used in the document.

    :param unicode html: body of the start page
    :param list entries: the HAR ``entries``; may be modified
    :param dict mime_types: request URL => media type; may be modified
    :param logging.Logger log: where progress is reported
    :return: ``(html, title, local_names)``: the rewritten markup, the text
        of the first ``<title>`` (``None`` when there is none) and a dict
        mapping each linked URL found in the HAR to its archive file name
    :rtype: tuple
    """
    soup = BeautifulSoup(html)
    title_tag = soup.find('title')
    if title_tag is None:
        log.warning('Start page has no <title>')
        title = None
    else:
        title = title_tag.text

    local_names = {}
    for selector, attr, label, fixed_extension in _INDEX_LINKS:
        for tag in soup.select(selector):
            link = tag.attrs.get(attr)
            if not link:
                # An empty link would compile to an empty pattern, which
                # matches between every pair of characters in the page.
                continue
            har_url = _claim_har_entry(link, entries, mime_types, log)
            if fixed_extension is not None:
                extension = fixed_extension
            elif har_url is None:
                ## turns out, the .har doesn't capture *every* <img>
                log.debug('Skipping non-HAR img.src "%s"', link)
                continue
            else:
                media_type = mime_types[har_url]
                extension = _extension_for(media_type)
                if not extension:
                    log.error('Unrecognized img media type: %s for %s',
                              media_type, link)
            local_name = _md5_hex(link) + extension
            if har_url is not None:
                # Remember the name so the entry is stored under exactly the
                # name the page now refers to.
                local_names[link] = local_name
            log.debug('replacing %s %s => %s', label, link, local_name)
            html = _link_pattern(link).sub(local_name, html)
    return html, title, local_names


def main(argv):
    """Convert the HAR capture named by ``argv[1]`` into a MAFF archive.

    The archive is written next to the input, with a trailing ``.har``
    replaced by ``.maff``; ``.maff`` is appended when the input has another
    suffix, so the input is never overwritten.  Nothing is created on disk
    until the HAR has passed validation.

    Only ``GET`` entries that carry a response ``text`` are archived.  The
    entry whose URL equals the first page's ``title`` is stored as
    ``index.html`` with its stylesheet, script and image links rewritten;
    every other entry is stored under the MD5 of its URL.

    :param list argv: the command line; ``argv[0]`` is the program name
    :return: ``0`` on success, ``1`` for an unsupported or page-less HAR,
        ``2`` when no input file was given
    :rtype: int
    """
    # Local imports: only this function needs these two names.
    from calendar import timegm
    from xml.sax.saxutils import escape

    log = logging.getLogger('har_maff')
    logging.basicConfig(level=logging.INFO)

    if len(argv) < 2:
        log.error('Usage: %s FILE.har', argv[0] if argv else 'har2maff.py')
        return 2
    filename = argv[1]

    with open(filename, 'rb') as fh:
        har = json.loads(fh.read().decode('utf-8'))
    har_log = har['log']
    har_version = har_log['version']
    if '1.2' != har_version:
        log.error('I only support version 1.2, not %s', har_version)
        return 1
    har_pages = har_log.get('pages')
    if not har_pages:
        log.error('Har has no "pages", that is fatal')
        return 1
    page0 = har_pages[0]

    ## this is in full ISO8601, including millis which
    ## python 2.7 does not support
    started_dt = re.sub(r'\.\d+Z$', 'Z', page0['startedDateTime'])
    save_time = time.strptime(started_dt, '%Y-%m-%dT%H:%M:%SZ')
    rdf_time = time.strftime('%a, %d %b %Y %H:%M:%S -0000', save_time)
    # save_time is UTC, so it is converted with timegm(); mktime() would
    # interpret it as local time.
    out_dir = '%d' % timegm(save_time)
    start_page = page0['title']  # yup, "title"
    entries = har_log['entries']

    #: :type: dict[unicode, unicode]
    mime_types = {}
    for en in entries:
        mime_types[en['request']['url']] = en['response']['content']['mimeType']

    directory, base_name = os.path.split(filename)
    maff_name, suffix_swapped = re.subn(r'\.har$', '.maff', base_name)
    if not suffix_swapped:
        # Without this the output path would equal the input path.
        maff_name = base_name + '.maff'
    maff_fn = os.path.join(directory, maff_name)
    print('Saving to %s' % maff_fn)

    page_title = None
    # URL => archive file name, as chosen while rewriting the index page.
    local_names = {}
    with zipfile.ZipFile(maff_fn, mode='w') as zf:
        for en in entries:
            req = en['request']
            # Read the URL now: rewriting the index page may have changed it.
            req_url = req['url']
            contents = en['response']['content']

            if 'GET' != req['method']:
                log.warning('Skipping non-GET url: %s "%s"',
                            req['method'], req_url)
                continue
            if 'text' not in contents:
                continue
            #: :type: unicode
            text_content = contents['text']

            if start_page == req_url:
                out_fn = 'index.html'
                text_content, title, names = _localize_index_html(
                    text_content, entries, mime_types, log)
                local_names.update(names)
                if title:
                    page_title = title
            else:
                out_fn = local_names.get(req_url)
                if out_fn is None:
                    out_fn = (_md5_hex(req_url) +
                              _extension_for(mime_types[req_url]))

            encoding = contents.get('encoding')
            if 'base64' == encoding:
                the_bytes = text_content.decode('base64')
            elif encoding is None:
                the_bytes = text_content.encode('utf-8')
            else:
                log.error('Unrecognized response encoding: %s', encoding)
                the_bytes = b''
            log.debug('URL:"%s" => "%s"', req_url, out_fn)
            zf.writestr(os.path.join(out_dir, out_fn), the_bytes)

        if not page_title:
            # No usable <title> was found; fall back to the page URL.
            page_title = start_page
        # Both values land inside double-quoted XML attributes.
        quote = {'"': '&quot;'}
        rdf = INDEX_RDF % {
            'url': escape(start_page, quote),
            'title': escape(page_title, quote),
            'time': rdf_time,
        }
        zf.writestr(os.path.join(out_dir, 'index.rdf'), rdf.encode('utf-8'))
    return 0

# Script entry point: run the converter on the command line and hand its
# return value to the shell as the exit status.
if '__main__' == __name__:
    exit_status = main(sys.argv)
    sys.exit(exit_status)
	#! /usr/bin/env python2.7
	# -*- coding: utf-8 -*-
	from __future__ import print_function, unicode_literals
	from bs4 import BeautifulSoup
	import hashlib
	import logging
	import json
	import os
	import sys
	import re
	import time
	import urlparse
	import zipfile

	# Template for the ``index.rdf`` metadata file stored in the archive.  It is
	# filled in with the ``%`` operator from a mapping that provides the keys
	# ``url``, ``title`` and ``time``.  Built line by line so that the
	# indentation of this code never ends up inside the emitted XML.
	INDEX_RDF = '\n'.join((
	    '<?xml version="1.0" encoding="UTF-8"?>',
	    '<RDF:RDF xmlns:MAF="http://maf.mozdev.org/metadata/rdf#"',
	    '         xmlns:NC="http://home.netscape.com/NC-rdf#"',
	    '         xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">',
	    '  <RDF:Description RDF:about="urn:root">',
	    '    <MAF:originalurl RDF:resource="%(url)s"/>',
	    '    <MAF:title RDF:resource="%(title)s"/>',
	    '    <MAF:archivetime RDF:resource="%(time)s"/>',
	    '    <MAF:indexfilename RDF:resource="index.html"/>',
	    '    <MAF:charset RDF:resource="UTF-8"/>',
	    '  </RDF:Description>',
	    '</RDF:RDF>',
	    '',  # keeps the trailing newline of the template
	))


	# (substring of a HAR ``mimeType``, extension of the archived file); rows are
	# checked in order and the first hit wins.
	_MEDIA_TYPE_EXTENSIONS = (
	    ('image/gif', '.gif'),
	    ('image/jpeg', '.jpeg'),
	    ('image/png', '.png'),
	    ('/javascript', '.js'),
	    ('text/css', '.css'),
	)

	# How each kind of link in the index page is located and named:
	# (CSS selector, attribute holding the URL, label for the log, extension).
	# An extension of None means "derive it from the HAR media type".
	_INDEX_LINKS = (
	    ('link[rel=stylesheet]', 'href', 'css href', '.css'),
	    ('script[src]', 'src', 'js src', '.js'),
	    ('img[src]', 'src', 'img src', None),
	)


	def _extension_for(media_type):
	    """Return the archive file extension for ``media_type`` (``''`` if unknown)."""
	    for marker, extension in _MEDIA_TYPE_EXTENSIONS:
	        if marker in (media_type or ''):
	            return extension
	    return ''


	def _md5_hex(text):
	    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
	    return hashlib.md5(text.encode('utf-8')).hexdigest()


	def _link_pattern(link):
	    """Compile a regex matching ``link`` as it may be spelled in the markup.

	    BS does not tell us whether the attribute was written with ``&`` or
	    ``&amp;``, so every ``&`` in ``link`` is allowed to match either form.
	    The pattern is assembled around the ``&`` characters instead of patching
	    the output of ``re.escape``, whose treatment of ``&`` differs between
	    Python versions.
	    """
	    escaped_parts = [re.escape(part) for part in link.split('&')]
	    return re.compile('&(?:amp;)?'.join(escaped_parts))


	def _without_port(url):
	    """Parse ``url`` and drop a trailing ``:port`` from its network location."""
	    parts = urlparse.urlparse(url)
	    # noinspection PyProtectedMember
	    return parts._replace(netloc=re.sub(r':\d+$', '', parts.netloc))


	def _claim_har_entry(to_url, entries, mime_types, log):
	    """Find the HAR entry for ``to_url``, ignoring port numbers.

	    Turns out, HAR does not store the **accurate** URL.  For example,
	    ``https://example.com:443/`` is stored in the har as
	    ``https://example.com/``.  When the stored URL differs from ``to_url``
	    the matching ``entries`` are rewritten **in place** to use ``to_url``
	    and ``mime_types`` gains a ``to_url`` key.

	    :param unicode to_url: the URL used in the document
	    :param list entries: the HAR ``entries``; may be modified
	    :param dict mime_types: request URL => media type; may be modified
	    :param logging.Logger log: where matches and rewrites are reported
	    :return: the URL as stored in the HAR, or ``None`` if it is not there
	    :rtype: unicode | None
	    """
	    if to_url in mime_types:
	        return to_url
	    wanted = _without_port(to_url)
	    # Iterate over a copy of the keys: a key is added below.
	    for har_url in list(mime_types):
	        if _without_port(har_url) != wanted:
	            continue
	        log.debug('matched "%s" and "%s" ...', to_url, har_url)
	        mime_types[to_url] = mime_types[har_url]
	        for entry in entries:
	            if har_url == entry['request']['url']:
	                entry['request']['url'] = to_url
	                log.warning('Replaced "%s" with "%s" because HAR was wrong',
	                            har_url, to_url)
	        return har_url
	    return None


	def _localize_index_html(html, entries, mime_types, log):
	    """Point the stylesheet, script and image links of ``html`` at the archive.

	    ``entries`` and ``mime_types`` are updated through
	    :py:func:`_claim_har_entry` so that later entries are stored under the
	    URL spelling used in the document.

	    :param unicode html: body of the start page
	    :param list entries: the HAR ``entries``; may be modified
	    :param dict mime_types: request URL => media type; may be modified
	    :param logging.Logger log: where progress is reported
	    :return: ``(html, title, local_names)``: the rewritten markup, the text
	        of the first ``<title>`` (``None`` when there is none) and a dict
	        mapping each linked URL found in the HAR to its archive file name
	    :rtype: tuple
	    """
	    soup = BeautifulSoup(html)
	    title_tag = soup.find('title')
	    if title_tag is None:
	        log.warning('Start page has no <title>')
	        title = None
	    else:
	        title = title_tag.text

	    local_names = {}
	    for selector, attr, label, fixed_extension in _INDEX_LINKS:
	        for tag in soup.select(selector):
	            link = tag.attrs.get(attr)
	            if not link:
	                # An empty link would compile to an empty pattern, which
	                # matches between every pair of characters in the page.
	                continue
	            har_url = _claim_har_entry(link, entries, mime_types, log)
	            if fixed_extension is not None:
	                extension = fixed_extension
	            elif har_url is None:
	                ## turns out, the .har doesn't capture *every* <img>
	                log.debug('Skipping non-HAR img.src "%s"', link)
	                continue
	            else:
	                media_type = mime_types[har_url]
	                extension = _extension_for(media_type)
	                if not extension:
	                    log.error('Unrecognized img media type: %s for %s',
	                              media_type, link)
	            local_name = _md5_hex(link) + extension
	            if har_url is not None:
	                # Remember the name so the entry is stored under exactly the
	                # name the page now refers to.
	                local_names[link] = local_name
	            log.debug('replacing %s %s => %s', label, link, local_name)
	            html = _link_pattern(link).sub(local_name, html)
	    return html, title, local_names


	def main(argv):
	    """Convert the HAR capture named by ``argv[1]`` into a MAFF archive.

	    The archive is written next to the input, with a trailing ``.har``
	    replaced by ``.maff``; ``.maff`` is appended when the input has another
	    suffix, so the input is never overwritten.  Nothing is created on disk
	    until the HAR has passed validation.

	    Only ``GET`` entries that carry a response ``text`` are archived.  The
	    entry whose URL equals the first page's ``title`` is stored as
	    ``index.html`` with its stylesheet, script and image links rewritten;
	    every other entry is stored under the MD5 of its URL.

	    :param list argv: the command line; ``argv[0]`` is the program name
	    :return: ``0`` on success, ``1`` for an unsupported or page-less HAR,
	        ``2`` when no input file was given
	    :rtype: int
	    """
	    # Local imports: only this function needs these two names.
	    from calendar import timegm
	    from xml.sax.saxutils import escape

	    log = logging.getLogger('har_maff')
	    logging.basicConfig(level=logging.INFO)

	    if len(argv) < 2:
	        log.error('Usage: %s FILE.har', argv[0] if argv else 'har2maff.py')
	        return 2
	    filename = argv[1]

	    with open(filename, 'rb') as fh:
	        har = json.loads(fh.read().decode('utf-8'))
	    har_log = har['log']
	    har_version = har_log['version']
	    if '1.2' != har_version:
	        log.error('I only support version 1.2, not %s', har_version)
	        return 1
	    har_pages = har_log.get('pages')
	    if not har_pages:
	        log.error('Har has no "pages", that is fatal')
	        return 1
	    page0 = har_pages[0]

	    ## this is in full ISO8601, including millis which
	    ## python 2.7 does not support
	    started_dt = re.sub(r'\.\d+Z$', 'Z', page0['startedDateTime'])
	    save_time = time.strptime(started_dt, '%Y-%m-%dT%H:%M:%SZ')
	    rdf_time = time.strftime('%a, %d %b %Y %H:%M:%S -0000', save_time)
	    # save_time is UTC, so it is converted with timegm(); mktime() would
	    # interpret it as local time.
	    out_dir = '%d' % timegm(save_time)
	    start_page = page0['title']  # yup, "title"
	    entries = har_log['entries']

	    #: :type: dict[unicode, unicode]
	    mime_types = {}
	    for en in entries:
	        mime_types[en['request']['url']] = en['response']['content']['mimeType']

	    directory, base_name = os.path.split(filename)
	    maff_name, suffix_swapped = re.subn(r'\.har$', '.maff', base_name)
	    if not suffix_swapped:
	        # Without this the output path would equal the input path.
	        maff_name = base_name + '.maff'
	    maff_fn = os.path.join(directory, maff_name)
	    print('Saving to %s' % maff_fn)

	    page_title = None
	    # URL => archive file name, as chosen while rewriting the index page.
	    local_names = {}
	    with zipfile.ZipFile(maff_fn, mode='w') as zf:
	        for en in entries:
	            req = en['request']
	            # Read the URL now: rewriting the index page may have changed it.
	            req_url = req['url']
	            contents = en['response']['content']

	            if 'GET' != req['method']:
	                log.warning('Skipping non-GET url: %s "%s"',
	                            req['method'], req_url)
	                continue
	            if 'text' not in contents:
	                continue
	            #: :type: unicode
	            text_content = contents['text']

	            if start_page == req_url:
	                out_fn = 'index.html'
	                text_content, title, names = _localize_index_html(
	                    text_content, entries, mime_types, log)
	                local_names.update(names)
	                if title:
	                    page_title = title
	            else:
	                out_fn = local_names.get(req_url)
	                if out_fn is None:
	                    out_fn = (_md5_hex(req_url) +
	                              _extension_for(mime_types[req_url]))

	            encoding = contents.get('encoding')
	            if 'base64' == encoding:
	                the_bytes = text_content.decode('base64')
	            elif encoding is None:
	                the_bytes = text_content.encode('utf-8')
	            else:
	                log.error('Unrecognized response encoding: %s', encoding)
	                the_bytes = b''
	            log.debug('URL:"%s" => "%s"', req_url, out_fn)
	            zf.writestr(os.path.join(out_dir, out_fn), the_bytes)

	        if not page_title:
	            # No usable <title> was found; fall back to the page URL.
	            page_title = start_page
	        # Both values land inside double-quoted XML attributes.
	        quote = {'"': '&quot;'}
	        rdf = INDEX_RDF % {
	            'url': escape(start_page, quote),
	            'title': escape(page_title, quote),
	            'time': rdf_time,
	        }
	        zf.writestr(os.path.join(out_dir, 'index.rdf'), rdf.encode('utf-8'))
	    return 0

	# Script entry point: run the converter on the command line and hand its
	# return value to the shell as the exit status.
	if __name__ == '__main__':
	    sys.exit(main(sys.argv))