# mdaniel/har_to_webscarab.py

## har_to_webscarab.py
import base64
import calendar
import json
import os
import re
import sys
import time
import urllib

class Conversation:
  """
  One HAR entry, saved as a numbered WebScarab conversation.

  write() appends a record to ./conversationlog and creates
  ./conversations/<num>-request and ./conversations/<num>-response
  (plus a ./urlinfo record when want_urlinfo is set).  Everything is
  written relative to the current working directory.  The session
  layout being imitated, with sample values:

  conversationlog:
    ### Conversation : 1
    RESPONSE_SIZE: 151
    WHEN: 1375942513708
    COOKIE: JIMBO=/
    METHOD: GET
    STATUS: 200 OK
    URL: http://atesis.local:80/
    ORIGIN: Proxy
  cookies:
    ### Cookie : atesis.local/ JIMBO
    1375942925153 JIMBO=/; Domain=atesis.local; Path=/

  urlinfo
    ### URL : http://atesis.local:80/
    METHODS: GET
    SIGNATURE: GET http://atesis.local:80/ (null)
    STATUS: 200 OK
    CHECKSUM: 5b6d74f1453e20c09d6a20d909779ad7

    ### URL : http://atesis.local:80/fred/
    REFERER: http://atesis.local:80/
  fragments/
  conversations/
    %d-request / %d-response
  """

  # startedDateTime: date 'T' time, optional fraction, optional zone
  _STARTED_RE = re.compile(
      r'^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})'
      r'(?:\.(\d+))?'
      r'(Z|UTC|GMT|[+-]\d{2}:?\d{2})?$')

  def __init__(self, num, entry):
    """
    num   -- zero-based index of the entry; conversations are numbered from 1
    entry -- one element of har['log']['entries']
    """
    self.num = 1 + num
    self.entry = entry
    self.req = entry['request']
    self.resp = entry['response']
    ## this is causing some kind of Scarab collision on load so just omit it
    self.want_urlinfo = False

  def write(self):
    """
    Write this conversation's log record and request/response files.

    Raises ValueError when startedDateTime cannot be parsed,
    AssertionError when postData disagrees with a known bodySize, and
    KeyError when a HAR field that is read unconditionally is missing.
    """
    # work out everything that can fail first, so that a bad entry does
    # not leave a half-written conversation behind
    request_body = self._request_body()
    response_body = self._response_body()
    when = self._started_millis()

    # do not rely on conversation 1 having been written first
    if not os.path.isdir('conversations'):
      os.mkdir('conversations')

    if self.want_urlinfo:
      self._write_urlinfo()
    self._write_log(when, response_body is not None)
    self._write_request(request_body)
    self._write_response(response_body)

  def _started_millis(self):
    """Return entry['startedDateTime'] as milliseconds since the epoch."""
    stamp = self.entry['startedDateTime']
    match = self._STARTED_RE.match(stamp)
    if match is None:
      raise ValueError('unparseable startedDateTime: %r' % (stamp,))
    base, fraction, zone = match.groups()
    seconds = calendar.timegm(time.strptime(base, '%Y-%m-%dT%H:%M:%S'))
    if zone and zone[0] in '+-':
      digits = zone[1:].replace(':', '')
      offset = int(digits[:2]) * 3600 + int(digits[2:]) * 60
      if zone[0] == '-':
        offset = -offset
      # the stamp is local time, i.e. UTC + offset
      seconds -= offset
    # it wants java millis, so keep the first three fractional digits
    millis = int((fraction or '0')[:3].ljust(3, '0'))
    return seconds * 1000 + millis

  def _request_body(self):
    """Return the request body as bytes, or None when there is no postData."""
    post_data = self.req.get('postData')
    if post_data is None:
      return None
    # postData may carry only 'params'; treat a missing text as empty
    body = (post_data.get('text') or '').encode('utf-8')
    # bodySize counts bytes and is -1 when the exporter did not know it
    declared = self.req.get('bodySize', -1)
    if declared >= 0 and declared != len(body):
      # raised explicitly (not via assert) so it survives python -O
      raise AssertionError(
          'postData.text[%d] != bodySize[%d] for %s' % (
              len(body), declared, str(self.entry)))
    return body

  def _response_body(self):
    """Return the response body as bytes, or None when the HAR has no text."""
    content = self.resp['content']
    if 'text' not in content:
      return None
    text = content['text']
    encoding = content.get('encoding')
    mime_type = content.get('mimeType') or ''
    # trust the HAR 'encoding' field; without one, fall back to the old
    # rule that image bodies are base64
    if encoding == 'base64' or (
        encoding is None and mime_type.lower().startswith('image/')):
      return base64.b64decode(text)
    return text.encode('utf-8')

  def _write_urlinfo(self):
    """Append this conversation's URL record to ./urlinfo."""
    with open('urlinfo', 'a') as urlinfo:
      urlinfo.write('### URL : %s\n' % self.req['url'])
      urlinfo.write('METHODS: %s\n' % self.req['method'])
      urlinfo.write('STATUS: %d %s\n' % (
          self.resp['status'], self.resp['statusText']))
      urlinfo.write('SIGNATURE: %s %s (null)\n' % (
          self.req['method'], self.req['url']))
      urlinfo.write('\n')

  def _write_log(self, when, have_content):
    """Append this conversation's record to ./conversationlog."""
    resp_size = self.resp['content']['size'] if have_content else 0
    with open('conversationlog', 'a') as c_log:
      c_log.write('### Conversation : %d\n' % self.num)
      c_log.write('RESPONSE_SIZE: %d\n' % resp_size)
      c_log.write('WHEN: %d\n' % when)
      c_log.write('METHOD: %s\n' % self.req['method'])
      c_log.write('STATUS: %d %s\n' % (
          self.resp['status'], self.resp['statusText']))
      c_log.write('URL: %s\n' % self.req['url'])
      c_log.write('\n')

  def _write_request(self, body):
    """Write conversations/<num>-request: request line, headers, body."""
    ## Scarab always thinks it is a proxy request, so the request line
    ## carries the full URL
    lines = ['%s %s %s\r\n' % (
        self.req['method'], self.req['url'], self.req['httpVersion'])]
    for h in self.req['headers']:
      lines.append('%s: %s\r\n' % (h['name'], h['value']))
    lines.append('\r\n')
    with open('conversations/%d-request' % self.num, 'wb') as fh:
      fh.write(''.join(lines).encode('utf-8'))
      if body is not None:
        # exactly the body bytes: no trailing newline
        fh.write(body)

  def _write_response(self, body):
    """Write conversations/<num>-response: status line, headers, body."""
    payload = b'' if body is None else body
    lines = ['%s %d %s\r\n' % (
        self.resp['httpVersion'], self.resp['status'],
        self.resp['statusText'])]
    for h in self.resp['headers']:
      h_name = h['name']
      h_val = h['value']
      lowered = h_name.lower()
      ## eat the C-E and T-E all the time, because har doesn't encode
      ## that way but Scarab will try to interpret them
      if lowered in ('content-encoding', 'transfer-encoding'):
        continue
      if lowered == 'content-length':
        # describe the bytes actually written: 0 when the HAR has no
        # body, the decoded length otherwise (the server's figure was
        # for the encoded body whose Content-Encoding we just dropped)
        h_val = str(len(payload))
      lines.append('%s: %s\r\n' % (h_name, h_val))
    lines.append('\r\n')
    with open('conversations/%d-response' % self.num, 'wb') as fh:
      fh.write(''.join(lines).encode('utf-8'))
      fh.write(payload)

def main( argv ):
  """
  Convert the HAR file named by argv[1] into WebScarab conversations.

  argv -- command-line style list: program name, then the HAR file path.

  Exits through sys.exit() with a usage message when the path is
  missing.  Each entry is handed to Conversation, whose write() puts
  its output in the current working directory.

  The parts of the HAR structure that matter here:

  log
    version : string
    creator
      name : string
      version : string
    pages : list
    entries : list
      request
        method : string
        url : string
        httpVersion : string
        headers : list
          name : string
          value : string
        queryString : list
        cookies : list
        bodySize : number
      response
        status : number
        statusText : string
        httpVersion : string
        headers : list
        cookies : list
        content
          size : number
          compression : number
        redirectURL : string
        headerSize : number
        bodySize : number
  """
  if len(argv) < 2:
    prog = argv[0] if argv else 'har_to_webscarab.py'
    sys.exit('usage: %s HAR_FILE' % prog)
  # honour the argument list we were given rather than sys.argv
  har_filename = argv[1]
  with open( har_filename, 'rb' ) as fh:
    har = json.load( fh )
  # "page" is a destination that the user saw
  # "entry" is something Chrome loaded
  for index, entry in enumerate(har['log']['entries']):
    Conversation(index, entry).write()

if '__main__' == __name__:
  # run the conversion only when executed as a script, not on import
  main(sys.argv)
	import calendar
	import json
	import os
	import re
	import sys
	import time
	import urllib

	class Conversation:
	  """
	  One HAR entry, saved as a numbered WebScarab conversation.

	  write() appends a record to ./conversationlog and creates
	  ./conversations/<num>-request and ./conversations/<num>-response
	  (plus a ./urlinfo record when want_urlinfo is set).  Everything is
	  written relative to the current working directory.  The session
	  layout being imitated, with sample values:

	  conversationlog:
	    ### Conversation : 1
	    RESPONSE_SIZE: 151
	    WHEN: 1375942513708
	    COOKIE: JIMBO=/
	    METHOD: GET
	    STATUS: 200 OK
	    URL: http://atesis.local:80/
	    ORIGIN: Proxy
	  cookies:
	    ### Cookie : atesis.local/ JIMBO
	    1375942925153 JIMBO=/; Domain=atesis.local; Path=/

	  urlinfo
	    ### URL : http://atesis.local:80/
	    METHODS: GET
	    SIGNATURE: GET http://atesis.local:80/ (null)
	    STATUS: 200 OK
	    CHECKSUM: 5b6d74f1453e20c09d6a20d909779ad7

	    ### URL : http://atesis.local:80/fred/
	    REFERER: http://atesis.local:80/
	  fragments/
	  conversations/
	    %d-request / %d-response
	  """

	  # startedDateTime: date 'T' time, optional fraction, optional zone
	  _STARTED_RE = re.compile(
	      r'^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})'
	      r'(?:\.(\d+))?'
	      r'(Z|UTC|GMT|[+-]\d{2}:?\d{2})?$')

	  def __init__(self, num, entry):
	    """
	    num   -- zero-based index of the entry; conversations are numbered from 1
	    entry -- one element of har['log']['entries']
	    """
	    self.num = 1 + num
	    self.entry = entry
	    self.req = entry['request']
	    self.resp = entry['response']
	    ## this is causing some kind of Scarab collision on load so just omit it
	    self.want_urlinfo = False

	  def write(self):
	    """
	    Write this conversation's log record and request/response files.

	    Raises ValueError when startedDateTime cannot be parsed,
	    AssertionError when postData disagrees with a known bodySize, and
	    KeyError when a HAR field that is read unconditionally is missing.
	    """
	    # work out everything that can fail first, so that a bad entry does
	    # not leave a half-written conversation behind
	    request_body = self._request_body()
	    response_body = self._response_body()
	    when = self._started_millis()

	    # do not rely on conversation 1 having been written first
	    if not os.path.isdir('conversations'):
	      os.mkdir('conversations')

	    if self.want_urlinfo:
	      self._write_urlinfo()
	    self._write_log(when, response_body is not None)
	    self._write_request(request_body)
	    self._write_response(response_body)

	  def _started_millis(self):
	    """Return entry['startedDateTime'] as milliseconds since the epoch."""
	    stamp = self.entry['startedDateTime']
	    match = self._STARTED_RE.match(stamp)
	    if match is None:
	      raise ValueError('unparseable startedDateTime: %r' % (stamp,))
	    base, fraction, zone = match.groups()
	    seconds = calendar.timegm(time.strptime(base, '%Y-%m-%dT%H:%M:%S'))
	    if zone and zone[0] in '+-':
	      digits = zone[1:].replace(':', '')
	      offset = int(digits[:2]) * 3600 + int(digits[2:]) * 60
	      if zone[0] == '-':
	        offset = -offset
	      # the stamp is local time, i.e. UTC + offset
	      seconds -= offset
	    # it wants java millis, so keep the first three fractional digits
	    millis = int((fraction or '0')[:3].ljust(3, '0'))
	    return seconds * 1000 + millis

	  def _request_body(self):
	    """Return the request body as bytes, or None when there is no postData."""
	    post_data = self.req.get('postData')
	    if post_data is None:
	      return None
	    # postData may carry only 'params'; treat a missing text as empty
	    body = (post_data.get('text') or '').encode('utf-8')
	    # bodySize counts bytes and is -1 when the exporter did not know it
	    declared = self.req.get('bodySize', -1)
	    if declared >= 0 and declared != len(body):
	      # raised explicitly (not via assert) so it survives python -O
	      raise AssertionError(
	          'postData.text[%d] != bodySize[%d] for %s' % (
	              len(body), declared, str(self.entry)))
	    return body

	  def _response_body(self):
	    """Return the response body as bytes, or None when the HAR has no text."""
	    content = self.resp['content']
	    if 'text' not in content:
	      return None
	    text = content['text']
	    encoding = content.get('encoding')
	    mime_type = content.get('mimeType') or ''
	    # trust the HAR 'encoding' field; without one, fall back to the old
	    # rule that image bodies are base64
	    if encoding == 'base64' or (
	        encoding is None and mime_type.lower().startswith('image/')):
	      return base64.b64decode(text)
	    return text.encode('utf-8')

	  def _write_urlinfo(self):
	    """Append this conversation's URL record to ./urlinfo."""
	    with open('urlinfo', 'a') as urlinfo:
	      urlinfo.write('### URL : %s\n' % self.req['url'])
	      urlinfo.write('METHODS: %s\n' % self.req['method'])
	      urlinfo.write('STATUS: %d %s\n' % (
	          self.resp['status'], self.resp['statusText']))
	      urlinfo.write('SIGNATURE: %s %s (null)\n' % (
	          self.req['method'], self.req['url']))
	      urlinfo.write('\n')

	  def _write_log(self, when, have_content):
	    """Append this conversation's record to ./conversationlog."""
	    resp_size = self.resp['content']['size'] if have_content else 0
	    with open('conversationlog', 'a') as c_log:
	      c_log.write('### Conversation : %d\n' % self.num)
	      c_log.write('RESPONSE_SIZE: %d\n' % resp_size)
	      c_log.write('WHEN: %d\n' % when)
	      c_log.write('METHOD: %s\n' % self.req['method'])
	      c_log.write('STATUS: %d %s\n' % (
	          self.resp['status'], self.resp['statusText']))
	      c_log.write('URL: %s\n' % self.req['url'])
	      c_log.write('\n')

	  def _write_request(self, body):
	    """Write conversations/<num>-request: request line, headers, body."""
	    ## Scarab always thinks it is a proxy request, so the request line
	    ## carries the full URL
	    lines = ['%s %s %s\r\n' % (
	        self.req['method'], self.req['url'], self.req['httpVersion'])]
	    for h in self.req['headers']:
	      lines.append('%s: %s\r\n' % (h['name'], h['value']))
	    lines.append('\r\n')
	    with open('conversations/%d-request' % self.num, 'wb') as fh:
	      fh.write(''.join(lines).encode('utf-8'))
	      if body is not None:
	        # exactly the body bytes: no trailing newline
	        fh.write(body)

	  def _write_response(self, body):
	    """Write conversations/<num>-response: status line, headers, body."""
	    payload = b'' if body is None else body
	    lines = ['%s %d %s\r\n' % (
	        self.resp['httpVersion'], self.resp['status'],
	        self.resp['statusText'])]
	    for h in self.resp['headers']:
	      h_name = h['name']
	      h_val = h['value']
	      lowered = h_name.lower()
	      ## eat the C-E and T-E all the time, because har doesn't encode
	      ## that way but Scarab will try to interpret them
	      if lowered in ('content-encoding', 'transfer-encoding'):
	        continue
	      if lowered == 'content-length':
	        # describe the bytes actually written: 0 when the HAR has no
	        # body, the decoded length otherwise (the server's figure was
	        # for the encoded body whose Content-Encoding we just dropped)
	        h_val = str(len(payload))
	      lines.append('%s: %s\r\n' % (h_name, h_val))
	    lines.append('\r\n')
	    with open('conversations/%d-response' % self.num, 'wb') as fh:
	      fh.write(''.join(lines).encode('utf-8'))
	      fh.write(payload)

	def main( argv ):
	  """
	  Convert the HAR file named by argv[1] into WebScarab conversations.

	  argv -- command-line style list: program name, then the HAR file path.

	  Exits through sys.exit() with a usage message when the path is
	  missing.  Each entry is handed to Conversation, whose write() puts
	  its output in the current working directory.

	  The parts of the HAR structure that matter here:

	  log
	    version : string
	    creator
	      name : string
	      version : string
	    pages : list
	    entries : list
	      request
	        method : string
	        url : string
	        httpVersion : string
	        headers : list
	          name : string
	          value : string
	        queryString : list
	        cookies : list
	        bodySize : number
	      response
	        status : number
	        statusText : string
	        httpVersion : string
	        headers : list
	        cookies : list
	        content
	          size : number
	          compression : number
	        redirectURL : string
	        headerSize : number
	        bodySize : number
	  """
	  if len(argv) < 2:
	    prog = argv[0] if argv else 'har_to_webscarab.py'
	    sys.exit('usage: %s HAR_FILE' % prog)
	  # honour the argument list we were given rather than sys.argv
	  har_filename = argv[1]
	  with open( har_filename, 'rb' ) as fh:
	    har = json.load( fh )
	  # "page" is a destination that the user saw
	  # "entry" is something Chrome loaded
	  for index, entry in enumerate(har['log']['entries']):
	    Conversation(index, entry).write()

	if __name__ == '__main__':
	  # run the conversion only when executed as a script, not on import
	  main( sys.argv )