@oliversinden
Created October 9, 2014 14:22
Webtrends API extraction
import csv
import datetime
import gzip
import StringIO
from datetime import date

from lxml import etree
from mechanize import Browser

extractingtoday = False
oneday = datetime.timedelta(days=1)
d = date(2014, 10, 8)  # first day to extract

# Authenticate against the Webtrends REST endpoint and mimic a regular browser.
br = Browser()
br.add_password("https://ws.webtrends.com", "[username]", "[password]")
br.addheaders = [
    ('Accept-Encoding', 'gzip'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'),
    ('Cache-Control', 'max-age=0'),
]

# Walk forward one day at a time until we reach today.
while not extractingtoday:
    print d
    dt = d.timetuple()
    extractdate = str(dt[0]) + "m" + str(dt[1]) + "d" + str(dt[2])  # e.g. "2014m10d8"
    querystring = ("[querystring]" + extractdate + "&end_period=" + extractdate +
                   "&period_type=agg&measures=0*1&format=html&suppress_error_codes=true")

    # Retry the request until it succeeds.
    query = None
    while query is None:
        try:
            query = br.open(querystring)
        except Exception:
            pass

    # The response is gzip-compressed; decompress it before parsing.
    compressedstream = StringIO.StringIO(query.read())
    gzipper = gzip.GzipFile(fileobj=compressedstream)
    unzippeddata = gzipper.read()

    # Parse the HTML report, recovering from any malformed markup.
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(unzippeddata, parser=parser)

    # Skip the first two (header) rows, then collect the cells of every data row.
    output_data = []
    rowcount = 0
    for elem_row in tree[1].findall('tr'):
        if rowcount > 1:
            row = []
            for elem_cell in elem_row.findall('td'):
                if elem_cell.text is None:
                    row.append(elem_cell.text)
                else:
                    row.append(elem_cell.text.encode('utf-8'))
            output_data.append(row)
        rowcount = rowcount + 1

    # Write one CSV per day, named after the extract date.
    with open(extractdate + '.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        for t in output_data:
            csvwriter.writerow(t)

    # Move on to the next day; stop once we reach today.
    d = d + oneday
    if d == date.today():
        extractingtoday = True
    # time.sleep(40)
# Notes (from the Beautful Soup documentation) on navigating a parse tree:
# soup.p jumps to the first <p> tag inside a document, wherever it is, and
# soup.table.tr.td jumps to the first column of the first row of the first table
# in the document. These members alias to the first() method; the alias makes it
# easy to zoom in on an interesting part of a well-known parse tree.
# An alternate form of the idiom lets you access the first <foo> tag as .fooTag
# instead of .foo: soup.table.tr.td can also be written soup.tableTag.trTag.tdTag,
# or even soup.tableTag.tr.tdTag. This is useful if you want to be explicit about
# what you are doing, or if you are parsing XML whose tag names conflict with the
# names of Beautiful Soup methods and members.
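As a quick illustration of those notes, here is a minimal sketch of the attribute-access idiom, assuming Beautiful Soup 3 (the version the notes describe) is installed. The HTML snippet is made up for illustration and is not part of the Webtrends extract above.

from BeautifulSoup import BeautifulSoup  # Beautiful Soup 3 import path

# A made-up document, just to demonstrate the shortcuts described in the notes.
soup = BeautifulSoup("<table><tr><td>cell one</td><td>cell two</td></tr></table>")

print soup.table.tr.td.string      # "cell one": first <td> of first <tr> of first <table>
print soup.tableTag.trTag.tdTag    # same element, via the explicit .fooTag aliases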