@oliversinden
Created October 9, 2014 14:22
Webtrends API extraction
import csv
import datetime
import gzip
import StringIO
from datetime import date

from lxml import etree
from mechanize import Browser

extractingtoday = False
oneday = datetime.timedelta(days=1)
d = date(2014, 10, 8)  # first day to extract

# Authenticate against the Webtrends REST endpoint and mimic a regular browser.
br = Browser()
br.add_password("https://ws.webtrends.com", "[username]", "[password]")
br.addheaders = [
    ('Accept-Encoding', 'gzip'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'),
    ('Cache-Control', 'max-age=0'),
]

# Walk forward one day at a time until we reach today.
while not extractingtoday:
    print d
    dt = d.timetuple()
    extractdate = str(dt[0]) + "m" + str(dt[1]) + "d" + str(dt[2])  # e.g. "2014m10d8"
    querystring = ("[querystring]" + extractdate + "&end_period=" + extractdate +
                   "&period_type=agg&measures=0*1&format=html&suppress_error_codes=true")

    # Retry the request until it succeeds.
    query = None
    while query is None:
        try:
            query = br.open(querystring)
        except Exception:
            pass

    # The response is gzip-compressed; decompress it before parsing.
    compressedstream = StringIO.StringIO(query.read())
    gzipper = gzip.GzipFile(fileobj=compressedstream)
    unzippeddata = gzipper.read()

    # Parse the HTML report, recovering from any malformed markup.
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(unzippeddata, parser=parser)

    # Skip the first two (header) rows, then collect the cells of every data row.
    output_data = []
    rowcount = 0
    for elem_row in tree[1].findall('tr'):
        if rowcount > 1:
            row = []
            for elem_cell in elem_row.findall('td'):
                if elem_cell.text is None:
                    row.append(elem_cell.text)
                else:
                    row.append(elem_cell.text.encode('utf-8'))
            output_data.append(row)
        rowcount = rowcount + 1

    # Write one CSV per day, named after the extract date.
    with open(extractdate + '.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        for t in output_data:
            csvwriter.writerow(t)

    # Move on to the next day; stop once we reach today.
    d = d + oneday
    if d == date.today():
        extractingtoday = True
    # time.sleep(40)
# Notes (from the Beautful Soup documentation) on navigating a parse tree:
# soup.p jumps to the first <p> tag inside a document, wherever it is, and
# soup.table.tr.td jumps to the first column of the first row of the first table
# in the document. These members alias to the first() method; the alias makes it
# easy to zoom in on an interesting part of a well-known parse tree.
# An alternate form of the idiom lets you access the first <foo> tag as .fooTag
# instead of .foo: soup.table.tr.td can also be written soup.tableTag.trTag.tdTag,
# or even soup.tableTag.tr.tdTag. This is useful if you want to be explicit about
# what you are doing, or if you are parsing XML whose tag names conflict with the
# names of Beautiful Soup methods and members.
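As a quick illustration of those notes, here is a minimal sketch of the attribute-access idiom, assuming Beautiful Soup 3 (the version the notes describe) is installed. The HTML snippet is made up for illustration and is not part of the Webtrends extract above.

from BeautifulSoup import BeautifulSoup  # Beautiful Soup 3 import path

# A made-up document, just to demonstrate the shortcuts described in the notes.
soup = BeautifulSoup("<table><tr><td>cell one</td><td>cell two</td></tr></table>")

print soup.table.tr.td.string      # "cell one": first <td> of first <tr> of first <table>
print soup.tableTag.trTag.tdTag    # same element, via the explicit .fooTag aliases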