philz (owner)

Revisions

gist: 131367 Download_button fork
public
Public Clone URL: git://gist.github.com/131367.git
Embed All Files: show embed
Text only #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
#
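# Python code for http://www.cloudera.com/blog/2009/06/17/analyzing-apache-logs-with-piganalyzing-apache-logs-with-pig/
#
# Reads by_country.tsv and by_state.tsv from the current directory and prints
# HTML <img> tags whose URLs are Google Chart API maps of hits and bytes.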
 
import sys
import math
 
def rescale(values, low=0, high=4095):
  """Linearly rescale values onto the closed interval [low, high].

  The smallest input maps to low and the largest to high; every other value
  is placed proportionally in between.

  Args:
    values: iterable of numbers. It is copied into a list first, so
      single-pass iterators are accepted as well as sequences.
    low: output assigned to the minimum input.
    high: output assigned to the maximum input.

  Returns:
    A list of floats, one per input, in the original order. An empty input
    yields an empty list. When every input is equal there is no range to
    stretch, so each value maps to low.
  """
  # Materialize once: max() followed by min() would exhaust an iterator.
  values = list(values)
  if not values:
    return []
  minval = min(values)
  span = max(values) - minval
  if span == 0:
    # Avoid the division by zero the scale computation would hit.
    return [float(low)] * len(values)
  scale = float(high - low) / float(span)
  return [low + scale * (x - minval) for x in values]
 
def encode(i):
  """Encode a number as two characters of Google Chart "extended encoding".

  See http://code.google.com/apis/chart/formats.html#extended

  Args:
    i: number in the range [0, 4095]. It is truncated with int() before
      encoding, so float inputs are accepted.

  Returns:
    A two-character string: the symbol for i // 64 followed by the symbol
    for i % 64, both taken from a 64-character alphabet.

  Raises:
    ValueError: if the truncated value is outside [0, 4095].
  """
  code = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-."
  base = len(code)
  i = int(i)
  # Explicit check instead of assert so validation survives "python -O".
  if not 0 <= i < base * base:
    raise ValueError("extended encoding needs 0 <= value < %d, got %r" % (base * base, i))
  # divmod uses floor division, so the indexes stay integers on Python 2 and 3.
  high_digit, low_digit = divmod(i, base)
  return code[high_digit] + code[low_digit]
 
def build_map_url(keys, values, area="world"):
  """Return a Google Chart API URL for a shaded map.

  Args:
    keys: country or state codes within area; they are concatenated with no
      separator.
    values: one value per key; each is passed through encode() and the
      results are concatenated in the same order.
    area: map region placed in the chtm parameter.

  Returns:
    The chart URL as a string.
  """
  # See http://code.google.com/apis/chart/types.html#maps
  prefix = "http://chart.apis.google.com/chart?cht=t&chs=440x220&chco=FFFFFF,FFFFFF,148BCF"
  region_codes = "".join(keys)
  encoded_data = "".join([encode(value) for value in values])
  query = "&chtm=%s&chld=%s&chd=e:%s" % (area, region_codes, encoded_data)
  return prefix + query
 
def print_url(header, url):
  """Print a labelled HTML image tag for url to standard output.

  The emitted line has the form "<header>: <img src='<url>'><br>".

  Args:
    header: label text placed before the image tag.
    url: value inserted verbatim into the img src attribute.
  """
  # A single parenthesised argument prints identically as a Python 2 print
  # statement and as a Python 3 print() call.
  print("%s: <img src='%s'><br>" % (header, url))
 
def read_tsv(filename):
  """Read a tab-separated file into a list of rows.

  Args:
    filename: path of the file to read.

  Returns:
    A list with one entry per line; each entry is the list of that line's
    tab-separated fields as strings.
  """
  # open() in a with-block closes the handle promptly; the file() builtin
  # used previously exists only on Python 2.
  with open(filename) as handle:
    # Strip only the line terminator: a bare rstrip() would also remove
    # trailing tabs and silently drop empty trailing columns.
    return [line.rstrip("\r\n").split("\t") for line in handle]
 
def _print_maps(filename, what, area):
  """Print "Bytes by <what>" and "Hits by <what>" map images for one TSV file.

  Each row is expected to hold a region code, a hit count and a byte count.
  Rows whose region code is empty are ignored.

  Raises:
    ValueError: if no row has a region code, or a count is not a positive
      number (math.log rejects zero and negative values).
  """
  rows = [row for row in read_tsv(filename) if row[0]]
  if not rows:
    raise ValueError("no rows with a region code in %s" % filename)
  # Transpose the rows into one tuple per column.
  codes, hits, byte_counts = zip(*rows)
  # Take the log of the data and rescale. Lists (not lazy map objects) are
  # passed on because rescale() scans its input more than once.
  hits = rescale([math.log(float(h)) for h in hits])
  byte_counts = rescale([math.log(float(b)) for b in byte_counts])
  # print_url() appends ": " itself, so the headers carry no colon.
  print_url("Bytes by %s" % what, build_map_url(codes, byte_counts, area))
  print_url("Hits by %s" % what, build_map_url(codes, hits, area))


def main():
  """Print chart images for the world (by country) and the USA (by state)."""
  _print_maps("by_country.tsv", "country", "world")
  _print_maps("by_state.tsv", "state", "usa")
 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
  main()