Revisions

  • 8a27c8 Mon Jun 15 19:13:48 -0700 2009
gist: 130483 Download_button fork
public
Public Clone URL: git://gist.github.com/130483.git
Embed All Files: show embed
hive_monthly_trend.sql #
1
-- Stream the page_id, dates, pageviews and total_pageviews columns of
-- new_daily_timelines through the hive_monthly_trend_mapper.py script and
-- name the script's three output columns page_id, total_pageviews and
-- monthly_trend.
FROM new_daily_timelines ndt MAP ndt.page_id, ndt.dates, ndt.pageviews, ndt.total_pageviews USING 'python hive_monthly_trend_mapper.py' AS page_id, total_pageviews, monthly_trend
hive_monthly_trend_mapper.py #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env python
# encoding: utf-8
"""
hive_monthly_trend_mapper.py
 
Python Hadoop Streaming script called by Hive
in daily run - calculates simple baseline
monthly trend for "Biggest Movers"
 
Created by Peter Skomoroch on 2009-06-10.
Copyright (c) 2009 Data Wrangling LLC. All rights reserved.
"""
 
import sys, os, re
import simplejson
 
def calc_trend(dates, pageviews, window=15):
  """Return the baseline trend: recent pageviews minus the prior period.

  The (date, pageviews) pairs are sorted by date, then the sum of the
  last `window` counts is compared with the sum of the `window` counts
  immediately before them.

  Args:
    dates: sequence of date values, one per observation. They are only
      compared with each other, so they are assumed to sort
      chronologically (e.g. ISO-formatted strings) -- confirm with the
      producer of the data.
    pageviews: sequence of numeric counts, parallel to `dates`. If the two
      sequences differ in length, the extra items of the longer one are
      ignored (zip semantics).
    window: number of observations in each of the two compared periods.
      Defaults to 15, i.e. the last 15 entries versus the 15 before them.

  Returns:
    A float: sum(latest window) - sum(previous window). When fewer than
    2 * window observations exist, the missing ones contribute nothing.

  Raises:
    ValueError: if `window` is not positive, or if there are no
      observations at all.
  """
  if window <= 0:
    # counts[-0:] would select the whole series, not an empty window.
    raise ValueError("window must be a positive number of observations")
  ordered = sorted(zip(dates, pageviews))
  # Unpacking an empty result raises ValueError, which callers treat as a
  # bad row.
  _, counts = zip(*ordered)
  recent = sum(counts[-window:])
  # 1.0 * keeps the result a float even when all counts are integers.
  previous = 1.0 * sum(counts[-2 * window:-window])
  return recent - previous
 
# Each stdin line is one tab-separated row:
#   page_id, dates (JSON), pageviews (JSON), total_pageviews
# One tab-separated output row is written per input row:
#   page_id, total_pageviews, monthly_trend
# NOTE: a line that does not split into exactly four fields still raises
# ValueError here, since there is no page_id to emit for it.
for line in sys.stdin:
  (page_id, dates, pageviews, total_pageviews) = line.strip().split("\t")
  try:
    # Decode inside the try so that an undecodable JSON column counts as a
    # bad row instead of aborting the whole streaming task.
    monthly_trend = calc_trend(simplejson.loads(dates),
                               simplejson.loads(pageviews))
  except Exception:
    # Bad row (undecodable JSON, empty or unsortable series): the page is
    # still emitted, with a neutral trend of 0. Catching Exception rather
    # than using a bare except lets KeyboardInterrupt and SystemExit
    # propagate.
    monthly_trend = 0
  sys.stdout.write('%s\t%s\t%s\n' % (page_id, total_pageviews, monthly_trend))