Created
May 27, 2012 02:39
-
-
Save amanahuja/2796956 to your computer and use it in GitHub Desktop.
Load and prepare data on Consumer Electronics sales and corresponding Google Search queries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Thu May 22 20:30:36 2012 | |
http://www.meetup.com/r-enthusiasts/events/65306492/ | |
Mirroring the work that we do in Python. | |
This is the code to import the sales and query data into a Py-Pandas | |
dataframe (with conversion to time series). | |
Author (twitter): @amanqa | |
""" | |
import os | |
os.chdir("c:\TEMP\REnthusiasts\GoogleTrends-Code") | |
import pandas | |
from datetime import datetime | |
''' | |
Load first dataframe with Google Trends data | |
''' | |
df1 = pandas.read_csv("googletrends.csv") | |
df1 = df1.rename(columns = { | |
'Week': 'date', | |
'Consumer Electronics': 'query_index' | |
}, copy= False) | |
#Convert type of query volume index to float | |
df1['query_index'] = df1['query_index'].apply( | |
lambda ss: float(ss.replace('%', ''))) | |
#Convert to datetime | |
df1['date'] = df1['date'].apply( | |
lambda ss: ss.split(' - ')[0].split('-')) | |
df1['date'] = df1['date'].apply( | |
lambda ss: datetime(int(ss[0]),int(ss[1]),int(ss[2]))) | |
''' | |
Load second dataframe with sales data | |
''' | |
df2 = pandas.read_csv("sales.csv") | |
theyear = 2012 | |
for idx, mm in enumerate(df2['month']): | |
if mm.startswith('January'): theyear -= 1 | |
df2['month'][idx] += ' ' + str(theyear) | |
df2['month'] = df2['month'].apply( | |
lambda ss: datetime.strptime(ss, '%B %d %Y')) | |
df2 = df2.rename(columns = { | |
'month': 'date', | |
'Sales': 'sales', | |
}, copy = False) | |
''' | |
Merge the dataframes. | |
''' | |
df = pandas.merge(df1, df2, on='date', how='outer') | |
''' | |
Set df index to the 'date' column, and now we have a | |
time series dataframe ready for analysis. | |
''' | |
ts = df.set_index('date') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment