@billinkc
Last active February 24, 2022 21:08
Quick and dirty sample of how we go about automating Google Analytics pulls
# pip install google-api-python-client
# pip install oauth2client
# pip install pandas
from datetime import datetime, timedelta, date
from dateutil.rrule import rrule, DAILY
import apiclient
from oauth2client.service_account import ServiceAccountCredentials
import json
import pandas as pd
import os
import sys
# Lazy globals
scopes = ['https://www.googleapis.com/auth/analytics.readonly']
key_file_location = "./seekrits.json"
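# seekrits.json is a standard Google service account key file; as a rough
# illustration (values elided, not a real key), its shape is:
#   {"type": "service_account", "project_id": "...", "private_key": "-----BEGIN PRIVATE KEY-----\n...",
#    "client_email": "...", "client_id": "...", "token_uri": "https://oauth2.googleapis.com/token"}
# The service account's client_email also needs Read & Analyze access on each view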
##################################################################################
# Handle spinning up the google analytics bits
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location, scopes)
# Build the service object.
analytics = apiclient.discovery.build('analyticsreporting', 'v4', credentials=credentials)
# A list of the view properties we're interested in
views_list = [1,2,3,4,5]
def parse_response(response):
    """Parses the Analytics Reporting API V4 response into a dataframe.

    :param response: An Analytics Reporting API V4 response.
    :return: A parsed dataframe
    """
    _d = {}
    _i = 0
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        for row in report.get('data', {}).get('rows', []):
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])
            result_dict = {}
            for header, dimension in zip(dimension_headers, dimensions):
                result_dict[header] = dimension
            for i, values in enumerate(dateRangeValues):
                for metric_header, value in zip(metric_headers, values.get('values')):
                    result_dict[metric_header.get('name')] = value
            # Row-at-a-time append is slow, especially for large dataframes,
            # so accumulate rows in a dict and build the frame once at the end
            # https://stackoverflow.com/a/50105723/181965
            #df_response = df_response.append(result_dict, ignore_index = True)
            _d[_i] = result_dict
            _i += 1
    df_response = pd.DataFrame.from_dict(_d, orient='index')
    return df_response
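# A hedged sketch of the response shape parse_response expects, with made-up
# values for illustration (not real data); calling it returns a two-row frame
# with columns ga:userType and ga:sessions
def _demo_parse_response():
    fake_response = {
        'reports': [{
            'columnHeader': {
                'dimensions': ['ga:userType'],
                'metricHeader': {'metricHeaderEntries': [{'name': 'ga:sessions', 'type': 'INTEGER'}]}
            },
            'data': {
                'rows': [
                    {'dimensions': ['New Visitor'], 'metrics': [{'values': ['42']}]},
                    {'dimensions': ['Returning Visitor'], 'metrics': [{'values': ['7']}]}
                ]
            }
        }]
    }
    return parse_response(fake_response)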
def get_data(processing_date):
    """
    We only ever pull one day at a time; since the date is fixed per request,
    it acts as one extra dimension without using a slot in the dimensions list
    """
    date_string = datetime.strftime(processing_date, '%Y-%m-%d')
    # Point to a valid base location for stashing the response data.
    # You'll want to do this to save on API calls if you ever need to reprocess data
    _basePath = './'
    # df_results will accumulate the intermediate per-view results
    df_results = pd.DataFrame()
    for view_id in views_list:
        _body = {
            'reportRequests': [
                {
                    'pageSize': 100000,
                    'hideTotals': True,
                    'hideValueRanges': True,
                    'viewId': str(view_id),
                    'dateRanges': [{'startDate': date_string, 'endDate': date_string}],
                    'metrics': [
                        {'expression': 'ga:sessions'},
                        {'expression': 'ga:pageviews'},
                        {'expression': 'ga:newUsers'}
                    ],
                    'dimensions': [
                        {'name': 'ga:regionId'},
                        {'name': 'ga:userType'},
                        {'name': 'ga:countryIsoCode'},
                        {'name': 'ga:regionIsoCode'}
                    ]
                    # Demo dimension filter - yet another way you can slip a dimension into the extract
                    #, 'dimensionFilterClauses': [
                    #    {
                    #        'filters': [{'dimensionName': 'ga:countryIsoCode', 'operator': 'EXACT', 'expressions': ['GB']}]
                    #    }
                    #]
                }
            ]
        }
        _raw = analytics.reports().batchGet(body=_body).execute()
        # Dump the raw data out in case you ever need to work with it again
        _file_name = os.path.join(_basePath, f'sample_data_{view_id}_{date_string}.json')
        with open(_file_name, 'w', encoding='utf-8') as f:
            json.dump(_raw, f, ensure_ascii=False)
        _df = parse_response(_raw)
        # The first time through, we'll have nothing in the results data frame
        if df_results.empty:
            df_results = _df
        else:
            df_results = pd.concat([df_results, _df], ignore_index=True)
    return df_results
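# Because get_data dumps each raw response to disk, reprocessing a day doesn't
# have to spend API calls. A hedged sketch that rebuilds a dataframe from a
# cached dump (helper name is ours; file layout inferred from get_data's naming):
def reprocess_cached(view_id, date_string, base_path='./'):
    """Rebuild the dataframe for one view/day from the cached JSON dump"""
    file_name = os.path.join(base_path, f'sample_data_{view_id}_{date_string}.json')
    with open(file_name, 'r', encoding='utf-8') as f:
        return parse_response(json.load(f))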
def main(argv):
    days_back = 7
    # In normal execution, the end date is yesterday and the begin date is days_back days before that.
    # GA keeps revising recent days, so we re-pull older dates until the data
    # carries the isDataGolden element (see the is_data_golden sketch below)
    end_date = date.today() - timedelta(days=1)
    begin_date = end_date - timedelta(days=days_back)
    for current_date in rrule(DAILY, dtstart=begin_date, until=end_date):
        print(current_date.strftime("%Y-%m-%d"), ' @ ', datetime.now().strftime("%Y-%m-%d %H:%M"))
        _z = get_data(current_date)
        print(_z)
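# The comment in main mentions isDataGolden: in the Reporting API v4, each
# report's data object carries an isDataGolden boolean once GA has finished
# processing that day. A hedged sketch of checking a raw response for it
# (the helper name is ours, not part of the API):
def is_data_golden(raw_response):
    """True only when every report in the response is marked golden (final)"""
    reports = raw_response.get('reports', [])
    return bool(reports) and all(r.get('data', {}).get('isDataGolden', False) for r in reports)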
if __name__ == "__main__":
    main(sys.argv)