Last active
February 24, 2022 21:08
-
-
Save billinkc/ad2259ec2fe200795b95dfa2a5f051b9 to your computer and use it in GitHub Desktop.
Quick and dirty sample of how we go about automating Google Analytics pulls.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install google-api-python-client | |
# pip install oauth2client | |
# pip install pandas | |
from datetime import datetime, timedelta, date | |
from dateutil.rrule import rrule, DAILY | |
import apiclient | |
from oauth2client.service_account import ServiceAccountCredentials | |
import json | |
import pandas as pd | |
import os | |
import sys | |
# Lazy globals
# Read-only scope: this job only pulls reports, it never mutates GA data.
scopes = ['https://www.googleapis.com/auth/analytics.readonly']
# Path to the service-account JSON key file (keep this out of version control).
key_file_location = "./seekrits.json"
##################################################################################
# Handle spinning up the Google Analytics bits
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location, scopes)
# Build the service object.
analytics = apiclient.discovery.build('analyticsreporting', 'v4', credentials=credentials)
# A list of the view properties we're interested in
# NOTE(review): these look like placeholder IDs -- confirm real GA view IDs before running.
views_list = [1,2,3,4,5]
def parse_response(response):
    """Parse an Analytics Reporting API V4 response into a dataframe.

    :param response: An Analytics Reporting API V4 response, i.e. the dict
        returned by ``reports().batchGet(...).execute()``.
    :return: A pandas DataFrame with one column per dimension/metric header
        and one row per report row. Empty if the response has no rows.
    """
    rows = []
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        for row in report.get('data', {}).get('rows', []):
            dimensions = row.get('dimensions', [])
            date_range_values = row.get('metrics', [])
            result_dict = {}
            # Dimension headers align positionally with the row's dimension values.
            for header, dimension in zip(dimension_headers, dimensions):
                result_dict[header] = dimension
            # One entry per requested date range; metric headers align with values.
            for values in date_range_values:
                for metric_header, value in zip(metric_headers, values.get('values')):
                    result_dict[metric_header.get('name')] = value
            rows.append(result_dict)
    # Build the frame once from a list of dicts: DataFrame.append in a loop
    # is quadratic (https://stackoverflow.com/a/50105723/181965), and the
    # intermediate counter-keyed dict the original used is no longer needed.
    return pd.DataFrame(rows)
def get_data(processing_date):
    """Pull one day of report data for every view in ``views_list``.

    We only ever pull one day at a time; this allows us to get one extra
    dimension into the request.

    :param processing_date: datetime for the single day to pull.
    :return: DataFrame of parsed rows across all views (empty if no views).
    """
    date_string = datetime.strftime(processing_date, '%Y-%m-%d')
    # Point to a valid base location for stashing out the response data.
    # You'll want to do this to save on api calls if you ever need to reprocess data.
    _base_path = './'
    # Collect per-view frames and concatenate ONCE at the end; calling
    # pd.concat inside the loop (as the original did) re-copies all prior
    # rows on every iteration and is quadratic in the number of views.
    frames = []
    for view_id in views_list:
        _body = {
            'reportRequests': [
                {
                    'pageSize': 100000,
                    "hideTotals": True,
                    "hideValueRanges": True,
                    'viewId': str(view_id),
                    'dateRanges': [{'startDate': date_string, 'endDate': date_string}],
                    'metrics': [{'expression': 'ga:sessions'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:newusers'}],
                    'dimensions': [
                        {'name': 'ga:regionid'},
                        {'name': 'ga:userType'},
                        {'name': 'ga:CountryIsoCode'},
                        {'name': 'ga:regionIsoCode'}
                    ]
                    # Demo dimension filter - yet another way you can slip a dimension into the extract
                    #,"dimensionFilterClauses": [
                    #    {
                    #        "filters": [{"dimension_name": "ga:countryIsoCode","operator": "EXACT","expressions": ["GB"]}]
                    #    }
                    #]
                }
            ]
        }
        _raw = analytics.reports().batchGet(body=_body).execute()
        # Dump the raw data out in case you ever need to work with it again.
        _file_name = os.path.join(_base_path, f'sample_data_{view_id}_{date_string}.json')
        with open(_file_name, 'w', encoding='utf-8') as f:
            json.dump(_raw, f, ensure_ascii=False)
        frames.append(parse_response(_raw))
    if not frames:
        # Mirrors the original behavior of returning an empty frame when
        # the loop never runs.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def main(argv):
    """Backfill the most recent window of analytics data, one day at a time.

    :param argv: sys.argv. If argv[1] is an integer it overrides the default
        7-day lookback window; anything else is ignored, matching the
        original behavior of not reading argv at all.
    """
    try:
        days_back = int(argv[1])
    except (IndexError, ValueError):
        days_back = 7
    # In normal execution, end date is yesterday and begin date is
    # days_back days before that.
    # NOTE(review): an original comment claimed processing continues until
    # the data carries isDataGolden, but no such check exists anywhere in
    # this script -- confirm whether that logic was lost or lives downstream.
    end_date = date.today() - timedelta(days=1)
    begin_date = end_date - timedelta(days=days_back)
    for current_date in rrule(DAILY, dtstart=begin_date, until=end_date):
        print(current_date.strftime("%Y-%m-%d"), ' @ ', datetime.now().strftime("%Y-%m-%d %H:%M"))
        _z = get_data(current_date)
        print(_z)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment