@billinkc
Last active February 24, 2022 21:08
Quick and dirty sample of how we go about automating Google Analytics pulls
# pip install google-api-python-client
# pip install oauth2client
# pip install pandas
from datetime import datetime, timedelta, date
from dateutil.rrule import rrule, DAILY
import apiclient
from oauth2client.service_account import ServiceAccountCredentials
import json
import pandas as pd
import os
import sys
# Lazy globals
scopes = ['https://www.googleapis.com/auth/analytics.readonly']
key_file_location = "./seekrits.json"
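# seekrits.json is a standard Google service account key file; as a rough
# illustration (values elided, not a real key), its shape is:
#   {"type": "service_account", "project_id": "...", "private_key": "-----BEGIN PRIVATE KEY-----\n...",
#    "client_email": "...", "client_id": "...", "token_uri": "https://oauth2.googleapis.com/token"}
# The service account's client_email also needs Read & Analyze access on each view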
##################################################################################
# Handle spinning up the google analytics bits
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location, scopes)
# Build the service object.
analytics = apiclient.discovery.build('analyticsreporting', 'v4', credentials=credentials)
# A list of the view properties we're interested in
views_list = [1,2,3,4,5]
def parse_response(response):
    """Parses the Analytics Reporting API V4 response into a dataframe.

    :param response: An Analytics Reporting API V4 response.
    :return: A parsed dataframe
    """
    _d = {}
    _i = 0
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        for row in report.get('data', {}).get('rows', []):
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])
            result_dict = {}
            for header, dimension in zip(dimension_headers, dimensions):
                result_dict[header] = dimension
            for i, values in enumerate(dateRangeValues):
                for metric_header, value in zip(metric_headers, values.get('values')):
                    result_dict[metric_header.get('name')] = value
            # Row-at-a-time append is slow, especially for large dataframes,
            # so accumulate rows in a dict and build the frame once at the end
            # https://stackoverflow.com/a/50105723/181965
            #df_response = df_response.append(result_dict, ignore_index = True)
            _d[_i] = result_dict
            _i += 1
    df_response = pd.DataFrame.from_dict(_d, orient='index')
    return df_response
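# A hedged sketch of the response shape parse_response expects, with made-up
# values for illustration (not real data); calling it returns a two-row frame
# with columns ga:userType and ga:sessions
def _demo_parse_response():
    fake_response = {
        'reports': [{
            'columnHeader': {
                'dimensions': ['ga:userType'],
                'metricHeader': {'metricHeaderEntries': [{'name': 'ga:sessions', 'type': 'INTEGER'}]}
            },
            'data': {
                'rows': [
                    {'dimensions': ['New Visitor'], 'metrics': [{'values': ['42']}]},
                    {'dimensions': ['Returning Visitor'], 'metrics': [{'values': ['7']}]}
                ]
            }
        }]
    }
    return parse_response(fake_response)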
def get_data(processing_date):
    """
    We only ever pull one day at a time; since the date is fixed per request,
    it acts as one extra dimension without using a slot in the dimensions list
    """
    date_string = datetime.strftime(processing_date, '%Y-%m-%d')
    # Point to a valid base location for stashing the response data.
    # You'll want to do this to save on API calls if you ever need to reprocess data
    _basePath = './'
    # df_results will accumulate the intermediate per-view results
    df_results = pd.DataFrame()
    for view_id in views_list:
        _body = {
            'reportRequests': [
                {
                    'pageSize': 100000,
                    'hideTotals': True,
                    'hideValueRanges': True,
                    'viewId': str(view_id),
                    'dateRanges': [{'startDate': date_string, 'endDate': date_string}],
                    'metrics': [
                        {'expression': 'ga:sessions'},
                        {'expression': 'ga:pageviews'},
                        {'expression': 'ga:newUsers'}
                    ],
                    'dimensions': [
                        {'name': 'ga:regionId'},
                        {'name': 'ga:userType'},
                        {'name': 'ga:countryIsoCode'},
                        {'name': 'ga:regionIsoCode'}
                    ]
                    # Demo dimension filter - yet another way you can slip a dimension into the extract
                    #, 'dimensionFilterClauses': [
                    #    {
                    #        'filters': [{'dimensionName': 'ga:countryIsoCode', 'operator': 'EXACT', 'expressions': ['GB']}]
                    #    }
                    #]
                }
            ]
        }
        _raw = analytics.reports().batchGet(body=_body).execute()
        # Dump the raw data out in case you ever need to work with it again
        _file_name = os.path.join(_basePath, f'sample_data_{view_id}_{date_string}.json')
        with open(_file_name, 'w', encoding='utf-8') as f:
            json.dump(_raw, f, ensure_ascii=False)
        _df = parse_response(_raw)
        # The first time through, we'll have nothing in the results data frame
        if df_results.empty:
            df_results = _df
        else:
            df_results = pd.concat([df_results, _df], ignore_index=True)
    return df_results
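# Because get_data dumps each raw response to disk, reprocessing a day doesn't
# have to spend API calls. A hedged sketch that rebuilds a dataframe from a
# cached dump (helper name is ours; file layout inferred from get_data's naming):
def reprocess_cached(view_id, date_string, base_path='./'):
    """Rebuild the dataframe for one view/day from the cached JSON dump"""
    file_name = os.path.join(base_path, f'sample_data_{view_id}_{date_string}.json')
    with open(file_name, 'r', encoding='utf-8') as f:
        return parse_response(json.load(f))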
def main(argv):
    days_back = 7
    # In normal execution, the end date is yesterday and the begin date is days_back days before that.
    # GA keeps revising recent days, so we re-pull older dates until the data
    # carries the isDataGolden element (see the is_data_golden sketch below)
    end_date = date.today() - timedelta(days=1)
    begin_date = end_date - timedelta(days=days_back)
    for current_date in rrule(DAILY, dtstart=begin_date, until=end_date):
        print(current_date.strftime("%Y-%m-%d"), ' @ ', datetime.now().strftime("%Y-%m-%d %H:%M"))
        _z = get_data(current_date)
        print(_z)
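# The comment in main mentions isDataGolden: in the Reporting API v4, each
# report's data object carries an isDataGolden boolean once GA has finished
# processing that day. A hedged sketch of checking a raw response for it
# (the helper name is ours, not part of the API):
def is_data_golden(raw_response):
    """True only when every report in the response is marked golden (final)"""
    reports = raw_response.get('reports', [])
    return bool(reports) and all(r.get('data', {}).get('isDataGolden', False) for r in reports)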
if __name__ == "__main__":
    main(sys.argv)