Skip to content

Instantly share code, notes, and snippets.

@kemsakurai
Last active September 26, 2018 14:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kemsakurai/e2ae55360a929d7cd4b8ffd7305381d0 to your computer and use it in GitHub Desktop.
Save kemsakurai/e2ae55360a929d7cd4b8ffd7305381d0 to your computer and use it in GitHub Desktop.
Google Analytics のデータをBigQuery に投入する Pythonスクリプト一式

Google Analytics の無料版は、BigQuery への Export機能が使えない。
このため、google2pandas を使って、Google Analytics からデータを取得し、SQLiteへ分割登録、登録結果を結合して CSVにし、それを GCS経由でBigQuery にインポートするスクリプトを作成した。
これらはそれら一式。


使用方法

Google Cloud Platform の設定を一通り実施。
BigQuery のデータセット、GCS にディレクトリを作成。
スクリプト内の設定値を書き換えた上で、以下のようにコマンドを実行する。

python3 ga_to_sqlite.py 2018-03-18
python3 merge_data_to_gcs.py 2018-03-18
python3 load_data_to_bigquery_from_gcs.py 2018-03-18

これで 2018-03-18 のデータが BigQuery に保存される。


ga_to_sqlite.py の アウトプット

  • 1日分のデータ取得で作成されるテーブルの一覧
select name from sqlite_master where type='table' and name like '%20180318%';
  • 結果
"ga_sessions_20180318_USER"
"ga_sessions_20180318_SESSION"
"ga_sessions_20180318_TRAFFIC0"
"ga_sessions_20180318_TRAFFIC1"
"ga_sessions_20180318_TRAFFIC2"
"ga_sessions_20180318_GOAL_CONVERSIONS"
"ga_sessions_20180318_GOAL_CONVERSIONS1"
"ga_sessions_20180318_GOAL_CONVERSIONS2"
"ga_sessions_20180318_GOAL_CONVERSIONS3"
"ga_sessions_20180318_GOAL_CONVERSIONS4"
"ga_sessions_20180318_GOAL_CONVERSIONS5"
"ga_sessions_20180318_PLATFORM_OR_DEVICE0"
"ga_sessions_20180318_PLATFORM_OR_DEVICE1"
"ga_sessions_20180318_PLATFORM_OR_DEVICE2"
"ga_sessions_20180318_GEO_NETWORK0"
"ga_sessions_20180318_GEO_NETWORK1"
"ga_sessions_20180318_GEO_NETWORK2"
"ga_sessions_20180318_GEO_NETWORK3"
"ga_sessions_20180318_SYSTEM0"
"ga_sessions_20180318_SYSTEM1"
"ga_sessions_20180318_PAGE_TRACKING0"
"ga_sessions_20180318_PAGE_TRACKING1"
"ga_sessions_20180318_PAGE_TRACKING2"
"ga_sessions_20180318_PAGE_TRACKING3"
"ga_sessions_20180318_EVENT_TRACKING"
"ga_sessions_20180318_CUSTOM_VARIABLES_OR_COLUMNS0"
"ga_sessions_20180318_CUSTOM_VARIABLES_OR_COLUMNS1"
"ga_sessions_20180318_CUSTOM_VARIABLES_OR_COLUMNS2"
"ga_sessions_20180318_CUSTOM_VARIABLES_OR_COLUMNS3"
"ga_sessions_20180318_CUSTOM_VARIABLES_OR_COLUMNS4"
from google2pandas import *
GA_CLIENT_JSON = './your_client.json'
VIEW_ID = 'YOUR_VIEW_ID'
USER_DIMENSIONS = [{'name': 'ga:userType'},
{'name': 'ga:sessionCount'},
{'name': 'ga:daysSinceLastSession'},
{'name': 'ga:userDefinedValue'},
{'name': 'ga:userBucket'}]
USER_METRICS = [{"expression": "ga:users"},
{"expression": "ga:newUsers"},
{"expression": "ga:percentNewSessions"},
{"expression": "ga:1dayUsers"},
{"expression": "ga:7dayUsers"},
{"expression": "ga:14dayUsers"},
{"expression": "ga:28dayUsers"},
{"expression": "ga:30dayUsers"},
{"expression": "ga:sessionsPerUser"}]
SESSION_DIMENSIONS = [{'name': 'ga:sessionDurationBucket'}]
SESSION_METRICS = [{'expression': 'ga:sessions'},
{'expression': 'ga:bounces'},
{'expression': 'ga:bounceRate'},
{'expression': 'ga:sessionDuration'},
{'expression': 'ga:avgSessionDuration'},
{'expression': 'ga:uniqueDimensionCombinations'},
{'expression': 'ga:hits'}]
TRAFFIC_SOURCES_DIMENSIONS = [{'name': 'ga:referralPath'},
{'name': 'ga:fullReferrer'},
{'name': 'ga:campaign'},
{'name': 'ga:source'},
{'name': 'ga:medium'},
{'name': 'ga:sourceMedium'},
{'name': 'ga:keyword'},
{'name': 'ga:adContent'},
{'name': 'ga:socialNetwork'},
{'name': 'ga:hasSocialSourceReferral'},
{'name': 'ga:campaignCode'}
]
TRAFFIC_SOURCES_METRICS = [{'expression': 'ga:organicSearches'}, ]
# ------------------------------------
# Adwords は Skip
# ------------------------------------
GOAL_CONVERSIONS_DIMENSIONS = [{'name': 'ga:goalCompletionLocation'},
{'name': 'ga:goalPreviousStep1'},
{'name': 'ga:goalPreviousStep2'},
{'name': 'ga:goalPreviousStep3'}, ]
GOAL_CONVERSIONS_METRICS = [{'expression': 'ga:goalStartsAll'},
{'expression': 'ga:goalCompletionsAll'},
{'expression': 'ga:goalValueAll'},
{'expression': 'ga:goalValuePerSession'},
{'expression': 'ga:goalConversionRateAll'},
{'expression': 'ga:goalAbandonsAll'},
{'expression': 'ga:goalAbandonRateAll'}, ]
GOAL_CONVERSIONS_METRICS_1 = [{'expression': 'ga:goal1Starts'},
{'expression': 'ga:goal1Completions'},
{'expression': 'ga:goal1Value'},
{'expression': 'ga:goal1ConversionRate'},
{'expression': 'ga:goal1Abandons'},
{'expression': 'ga:goal1AbandonRate'}, ]
GOAL_CONVERSIONS_METRICS_2 = [{'expression': 'ga:goal2Starts'},
{'expression': 'ga:goal2Completions'},
{'expression': 'ga:goal2Value'},
{'expression': 'ga:goal2ConversionRate'},
{'expression': 'ga:goal2Abandons'},
{'expression': 'ga:goal2AbandonRate'}, ]
GOAL_CONVERSIONS_METRICS_3 = [{'expression': 'ga:goal3Starts'},
{'expression': 'ga:goal3Completions'},
{'expression': 'ga:goal3Value'},
{'expression': 'ga:goal3ConversionRate'},
{'expression': 'ga:goal3Abandons'},
{'expression': 'ga:goal3AbandonRate'}, ]
GOAL_CONVERSIONS_METRICS_4 = [{'expression': 'ga:goal4Starts'},
{'expression': 'ga:goal4Completions'},
{'expression': 'ga:goal4Value'},
{'expression': 'ga:goal4ConversionRate'},
{'expression': 'ga:goal4Abandons'},
{'expression': 'ga:goal4AbandonRate'}, ]
GOAL_CONVERSIONS_METRICS_5 = [{'expression': 'ga:goal5Starts'},
{'expression': 'ga:goal5Completions'},
{'expression': 'ga:goal5Value'},
{'expression': 'ga:goal5ConversionRate'},
{'expression': 'ga:goal5Abandons'},
{'expression': 'ga:goal5AbandonRate'}, ]
PLATFORM_OR_DEVICE_DIMENSIONS = [{'name': 'ga:browser'},
{'name': 'ga:browserVersion'},
{'name': 'ga:operatingSystem'},
{'name': 'ga:operatingSystemVersion'},
{'name': 'ga:mobileDeviceBranding'},
{'name': 'ga:mobileDeviceModel'},
{'name': 'ga:mobileInputSelector'},
{'name': 'ga:mobileDeviceInfo'},
{'name': 'ga:mobileDeviceMarketingName'},
{'name': 'ga:deviceCategory'},
{'name': 'ga:browserSize'},
{'name': 'ga:dataSource'}, ]
GEO_NETWORK_DIMENSIONS = [{'name': 'ga:continent'},
{'name': 'ga:subContinent'},
{'name': 'ga:country'},
{'name': 'ga:region'},
{'name': 'ga:metro'},
{'name': 'ga:city'},
{'name': 'ga:latitude'},
{'name': 'ga:longitude'},
{'name': 'ga:networkDomain'},
{'name': 'ga:networkLocation'},
{'name': 'ga:cityId'},
{'name': 'ga:continentId'},
{'name': 'ga:countryIsoCode'},
{'name': 'ga:metroId'},
{'name': 'ga:regionId'},
{'name': 'ga:regionIsoCode'},
{'name': 'ga:subContinentCode'}, ]
SYSTEM_DIMENSIONS = [
{'name': 'ga:flashVersion'},
{'name': 'ga:javaEnabled'},
{'name': 'ga:language'},
{'name': 'ga:screenColors'},
{'name': 'ga:sourcePropertyDisplayName'},
{'name': 'ga:sourcePropertyTrackingId'},
{'name': 'ga:screenResolution'},
]
PAGE_TRACKING_DIMENSIONS = [
{'name': 'ga:hostname'},
{'name': 'ga:pagePath'},
{'name': 'ga:pagePathLevel1'},
{'name': 'ga:pagePathLevel2'},
{'name': 'ga:pagePathLevel3'},
{'name': 'ga:pagePathLevel4'},
{'name': 'ga:pageTitle'},
{'name': 'ga:landingPagePath'},
{'name': 'ga:secondPagePath'},
{'name': 'ga:exitPagePath'},
{'name': 'ga:previousPagePath'},
{'name': 'ga:pageDepth'}, ]
PAGE_TRACKING_METRICS = [
{'expression': 'ga:pageValue'},
{'expression': 'ga:entrances'},
{'expression': 'ga:entranceRate'},
{'expression': 'ga:pageviews'},
{'expression': 'ga:pageviewsPerSession'},
{'expression': 'ga:uniquePageviews'},
{'expression': 'ga:timeOnPage'},
{'expression': 'ga:avgTimeOnPage'},
{'expression': 'ga:exits'},
{'expression': 'ga:exitRate'}, ]
EVENT_TRACKING_DIMENSIONS = [
{'name': 'ga:eventCategory'},
{'name': 'ga:eventAction'},
{'name': 'ga:eventLabel'}, ]
EVENT_TRACKING_METRICS = [
{'expression': 'ga:totalEvents'},
{'expression': 'ga:uniqueEvents'},
{'expression': 'ga:eventValue'},
{'expression': 'ga:avgEventValue'},
{'expression': 'ga:sessionsWithEvent'},
{'expression': 'ga:eventsPerSessionWithEvent'},
]
CUSTOM_VARIABLES_OR_COLUMNS_DIMENSIONS = [
{'name': 'ga:dimension1'},
{'name': 'ga:dimension2'},
{'name': 'ga:dimension3'},
{'name': 'ga:dimension4'},
{'name': 'ga:dimension5'},
{'name': 'ga:dimension6'},
{'name': 'ga:dimension7'},
# 既に設定済なのでコメントアウト
# {'name': 'ga:dimension8'},
{'name': 'ga:dimension9'},
# 既に設定済なのでコメントアウト
# {'name': 'ga:dimension10'},
{'name': 'ga:dimension11'},
{'name': 'ga:dimension12'},
{'name': 'ga:dimension13'},
{'name': 'ga:dimension14'},
{'name': 'ga:dimension15'},
{'name': 'ga:dimension16'},
{'name': 'ga:dimension17'},
{'name': 'ga:dimension18'},
{'name': 'ga:dimension19'},
# 未設定なのでコメントアウト
# {'name': 'ga:dimension20'},
]
from functools import reduce
def divide_list(xs, n):
q = len(xs) // n
m = len(xs) % n
return reduce(
lambda acc, i:
(lambda fr=sum([len(x) for x in acc]):
acc + [xs[fr:(fr + q + (1 if i < m else 0))]]
)()
,
range(n),
[]
)
def convert_datetime(x):
try:
import datetime
x = int(x) / 1000
return datetime.datetime.fromtimestamp(x);
except TypeError:
return x
except ValueError:
return x
def store_ga_to_sqlite(date, dimensions, metrics, category_name):
query_dimensions = [{'name': 'ga:dimension8'}, {'name': 'ga:dimension10'}]
query_dimensions.extend(dimensions)
query = {
'reportRequests': [{
'viewId': VIEW_ID,
'dateRanges': [{
'startDate': date,
'endDate': date}],
'dimensions': query_dimensions,
'metrics': metrics,
}]
}
# Assume we have placed our client_secrets_v4.json file in the current
# working directory.
try:
conn = GoogleAnalyticsQueryV4(secrets=GA_CLIENT_JSON)
# table名生成
table_name = "ga_sessions_" + date.replace("-", "") + "_" + category_name
df = conn.execute_query(query)
df['dimension10'] = df['dimension10'].apply(lambda x: convert_datetime(x))
except TypeError:
print("TypeError... table_name[", table_name, "]", "create empty table.")
import pandas as pd
columns = []
for i in query_dimensions:
for value in i.values():
columns.append(value.replace("ga:", ""))
for i in metrics:
for value in i.values():
columns.append(value.replace("ga:", ""))
df = pd.DataFrame(columns=columns)
import sqlite3
# db作成
db_name = "ga.db"
# dbコネクト
conn = sqlite3.connect(db_name)
df.to_sql(table_name, conn,
index=False, if_exists="replace")
# コネクションを閉じる
conn.close()
def main():
import argparse
parser = argparse.ArgumentParser(description='ga_hit_data_to_sqlite')
parser.add_argument('date', help='Data range date')
args = parser.parse_args()
print("ga_to_sqlite start. date [", args.date, "]")
# ------------------------------------------
# USER_DIMENSIONS USER_METRICSは指定すると、
# googleapiclient.errors.HttpError: <HttpError 400 when requesting https://analyticsreporting.googleapis.com/v4/reports:batchGet?alt=json returned "Selected dimensions and metrics cannot be queried together.">
# エラーとなる。計算できないようなので設定しない
# ------------------------------------------
store_ga_to_sqlite(args.date, USER_DIMENSIONS, [], "USER")
# ------------------------------------------
# SESSION_DIMENSIONS
# ------------------------------------------
store_ga_to_sqlite(args.date, SESSION_DIMENSIONS, SESSION_METRICS, "SESSION")
# ------------------------------------------
# TRAFFIC_SOURCES_DIMENSIONS
# ------------------------------------------
sublist = divide_list(TRAFFIC_SOURCES_DIMENSIONS, 3)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, TRAFFIC_SOURCES_METRICS, "TRAFFIC" + str(i))
# ------------------------------------------
# GOAL_CONVERSIONS_DIMENSIONS
# ------------------------------------------
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS,
"GOAL_CONVERSIONS")
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS_1,
"GOAL_CONVERSIONS1")
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS_2,
"GOAL_CONVERSIONS2")
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS_3,
"GOAL_CONVERSIONS3")
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS_4,
"GOAL_CONVERSIONS4")
store_ga_to_sqlite(args.date, GOAL_CONVERSIONS_DIMENSIONS, GOAL_CONVERSIONS_METRICS_5,
"GOAL_CONVERSIONS5")
# -------------------------------------------
# PLATFORM_OR_DEVICE_DIMENSIONS
# -------------------------------------------
sublist = divide_list(PLATFORM_OR_DEVICE_DIMENSIONS, 3)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, [], "PLATFORM_OR_DEVICE" + str(i))
# -------------------------------------------
# GEO_NETWORK_DIMENSIONS
# -------------------------------------------
sublist = divide_list(GEO_NETWORK_DIMENSIONS, 4)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, [], "GEO_NETWORK" + str(i))
# -------------------------------------------
# SYSTEM_DIMENSIONS
# -------------------------------------------
sublist = divide_list(SYSTEM_DIMENSIONS, 2)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, [], "SYSTEM" + str(i))
# -------------------------------------------
# PAGE_TRACKING_DIMENSIONS
# -------------------------------------------
sublist = divide_list(PAGE_TRACKING_DIMENSIONS, 4)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, PAGE_TRACKING_METRICS, "PAGE_TRACKING" + str(i))
store_ga_to_sqlite(args.date, EVENT_TRACKING_DIMENSIONS, EVENT_TRACKING_METRICS,
"EVENT_TRACKING")
sublist = divide_list(CUSTOM_VARIABLES_OR_COLUMNS_DIMENSIONS, 5)
for i, elem in enumerate(sublist):
store_ga_to_sqlite(args.date, elem, [], "CUSTOM_VARIABLES_OR_COLUMNS" + str(i))
if __name__ == '__main__':
main()
GCS_DIR = 'gs://YOUR_GCS_DIR'
DATA_SET_ID = 'YOUR_DATA_SET_ID'
TABLE_PREFIX = "ga_sessions_"
GOOGLE_APPLICATION_CREDENTIALS_PATH = './CREDENTIALS.json'
def main():
import argparse
parser = argparse.ArgumentParser(description='load_data_to_bigquery_from_gcs')
parser.add_argument('date', help='date')
args = parser.parse_args()
table_name = file_name = TABLE_PREFIX + args.date.replace("-", "")
print("Load start [table_name]", table_name)
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GOOGLE_APPLICATION_CREDENTIALS_PATH
from google.cloud import bigquery
client = bigquery.Client()
data_set_id = DATA_SET_ID
data_set_ref = client.dataset(data_set_id)
job_config = bigquery.LoadJobConfig()
job_config.autodetect = False
job_config.quote_character = '"'
job_config.allow_quoted_newlines = True
job_config.skip_leading_rows = 1
# The source format defaults to CSV, so the line below is optional.
job_config.source_format = bigquery.SourceFormat.CSV
job_config.schema = [
bigquery.SchemaField('avgSessionDuration','FLOAT','NULLABLE'),
bigquery.SchemaField('bounceRate','FLOAT','NULLABLE'),
bigquery.SchemaField('bounces','INTEGER','NULLABLE'),
bigquery.SchemaField('dimension10', 'TIMESTAMP', 'NULLABLE'),
bigquery.SchemaField('dimension8', 'STRING', 'NULLABLE'),
bigquery.SchemaField('hits', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('sessionDuration', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('sessionDurationBucket', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('sessions', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('uniqueDimensionCombinations', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('continent', 'STRING', 'NULLABLE'),
bigquery.SchemaField('country', 'STRING', 'NULLABLE'),
bigquery.SchemaField('metro', 'STRING', 'NULLABLE'),
bigquery.SchemaField('region', 'STRING', 'NULLABLE'),
bigquery.SchemaField('subContinent', 'STRING', 'NULLABLE'),
bigquery.SchemaField('visits', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('city', 'STRING', 'NULLABLE'),
bigquery.SchemaField('latitude', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('longitude', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('networkDomain', 'STRING', 'NULLABLE'),
bigquery.SchemaField('cityId', 'STRING', 'NULLABLE'),
bigquery.SchemaField('continentId', 'STRING', 'NULLABLE'),
bigquery.SchemaField('countryIsoCode', 'STRING', 'NULLABLE'),
bigquery.SchemaField('networkLocation', 'STRING', 'NULLABLE'),
bigquery.SchemaField('metroId', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('regionId', 'STRING', 'NULLABLE'),
bigquery.SchemaField('regionIsoCode', 'STRING', 'NULLABLE'),
bigquery.SchemaField('subContinentCode', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goalAbandonRateAll', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goalAbandonsAll', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goalCompletionLocation', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goalCompletionsAll', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goalConversionRateAll', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goalPreviousStep1', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goalPreviousStep2', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goalPreviousStep3', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goalStartsAll', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goalValueAll', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goalValuePerSession', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goal1Starts', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal1Completions', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal1Value', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal1ConversionRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal1Abandons', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal1AbandonRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal2AbandonRate', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goal2Abandons', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goal2Completions', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goal2ConversionRate', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goal2Starts', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('goal2Value', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('goal3Starts', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal3Completions', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal3Value', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal3ConversionRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal3Abandons', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal3AbandonRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4Starts', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4Completions', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4Value', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4ConversionRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4Abandons', 'STRING', 'NULLABLE'),
bigquery.SchemaField('goal4AbandonRate', 'STRING', 'NULLABLE'),
bigquery.SchemaField('browser', 'STRING', 'NULLABLE'),
bigquery.SchemaField('browserVersion', 'STRING', 'NULLABLE'),
bigquery.SchemaField('operatingSystem', 'STRING', 'NULLABLE'),
bigquery.SchemaField('operatingSystemVersion', 'STRING', 'NULLABLE'),
bigquery.SchemaField('mobileDeviceBranding', 'STRING', 'NULLABLE'),
bigquery.SchemaField('mobileDeviceInfo', 'STRING', 'NULLABLE'),
bigquery.SchemaField('mobileDeviceModel', 'STRING', 'NULLABLE'),
bigquery.SchemaField('mobileInputSelector', 'STRING', 'NULLABLE'),
bigquery.SchemaField('browserSize', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dataSource', 'STRING', 'NULLABLE'),
bigquery.SchemaField('deviceCategory', 'STRING', 'NULLABLE'),
bigquery.SchemaField('mobileDeviceMarketingName', 'STRING', 'NULLABLE'),
bigquery.SchemaField('flashVersion', 'STRING', 'NULLABLE'),
bigquery.SchemaField('javaEnabled', 'BOOLEAN', 'NULLABLE'),
bigquery.SchemaField('language', 'STRING', 'NULLABLE'),
bigquery.SchemaField('screenColors', 'STRING', 'NULLABLE'),
bigquery.SchemaField('sourcePropertyDisplayName', 'STRING', 'NULLABLE'),
bigquery.SchemaField('sourcePropertyTrackingId', 'STRING', 'NULLABLE'),
bigquery.SchemaField('screenResolution', 'STRING', 'NULLABLE'),
bigquery.SchemaField('avgTimeOnPage', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('entranceRate', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('entrances', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('exitRate', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('exits', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('hostname', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pagePath', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pagePathLevel1', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pageValue', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('pageviews', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('pageviewsPerSession', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('timeOnPage', 'FLOAT', 'NULLABLE'),
bigquery.SchemaField('uniquePageviews', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('pagePathLevel2', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pagePathLevel3', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pagePathLevel4', 'STRING', 'NULLABLE'),
bigquery.SchemaField('landingPagePath', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pageTitle', 'STRING', 'NULLABLE'),
bigquery.SchemaField('secondPagePath', 'STRING', 'NULLABLE'),
bigquery.SchemaField('exitPagePath', 'STRING', 'NULLABLE'),
bigquery.SchemaField('pageDepth', 'INTEGER', 'NULLABLE'),
bigquery.SchemaField('previousPagePath', 'STRING', 'NULLABLE'),
bigquery.SchemaField('eventCategory', 'STRING', 'NULLABLE'),
bigquery.SchemaField('eventAction', 'STRING', 'NULLABLE'),
bigquery.SchemaField('eventLabel', 'STRING', 'NULLABLE'),
bigquery.SchemaField('totalEvents', 'STRING', 'NULLABLE'),
bigquery.SchemaField('uniqueEvents', 'STRING', 'NULLABLE'),
bigquery.SchemaField('eventValue', 'STRING', 'NULLABLE'),
bigquery.SchemaField('avgEventValue', 'STRING', 'NULLABLE'),
bigquery.SchemaField('sessionsWithEvent', 'STRING', 'NULLABLE'),
bigquery.SchemaField('eventsPerSessionWithEvent', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension1', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension2', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension3', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension4', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension5', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension6', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension7', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension9', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension11', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension12', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension13', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension14', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension15', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension16', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension17', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension18', 'STRING', 'NULLABLE'),
bigquery.SchemaField('dimension19', 'STRING', 'NULLABLE'),
bigquery.SchemaField('fullPagePath', 'STRING', 'NULLABLE')
]
table_ref = client.dataset(data_set_id).table(table_name)
try:
client.delete_table(table_ref) # API request
except:
pass
uri = GCS_DIR + table_name + '.csv'
load_job = client.load_table_from_uri(
uri,
data_set_ref.table(table_name),
job_config=job_config) # API request
assert load_job.job_type == 'load'
try:
load_job.result() # Waits for table load to complete.
except Exception as e:
for error in load_job.errors:
print("Error detail [message]", error.get("message"))
raise e
assert load_job.state == 'DONE'
if __name__ == '__main__':
main()
from google2pandas import *
FILE_DIR_NAME = 'GCS_DIR_NAME'
GOOGLE_APPLICATION_CREDENTIALS_PATH = './CREDENTIALS.json'
BUCKET_NAME = 'YOUR_BUCKET_NAME'
CSV_PREFIX = "ga_sessions_"
DOMAIN_NAME = "https://www.monotalk.xyz"
def get_table_data_as_dataframe(date, category_name):
# SQLite3をインポート
import sqlite3
import pandas as pd
# db
db_name = "ga.db"
conn = sqlite3.connect(db_name)
c = conn.cursor()
table_name = "ga_sessions_" + date.replace("-", "") + "_" + category_name
try:
# dbをpandasのDataFrame型で読み込む
df = pd.read_sql("select * from " + table_name, conn)
except:
df = pd.DataFrame(columns=["dimension8", "dimension10"]);
return df
def main():
import argparse
parser = argparse.ArgumentParser(description='merge_data_to_gcs')
parser.add_argument('date', help='date')
args = parser.parse_args()
print("Merge data start [date]", args.date.replace("-", ""))
import pandas as pd
df = get_table_data_as_dataframe(args.date, "SESSION")
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GEO_NETWORK0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GEO_NETWORK1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GEO_NETWORK2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GEO_NETWORK3"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS3"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "GOAL_CONVERSIONS4"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PLATFORM_OR_DEVICE0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PLATFORM_OR_DEVICE1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PLATFORM_OR_DEVICE2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "SYSTEM0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "SYSTEM1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PAGE_TRACKING0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PAGE_TRACKING1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PAGE_TRACKING2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "PAGE_TRACKING3"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "EVENT_TRACKING"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "EVENT_TRACKING0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "EVENT_TRACKING1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "EVENT_TRACKING2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "EVENT_TRACKING3"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "CUSTOM_VARIABLES_OR_COLUMNS0"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "CUSTOM_VARIABLES_OR_COLUMNS1"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "CUSTOM_VARIABLES_OR_COLUMNS2"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "CUSTOM_VARIABLES_OR_COLUMNS3"), how='left')
df = pd.merge(df, get_table_data_as_dataframe(args.date, "CUSTOM_VARIABLES_OR_COLUMNS4"), how='left')
df["fullPagePath"] = df['pagePath'].apply(lambda x: DOMAIN_NAME + str(x))
df = df.sample(frac=1)
temp_file_name = 'temp.csv'
df.to_csv(temp_file_name, index=False)
file_name = CSV_PREFIX + args.date.replace("-", "") + ".csv"
import os
from google.cloud import storage
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GOOGLE_APPLICATION_CREDENTIALS_PATH
client = storage.Client()
# https://console.cloud.google.com/storage/browser/[bucket-id]/
bucket = client.get_bucket(BUCKET_NAME)
blob = bucket.blob(FILE_DIR_NAME + file_name)
blob.upload_from_filename(filename=temp_file_name)
if os.path.exists(temp_file_name): os.remove(temp_file_name)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment