Python scripts that load Google Search Console data into BigQuery
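The pipeline consists of two scripts. The first, gsc_to_gcs, pulls one day of Search Console data for a property, writes it to a CSV file, and uploads that file to a Google Cloud Storage bucket. The second, load_data_to_bigquery_from_gcs, loads the CSV from GCS into a BigQuery table named after the date (for example gsc_20221101). Both take a single date argument in YYYY-MM-DD format, and the placeholder constants at the top of each script (property name, bucket, dataset ID, credential paths) must be replaced with real values before running.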
# gsc_to_gcs: export one day of Google Search Console data as CSV and
# upload it to Google Cloud Storage.
import argparse
import os
import urllib.parse

import searchconsole
from google.cloud import storage

GSC_PROPERTY_NAME = 'gsc_property_name'  # Search Console property to export
FILE_DIR_NAME = 'gsc_dir_name'  # key prefix inside the bucket (end it with '/')
BUCKET_NAME = 'gsc_bucket_name'
GOOGLE_APPLICATION_CREDENTIALS_PATH = './credentials.json'  # GCS service account key
CSV_PREFIX = "gsc_"
GSC_SERVICE_ACCOUNT_FILE = './gsc_client.json'  # Search Console service account key


def main():
    parser = argparse.ArgumentParser(description='gsc_to_gcs')
    parser.add_argument('date', help='target date in YYYY-MM-DD format')
    args = parser.parse_args()

    # e.g. 2022-11-01 -> gsc_20221101.csv
    file_name = CSV_PREFIX + args.date.replace("-", "") + ".csv"
    print(file_name, ":create start")

    # Fetch one day of data, broken down by query, date, country,
    # device and page.
    account = searchconsole.authenticate(service_account=GSC_SERVICE_ACCOUNT_FILE)
    web_property = account[GSC_PROPERTY_NAME]
    report = web_property.query.range(start=args.date, stop=args.date) \
        .dimension('query', 'date', 'country', 'device', 'page').get()
    df = report.to_dataframe()

    # Decode percent-encoded page URLs for readability.
    df['page'] = df['page'].apply(lambda x: urllib.parse.unquote(x))

    # Write the report to a local CSV file and upload it to the bucket.
    temp_file_name = 'temp.csv'
    df.to_csv(temp_file_name, index=False)
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GOOGLE_APPLICATION_CREDENTIALS_PATH
    client = storage.Client()
    # https://console.cloud.google.com/storage/browser/[bucket-id]/
    bucket = client.get_bucket(BUCKET_NAME)
    blob = bucket.blob(FILE_DIR_NAME + file_name)
    blob.upload_from_filename(filename=temp_file_name)

    # Delete the temporary file.
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)


if __name__ == '__main__':
    main()
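A run for a single day looks like python gsc_to_gcs.py 2022-11-01 (the .py file name is an assumption; the gist only gives the description gsc_to_gcs). Before running the loader, the upload can be confirmed with a minimal sketch like the one below, reusing the same placeholder bucket and prefix; the date 2022-11-01 is only an example.

# check_upload: minimal verification sketch (not part of the gist).
from google.cloud import storage

BUCKET_NAME = 'gsc_bucket_name'  # same placeholder as in gsc_to_gcs
FILE_DIR_NAME = 'gsc_dir_name'   # same placeholder as in gsc_to_gcs

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
# Prints True once gsc_20221101.csv has been uploaded.
print(bucket.blob(FILE_DIR_NAME + 'gsc_20221101.csv').exists())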
# load_data_to_bigquery_from_gcs: load the CSV exported by gsc_to_gcs
# from Google Cloud Storage into a date-suffixed BigQuery table.
import argparse
import os

from google.cloud import bigquery
from google.cloud.exceptions import NotFound

GCS_DIR = 'gs://gcs_dir'  # must point at BUCKET_NAME/FILE_DIR_NAME above (end it with '/')
DATA_SET_ID = 'BigQuery dataset ID'
GOOGLE_APPLICATION_CREDENTIALS_PATH = './credentials.json'
TABLE_PREFIX = "gsc_"


def main():
    parser = argparse.ArgumentParser(description='load_data_to_bigquery_from_gcs')
    parser.add_argument('date', help='target date in YYYY-MM-DD format')
    args = parser.parse_args()

    # e.g. 2022-11-01 -> table gsc_20221101, loaded from gsc_20221101.csv
    table_name = TABLE_PREFIX + args.date.replace("-", "")
    print(table_name, ":create start")

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GOOGLE_APPLICATION_CREDENTIALS_PATH
    client = bigquery.Client()
    data_set_ref = client.dataset(DATA_SET_ID)

    # Let BigQuery infer the schema from the CSV and skip the header row.
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV

    # Drop any existing table for this date so reruns start clean.
    table_ref = data_set_ref.table(table_name)
    try:
        client.delete_table(table_ref)  # API request
    except NotFound:
        pass

    uri = GCS_DIR + table_name + '.csv'
    load_job = client.load_table_from_uri(
        uri,
        table_ref,
        job_config=job_config)  # API request
    assert load_job.job_type == 'load'

    load_job.result()  # Waits for table load to complete.
    assert load_job.state == 'DONE'


if __name__ == '__main__':
    main()
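Running python load_data_to_bigquery_from_gcs.py 2022-11-01 (again, the file name is an assumption) recreates the table gsc_20221101 from the CSV in GCS. A quick way to verify the load is to count the rows in the new table; a sketch, with the dataset and table names as placeholders to replace:

# count_rows: minimal verification sketch (not part of the gist).
from google.cloud import bigquery

DATA_SET_ID = 'your_dataset'  # placeholder: the dataset used by the loader
TABLE_NAME = 'gsc_20221101'   # table created by the loader for 2022-11-01

client = bigquery.Client()
query = 'SELECT COUNT(*) AS row_count FROM `{}.{}`'.format(DATA_SET_ID, TABLE_NAME)
for row in client.query(query).result():
    print(row.row_count)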