Skip to content

Instantly share code, notes, and snippets.

@sounishnath003
Last active June 7, 2022 09:42
Show Gist options
  • Save sounishnath003/a6dd848cff9c07046d7783435c051605 to your computer and use it in GitHub Desktop.
Save sounishnath003/a6dd848cff9c07046d7783435c051605 to your computer and use it in GitHub Desktop.
"""
# _* coding: utf8 *_
filename: excel_to_bigquery_connector.py
@author: sounishnath
createdAt: 2022-06-03 23:47:47
# Modules Required:
- pip install google-cloud-bigquery
- pip install pandas-gbq
"""
# Upload Excel file to BigQuery Table from local file system
from argparse import ArgumentParser
from glob import glob
from tqdm import tqdm
def upload_and_convert_excel_files_into_bigquery(
directory_path, bq_project_name, credential_json_filepath
):
"""
- Grab all the excel files from the directory_path
- Create a BigQuery table from the excel files
- Upload the excel files to the BigQuery table
"""
# Import the necessary packages
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
# Set the environment variable
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_json_filepath
# Initialize the credentials
credentials = service_account.Credentials.from_service_account_file(
credential_json_filepath
)
# Initialize the BigQuery client
client = bigquery.Client(credentials=credentials, project=bq_project_name)
# Grab the excel files from the directory_path
excel_files = glob("{}/*.xlsx".format(directory_path))
# Create a BigQuery table from the excel files
for excel_file in tqdm(excel_files):
# Grab the excel file name
excel_file_name = os.path.basename(excel_file)
# Grab the excel file name without the extension
excel_file_name_without_extension = os.path.splitext(excel_file_name)[0]
# Grab the excel file name without the extension
excel_file_name_without_extension = os.path.splitext(excel_file_name)[0]
# Create the table name
table_name = "{}:samples.{}".format(
bq_project_name, excel_file_name_without_extension
)
df = pd.read_excel(excel_file)
column_names = df.columns.values.tolist()
# Create the table schema
table_schema = [
bigquery.SchemaField(col, "STRING", mode="REQUIRED")
if df[col].dtype == "object"
else bigquery.SchemaField(col, "INT", mode="REQUIRED")
for col in column_names
]
# Create the table
table = bigquery.Table(table_name, schema=table_schema)
table = client.create_table(table)
# Upload the excel files to the BigQuery table
df.to_gbq(
table_name,
project_id=bq_project_name,
if_exists="append",
credentials=credentials,
progress_bar=True,
location='asia-south1', # Mumbai
)
if __name__ == "__main__":
parser=ArgumentParser(description="migrate directory excel files into BQ", usage="excel-to-bigquery --directory='' --bq_project_name=<name> --credentials=<credentials.json>")
parser.add_argument('--directory', type=str)
parser.add_argument('--bq_project_name', type=str)
parser.add_argument('--credentials', type=str)
config=vars(parser.parse_args())
print(config)
# Set the directory path
directory_path = config.get('directory')
# Set the BigQuery project name
bq_project_name = config.get('bq_project_name')
# Set the credential json file path
credential_json_filepath = config.get('credentials')
# Upload the excel files to the BigQuery table
upload_and_convert_excel_files_into_bigquery(
directory_path, bq_project_name, credential_json_filepath
)
# References
# https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment