Skip to content

Instantly share code, notes, and snippets.

@mengwangk
Created December 13, 2020 04:04
Show Gist options
  • Save mengwangk/4463cef7c2dd048d89b96215d21cca58 to your computer and use it in GitHub Desktop.
Save mengwangk/4463cef7c2dd048d89b96215d21cca58 to your computer and use it in GitHub Desktop.
01_data.loader.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# default_exp data.loader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#hide\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"import os\n",
"import logging\n",
"import pandas as pd\n",
"import pymysql\n",
"import pymysql.cursors\n",
"from enum import Enum, auto\n",
"from sqlalchemy import create_engine, inspect\n",
"\n",
"logging.basicConfig(\n",
" format=\"%(asctime)s %(levelname)s(): %(message)s\", level=logging.DEBUG\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#hide\n",
"from nbdev.showdoc import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Loader\n",
"\n",
"> Generic data ingestion routines to ingest data from files to databases."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": "true"
},
"source": [
"## Decorator"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def auto_str(cls):\n",
" \"Auto generate __str__\"\n",
"\n",
" def __str__(self):\n",
" return \"%s(%s)\" % (\n",
" type(self).__name__,\n",
" \", \".join(\"%s=%s\" % item for item in vars(self).items()),\n",
" )\n",
"\n",
" cls.__str__ = __str__\n",
" return cls"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": "true"
},
"source": [
"## Attribute Delegation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class GetAttr:\n",
" \"Inherit from this to have all attr accesses in `self._xtra` passed down to `self.default`\"\n",
" _default='default'\n",
" def _component_attr_filter(self,k):\n",
" if k.startswith('__') or k in ('_xtra',self._default): return False\n",
" xtra = getattr(self,'_xtra',None)\n",
" return xtra is None or k in xtra\n",
" def _dir(self): return [k for k in dir(getattr(self,self._default)) if self._component_attr_filter(k)]\n",
" def __getattr__(self,k):\n",
" if self._component_attr_filter(k):\n",
" attr = getattr(self,self._default,None)\n",
" if attr is not None: return getattr(attr,k)\n",
" raise AttributeError(k)\n",
" def __dir__(self): return custom_dir(self,self._dir())\n",
" def __setstate__(self,data): self.__dict__.update(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ObjectFactor`, `DbSinkProvider` and `FileSourceProvider` are the factory classes."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class ObjectFactory():\n",
" \"Generic object factory\"\n",
" def __init__(self):\n",
" self._builders = {}\n",
"\n",
" def register_builder(self, key, builder):\n",
" self._builders[key] = builder\n",
"\n",
" def create(self, key, **kwargs):\n",
" builder = self._builders.get(key)\n",
" if not builder:\n",
" raise ValueError(key)\n",
" return builder(**kwargs)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class DbTargetProvider(ObjectFactory):\n",
" \"Database provider\"\n",
"\n",
" def get(self, id, **kwargs):\n",
" \"\"\"Create the database interface\"\"\"\n",
" return self.create(id, **kwargs)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#export \n",
"class FileSourceProvider(ObjectFactory):\n",
" \"Supported file sources\"\n",
"\n",
" def get(self, id, **kwargs):\n",
" \"\"\"Create the file interface\"\"\"\n",
" return self.create(id, **kwargs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`DatabaseType` and `FileSource` classes."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class DatabaseTarget(Enum):\n",
" PostgreSQL = auto()\n",
" MySQL = auto()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class FileSource(Enum):\n",
" CSV = auto()\n",
" Excel = auto()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Target Databases"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PostgreSQL"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class PgSqlDbBuilder:\n",
" \"\"\"PostgreSQL database builder.\"\"\"\n",
"\n",
" def __init__(self):\n",
" self._instance = None\n",
"\n",
" def __call__(self, host, port, db, user, password, **_ignored):\n",
" if not self._instance:\n",
" self._instance = PgSqlDb(\n",
" host,\n",
" port,\n",
" db,\n",
" user,\n",
" password\n",
" )\n",
" return self._instance\n",
"\n",
"@auto_str\n",
"class PgSqlDb:\n",
" \"\"\"PostgreSQL database destination.\"\"\"\n",
"\n",
" def __init__(self, host, port, db, user, password):\n",
" self._host = host\n",
" self._port = port\n",
" self._db = db\n",
" self._user = user\n",
" self._password = password\n",
"\n",
" def get_engine(self):\n",
" \"\"\"Create and return sqlalchemy engine.\"\"\"\n",
" return create_engine(self.get_conn_str())\n",
"\n",
" def get_conn_str(self):\n",
" \"\"\"Return the connection string.\"\"\"\n",
" return f\"postgresql+psycopg2://{self._user}:{self._password}@{self._host}:{self._port}/{self._db}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### MySQL"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"class MySqlDbBuilder:\n",
" \"\"\"MySQL database builder.\"\"\"\n",
"\n",
" def __init__(self):\n",
" self._instance = None\n",
"\n",
" def __call__(self, host, port, db, user, password, **_ignored):\n",
" if not self._instance:\n",
" self._instance = MySqlDb(\n",
" host,\n",
" port,\n",
" db,\n",
" user,\n",
" password\n",
" )\n",
" return self._instance\n",
"\n",
"@auto_str\n",
"class MySqlDb:\n",
" \"\"\"MySQL database destination.\"\"\"\n",
"\n",
" def __init__(self, host, port, db, user, password):\n",
" self._host = host\n",
" self._port = port\n",
" self._db = db\n",
" self._user = user\n",
" self._password = password\n",
"\n",
" def get_engine(self):\n",
" \"\"\"Create and return sqlalchemy engine.\"\"\"\n",
" return create_engine(self.get_conn_str())\n",
"\n",
" def get_conn_str(self):\n",
" \"\"\"Return the connection string.\"\"\"\n",
" return f\"mysql+pymysql://{self._user}:{self._password}@{self._host}:{self._port}/{self._db}?charset=utf8mb4\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Supported Sources"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"\n",
"def create_excel_file_source(file_path, **args):\n",
" \"\"\"Create Excel file source.\"\"\"\n",
" return ExcelSource(file_path, **args) \n",
" \n",
"def create_csv_file_source(file_path, **args):\n",
" \"\"\"Create CSV file source.\"\"\"\n",
" return CSVSource(file_path, **args) \n",
" \n",
"class ExcelSource:\n",
" \"\"\"Excel file source.\"\"\"\n",
"\n",
" def __init__(self, file_path, **args):\n",
" self._file_path = file_path\n",
" self._args = args\n",
"\n",
" def filepath(self):\n",
" return self._file_path\n",
" \n",
" def get_data(self):\n",
" \"\"\"Read the file and return a `DataFrame`\"\"\"\n",
" return pd.read_excel(self._file_path, engine='openpyxl', **self._args)\n",
" \n",
"class CSVSource:\n",
" \"\"\"CSV file source.\"\"\"\n",
"\n",
" def __init__(self, file_path, **args):\n",
" self._file_path = file_path\n",
" self._args = args\n",
" \n",
" def filepath(self):\n",
" return self._file_path\n",
"\n",
" def get_data(self):\n",
" \"\"\"Read the file and return a `DataFrame`\"\"\"\n",
" return pd.read_csv(self._file_path, engine=None, **self._args)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ingestion"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"\n",
"# Register supported database providers\n",
"db_targets = DbTargetProvider()\n",
"db_targets.register_builder(DatabaseTarget.PostgreSQL, PgSqlDbBuilder())\n",
"db_targets.register_builder(DatabaseTarget.MySQL, MySqlDbBuilder())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"\n",
"# Register supported file types\n",
"file_sources = FileSourceProvider()\n",
"file_sources.register_builder(FileSource.Excel, create_excel_file_source)\n",
"file_sources.register_builder(FileSource.CSV, create_csv_file_source)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def ingest(file_source, target_db, table_name, *, if_exists='append', method='multi', schema=None):\n",
" \"\"\"Ingest the file into the database table.\"\"\"\n",
" \n",
" # Create db engine\n",
" engine = target_db.get_engine()\n",
"\n",
" # Inspect the target table schema\n",
" inspector = inspect(engine)\n",
" dtypes = {}\n",
" for column in inspector.get_columns(table_name, schema=schema):\n",
" dtypes[column[\"name\"]] = column[\"type\"]\n",
" logging.info(dtypes)\n",
"\n",
" # Load the excel into database\n",
" df = file_source.get_data()\n",
" df.to_sql(\n",
" table_name, engine, if_exists=if_exists, method=method, chunksize=500, index=False, dtype=dtypes\n",
" )\n",
"\n",
" # TODO - Validation\n",
" print(f\"\\nTotal records in {file_source.filepath()} - {len(df)}\")\n",
" for c in df.columns:\n",
" print(f\"{c} - {df[c].nunique()}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>username</th>\n",
" <th>password</th>\n",
" <th>email</th>\n",
" <th>created_on</th>\n",
" <th>last_login</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>user1</td>\n",
" <td>user11</td>\n",
" <td>user1@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>user2</td>\n",
" <td>user22</td>\n",
" <td>user2@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>user3</td>\n",
" <td>user33</td>\n",
" <td>user3@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>user4</td>\n",
" <td>user44</td>\n",
" <td>user4@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>user5</td>\n",
" <td>user55</td>\n",
" <td>user5@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>96</td>\n",
" <td>user96</td>\n",
" <td>user9696</td>\n",
" <td>user96@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>97</td>\n",
" <td>user97</td>\n",
" <td>user9797</td>\n",
" <td>user97@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>98</td>\n",
" <td>user98</td>\n",
" <td>user9898</td>\n",
" <td>user98@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>99</td>\n",
" <td>user99</td>\n",
" <td>user9999</td>\n",
" <td>user99@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>100</td>\n",
" <td>user100</td>\n",
" <td>user100100</td>\n",
" <td>user100@abc.com</td>\n",
" <td>2020-12-06 04:00:00.000003</td>\n",
" <td>2020-12-07 13:00:00.000003</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id username password email created_on \\\n",
"0 1 user1 user11 user1@abc.com 2020-12-06 04:00:00.000003 \n",
"1 2 user2 user22 user2@abc.com 2020-12-06 04:00:00.000003 \n",
"2 3 user3 user33 user3@abc.com 2020-12-06 04:00:00.000003 \n",
"3 4 user4 user44 user4@abc.com 2020-12-06 04:00:00.000003 \n",
"4 5 user5 user55 user5@abc.com 2020-12-06 04:00:00.000003 \n",
".. ... ... ... ... ... \n",
"95 96 user96 user9696 user96@abc.com 2020-12-06 04:00:00.000003 \n",
"96 97 user97 user9797 user97@abc.com 2020-12-06 04:00:00.000003 \n",
"97 98 user98 user9898 user98@abc.com 2020-12-06 04:00:00.000003 \n",
"98 99 user99 user9999 user99@abc.com 2020-12-06 04:00:00.000003 \n",
"99 100 user100 user100100 user100@abc.com 2020-12-06 04:00:00.000003 \n",
"\n",
" last_login \n",
"0 2020-12-07 13:00:00.000003 \n",
"1 2020-12-07 13:00:00.000003 \n",
"2 2020-12-07 13:00:00.000003 \n",
"3 2020-12-07 13:00:00.000003 \n",
"4 2020-12-07 13:00:00.000003 \n",
".. ... \n",
"95 2020-12-07 13:00:00.000003 \n",
"96 2020-12-07 13:00:00.000003 \n",
"97 2020-12-07 13:00:00.000003 \n",
"98 2020-12-07 13:00:00.000003 \n",
"99 2020-12-07 13:00:00.000003 \n",
"\n",
"[100 rows x 6 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a Excel file source\n",
"excel_source = file_sources.get(FileSource.Excel, file_path=\"data/accounts.xlsx\")\n",
"excel_source.get_data()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>username</th>\n",
" <th>password</th>\n",
" <th>email</th>\n",
" <th>created_on</th>\n",
" <th>last_login</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>user1</td>\n",
" <td>user11</td>\n",
" <td>user1@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>user2</td>\n",
" <td>user22</td>\n",
" <td>user2@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>user3</td>\n",
" <td>user33</td>\n",
" <td>user3@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>user4</td>\n",
" <td>user44</td>\n",
" <td>user4@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>user5</td>\n",
" <td>user55</td>\n",
" <td>user5@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>96</td>\n",
" <td>user96</td>\n",
" <td>user9696</td>\n",
" <td>user96@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>97</td>\n",
" <td>user97</td>\n",
" <td>user9797</td>\n",
" <td>user97@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>98</td>\n",
" <td>user98</td>\n",
" <td>user9898</td>\n",
" <td>user98@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>99</td>\n",
" <td>user99</td>\n",
" <td>user9999</td>\n",
" <td>user99@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>100</td>\n",
" <td>user100</td>\n",
" <td>user100100</td>\n",
" <td>user100@abc.com</td>\n",
" <td>2020-12-06 04:00:00</td>\n",
" <td>2020-12-07 13:00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id username password email created_on \\\n",
"0 1 user1 user11 user1@abc.com 2020-12-06 04:00:00 \n",
"1 2 user2 user22 user2@abc.com 2020-12-06 04:00:00 \n",
"2 3 user3 user33 user3@abc.com 2020-12-06 04:00:00 \n",
"3 4 user4 user44 user4@abc.com 2020-12-06 04:00:00 \n",
"4 5 user5 user55 user5@abc.com 2020-12-06 04:00:00 \n",
".. ... ... ... ... ... \n",
"95 96 user96 user9696 user96@abc.com 2020-12-06 04:00:00 \n",
"96 97 user97 user9797 user97@abc.com 2020-12-06 04:00:00 \n",
"97 98 user98 user9898 user98@abc.com 2020-12-06 04:00:00 \n",
"98 99 user99 user9999 user99@abc.com 2020-12-06 04:00:00 \n",
"99 100 user100 user100100 user100@abc.com 2020-12-06 04:00:00 \n",
"\n",
" last_login \n",
"0 2020-12-07 13:00:00 \n",
"1 2020-12-07 13:00:00 \n",
"2 2020-12-07 13:00:00 \n",
"3 2020-12-07 13:00:00 \n",
"4 2020-12-07 13:00:00 \n",
".. ... \n",
"95 2020-12-07 13:00:00 \n",
"96 2020-12-07 13:00:00 \n",
"97 2020-12-07 13:00:00 \n",
"98 2020-12-07 13:00:00 \n",
"99 2020-12-07 13:00:00 \n",
"\n",
"[100 rows x 6 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a CSV file source\n",
"csv_source = file_sources.get(FileSource.CSV, file_path=\"data/accounts.csv\")\n",
"csv_source.get_data()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'postgresql+psycopg2://user1:userpwd@localhost:5432/testdb'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create PostgreSQL target\n",
"config = {\n",
" 'host': 'localhost',\n",
" 'port': 5432,\n",
" 'db': 'testdb',\n",
" 'user': 'user1',\n",
" 'password': 'userpwd'\n",
"}\n",
"pgsql_target = db_targets.get(DatabaseTarget.PostgreSQL, **config)\n",
"pgsql_target.get_conn_str()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-12-12 21:33:38,062 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Total records in data/accounts.xlsx - 100\n",
"user_id - 100\n",
"username - 100\n",
"password - 100\n",
"email - 100\n",
"created_on - 1\n",
"last_login - 1\n"
]
}
],
"source": [
"# Ingest to PostgreSQL\n",
"ingest(excel_source, pgsql_target, 'accounts')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-12-12 21:33:38,214 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Total records in data/accounts.xlsx - 100\n",
"user_id - 100\n",
"username - 100\n",
"password - 100\n",
"email - 100\n",
"created_on - 1\n",
"last_login - 1\n"
]
}
],
"source": [
"# Ingest and replace\n",
"ingest(excel_source, pgsql_target, 'accounts', if_exists='replace')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'mysql+pymysql://user1:userpwd@localhost:3306/testdb?charset=utf8mb4'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create MySQL target\n",
"config = {\n",
" 'host': 'localhost',\n",
" 'port': 3306,\n",
" 'db': 'testdb',\n",
" 'user': 'user1',\n",
" 'password': 'userpwd'\n",
"}\n",
"mysql_target = db_targets.get(DatabaseTarget.MySQL, **config)\n",
"mysql_target.get_conn_str()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-12-12 21:33:38,710 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Total records in data/accounts.xlsx - 100\n",
"user_id - 100\n",
"username - 100\n",
"password - 100\n",
"email - 100\n",
"created_on - 1\n",
"last_login - 1\n"
]
}
],
"source": [
"# Ingest to MySQL\n",
"ingest(excel_source, mysql_target, 'accounts')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-12-12 21:33:39,131 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Total records in data/accounts.xlsx - 100\n",
"user_id - 100\n",
"username - 100\n",
"password - 100\n",
"email - 100\n",
"created_on - 1\n",
"last_login - 1\n"
]
}
],
"source": [
"# Ingest and replace \n",
"ingest(excel_source, mysql_target, 'accounts', if_exists='replace')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Export -"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converted 01_data.loader.ipynb.\n",
"Converted index.ipynb.\n"
]
}
],
"source": [
"#hide\n",
"from nbdev.export import notebook2script\n",
"notebook2script()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment