Created
December 13, 2020 04:04
-
-
Save mengwangk/4463cef7c2dd048d89b96215d21cca58 to your computer and use it in GitHub Desktop.
01_data.loader.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# default_exp data.loader" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#hide\n", | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"import os\n", | |
"import logging\n", | |
"import pandas as pd\n", | |
"import pymysql\n", | |
"import pymysql.cursors\n", | |
"from enum import Enum, auto\n", | |
"from sqlalchemy import create_engine, inspect\n", | |
"\n", | |
"logging.basicConfig(\n", | |
" format=\"%(asctime)s %(levelname)s(): %(message)s\", level=logging.DEBUG\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#hide\n", | |
"from nbdev.showdoc import *" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data Loader\n", | |
"\n", | |
"> Generic data ingestion routines to ingest data from files to databases." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": "true" | |
}, | |
"source": [ | |
"## Decorator" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"def auto_str(cls):\n", | |
" \"Auto generate __str__\"\n", | |
"\n", | |
" def __str__(self):\n", | |
" return \"%s(%s)\" % (\n", | |
" type(self).__name__,\n", | |
" \", \".join(\"%s=%s\" % item for item in vars(self).items()),\n", | |
" )\n", | |
"\n", | |
" cls.__str__ = __str__\n", | |
" return cls" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": "true" | |
}, | |
"source": [ | |
"## Attribute Delegation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class GetAttr:\n", | |
" \"Inherit from this to have all attr accesses in `self._xtra` passed down to `self.default`\"\n", | |
" _default='default'\n", | |
" def _component_attr_filter(self,k):\n", | |
" if k.startswith('__') or k in ('_xtra',self._default): return False\n", | |
" xtra = getattr(self,'_xtra',None)\n", | |
" return xtra is None or k in xtra\n", | |
" def _dir(self): return [k for k in dir(getattr(self,self._default)) if self._component_attr_filter(k)]\n", | |
" def __getattr__(self,k):\n", | |
" if self._component_attr_filter(k):\n", | |
" attr = getattr(self,self._default,None)\n", | |
" if attr is not None: return getattr(attr,k)\n", | |
" raise AttributeError(k)\n", | |
" def __dir__(self): return custom_dir(self,self._dir())\n", | |
" def __setstate__(self,data): self.__dict__.update(data)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`ObjectFactor`, `DbSinkProvider` and `FileSourceProvider` are the factory classes." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class ObjectFactory():\n", | |
" \"Generic object factory\"\n", | |
" def __init__(self):\n", | |
" self._builders = {}\n", | |
"\n", | |
" def register_builder(self, key, builder):\n", | |
" self._builders[key] = builder\n", | |
"\n", | |
" def create(self, key, **kwargs):\n", | |
" builder = self._builders.get(key)\n", | |
" if not builder:\n", | |
" raise ValueError(key)\n", | |
" return builder(**kwargs)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class DbTargetProvider(ObjectFactory):\n", | |
" \"Database provider\"\n", | |
"\n", | |
" def get(self, id, **kwargs):\n", | |
" \"\"\"Create the database interface\"\"\"\n", | |
" return self.create(id, **kwargs)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export \n", | |
"class FileSourceProvider(ObjectFactory):\n", | |
" \"Supported file sources\"\n", | |
"\n", | |
" def get(self, id, **kwargs):\n", | |
" \"\"\"Create the file interface\"\"\"\n", | |
" return self.create(id, **kwargs)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`DatabaseType` and `FileSource` classes." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class DatabaseTarget(Enum):\n", | |
" PostgreSQL = auto()\n", | |
" MySQL = auto()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class FileSource(Enum):\n", | |
" CSV = auto()\n", | |
" Excel = auto()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Target Databases" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### PostgreSQL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class PgSqlDbBuilder:\n", | |
" \"\"\"PostgreSQL database builder.\"\"\"\n", | |
"\n", | |
" def __init__(self):\n", | |
" self._instance = None\n", | |
"\n", | |
" def __call__(self, host, port, db, user, password, **_ignored):\n", | |
" if not self._instance:\n", | |
" self._instance = PgSqlDb(\n", | |
" host,\n", | |
" port,\n", | |
" db,\n", | |
" user,\n", | |
" password\n", | |
" )\n", | |
" return self._instance\n", | |
"\n", | |
"@auto_str\n", | |
"class PgSqlDb:\n", | |
" \"\"\"PostgreSQL database destination.\"\"\"\n", | |
"\n", | |
" def __init__(self, host, port, db, user, password):\n", | |
" self._host = host\n", | |
" self._port = port\n", | |
" self._db = db\n", | |
" self._user = user\n", | |
" self._password = password\n", | |
"\n", | |
" def get_engine(self):\n", | |
" \"\"\"Create and return sqlalchemy engine.\"\"\"\n", | |
" return create_engine(self.get_conn_str())\n", | |
"\n", | |
" def get_conn_str(self):\n", | |
" \"\"\"Return the connection string.\"\"\"\n", | |
" return f\"postgresql+psycopg2://{self._user}:{self._password}@{self._host}:{self._port}/{self._db}\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### MySQL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"class MySqlDbBuilder:\n", | |
" \"\"\"MySQL database builder.\"\"\"\n", | |
"\n", | |
" def __init__(self):\n", | |
" self._instance = None\n", | |
"\n", | |
" def __call__(self, host, port, db, user, password, **_ignored):\n", | |
" if not self._instance:\n", | |
" self._instance = MySqlDb(\n", | |
" host,\n", | |
" port,\n", | |
" db,\n", | |
" user,\n", | |
" password\n", | |
" )\n", | |
" return self._instance\n", | |
"\n", | |
"@auto_str\n", | |
"class MySqlDb:\n", | |
" \"\"\"MySQL database destination.\"\"\"\n", | |
"\n", | |
" def __init__(self, host, port, db, user, password):\n", | |
" self._host = host\n", | |
" self._port = port\n", | |
" self._db = db\n", | |
" self._user = user\n", | |
" self._password = password\n", | |
"\n", | |
" def get_engine(self):\n", | |
" \"\"\"Create and return sqlalchemy engine.\"\"\"\n", | |
" return create_engine(self.get_conn_str())\n", | |
"\n", | |
" def get_conn_str(self):\n", | |
" \"\"\"Return the connection string.\"\"\"\n", | |
" return f\"mysql+pymysql://{self._user}:{self._password}@{self._host}:{self._port}/{self._db}?charset=utf8mb4\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Supported Sources" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"\n", | |
"def create_excel_file_source(file_path, **args):\n", | |
" \"\"\"Create Excel file source.\"\"\"\n", | |
" return ExcelSource(file_path, **args) \n", | |
" \n", | |
"def create_csv_file_source(file_path, **args):\n", | |
" \"\"\"Create CSV file source.\"\"\"\n", | |
" return CSVSource(file_path, **args) \n", | |
" \n", | |
"class ExcelSource:\n", | |
" \"\"\"Excel file source.\"\"\"\n", | |
"\n", | |
" def __init__(self, file_path, **args):\n", | |
" self._file_path = file_path\n", | |
" self._args = args\n", | |
"\n", | |
" def filepath(self):\n", | |
" return self._file_path\n", | |
" \n", | |
" def get_data(self):\n", | |
" \"\"\"Read the file and return a `DataFrame`\"\"\"\n", | |
" return pd.read_excel(self._file_path, engine='openpyxl', **self._args)\n", | |
" \n", | |
"class CSVSource:\n", | |
" \"\"\"CSV file source.\"\"\"\n", | |
"\n", | |
" def __init__(self, file_path, **args):\n", | |
" self._file_path = file_path\n", | |
" self._args = args\n", | |
" \n", | |
" def filepath(self):\n", | |
" return self._file_path\n", | |
"\n", | |
" def get_data(self):\n", | |
" \"\"\"Read the file and return a `DataFrame`\"\"\"\n", | |
" return pd.read_csv(self._file_path, engine=None, **self._args)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Ingestion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"\n", | |
"# Register supported database providers\n", | |
"db_targets = DbTargetProvider()\n", | |
"db_targets.register_builder(DatabaseTarget.PostgreSQL, PgSqlDbBuilder())\n", | |
"db_targets.register_builder(DatabaseTarget.MySQL, MySqlDbBuilder())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"\n", | |
"# Register supported file types\n", | |
"file_sources = FileSourceProvider()\n", | |
"file_sources.register_builder(FileSource.Excel, create_excel_file_source)\n", | |
"file_sources.register_builder(FileSource.CSV, create_csv_file_source)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#export\n", | |
"def ingest(file_source, target_db, table_name, *, if_exists='append', method='multi', schema=None):\n", | |
" \"\"\"Ingest the file into the database table.\"\"\"\n", | |
" \n", | |
" # Create db engine\n", | |
" engine = target_db.get_engine()\n", | |
"\n", | |
" # Inspect the target table schema\n", | |
" inspector = inspect(engine)\n", | |
" dtypes = {}\n", | |
" for column in inspector.get_columns(table_name, schema=schema):\n", | |
" dtypes[column[\"name\"]] = column[\"type\"]\n", | |
" logging.info(dtypes)\n", | |
"\n", | |
" # Load the excel into database\n", | |
" df = file_source.get_data()\n", | |
" df.to_sql(\n", | |
" table_name, engine, if_exists=if_exists, method=method, chunksize=500, index=False, dtype=dtypes\n", | |
" )\n", | |
"\n", | |
" # TODO - Validation\n", | |
" print(f\"\\nTotal records in {file_source.filepath()} - {len(df)}\")\n", | |
" for c in df.columns:\n", | |
" print(f\"{c} - {df[c].nunique()}\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>user_id</th>\n", | |
" <th>username</th>\n", | |
" <th>password</th>\n", | |
" <th>email</th>\n", | |
" <th>created_on</th>\n", | |
" <th>last_login</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>user1</td>\n", | |
" <td>user11</td>\n", | |
" <td>user1@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>user2</td>\n", | |
" <td>user22</td>\n", | |
" <td>user2@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>user3</td>\n", | |
" <td>user33</td>\n", | |
" <td>user3@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>user4</td>\n", | |
" <td>user44</td>\n", | |
" <td>user4@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>user5</td>\n", | |
" <td>user55</td>\n", | |
" <td>user5@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>96</td>\n", | |
" <td>user96</td>\n", | |
" <td>user9696</td>\n", | |
" <td>user96@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>97</td>\n", | |
" <td>user97</td>\n", | |
" <td>user9797</td>\n", | |
" <td>user97@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>98</td>\n", | |
" <td>user98</td>\n", | |
" <td>user9898</td>\n", | |
" <td>user98@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>99</td>\n", | |
" <td>user99</td>\n", | |
" <td>user9999</td>\n", | |
" <td>user99@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>100</td>\n", | |
" <td>user100</td>\n", | |
" <td>user100100</td>\n", | |
" <td>user100@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00.000003</td>\n", | |
" <td>2020-12-07 13:00:00.000003</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100 rows × 6 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" user_id username password email created_on \\\n", | |
"0 1 user1 user11 user1@abc.com 2020-12-06 04:00:00.000003 \n", | |
"1 2 user2 user22 user2@abc.com 2020-12-06 04:00:00.000003 \n", | |
"2 3 user3 user33 user3@abc.com 2020-12-06 04:00:00.000003 \n", | |
"3 4 user4 user44 user4@abc.com 2020-12-06 04:00:00.000003 \n", | |
"4 5 user5 user55 user5@abc.com 2020-12-06 04:00:00.000003 \n", | |
".. ... ... ... ... ... \n", | |
"95 96 user96 user9696 user96@abc.com 2020-12-06 04:00:00.000003 \n", | |
"96 97 user97 user9797 user97@abc.com 2020-12-06 04:00:00.000003 \n", | |
"97 98 user98 user9898 user98@abc.com 2020-12-06 04:00:00.000003 \n", | |
"98 99 user99 user9999 user99@abc.com 2020-12-06 04:00:00.000003 \n", | |
"99 100 user100 user100100 user100@abc.com 2020-12-06 04:00:00.000003 \n", | |
"\n", | |
" last_login \n", | |
"0 2020-12-07 13:00:00.000003 \n", | |
"1 2020-12-07 13:00:00.000003 \n", | |
"2 2020-12-07 13:00:00.000003 \n", | |
"3 2020-12-07 13:00:00.000003 \n", | |
"4 2020-12-07 13:00:00.000003 \n", | |
".. ... \n", | |
"95 2020-12-07 13:00:00.000003 \n", | |
"96 2020-12-07 13:00:00.000003 \n", | |
"97 2020-12-07 13:00:00.000003 \n", | |
"98 2020-12-07 13:00:00.000003 \n", | |
"99 2020-12-07 13:00:00.000003 \n", | |
"\n", | |
"[100 rows x 6 columns]" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create a Excel file source\n", | |
"excel_source = file_sources.get(FileSource.Excel, file_path=\"data/accounts.xlsx\")\n", | |
"excel_source.get_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>user_id</th>\n", | |
" <th>username</th>\n", | |
" <th>password</th>\n", | |
" <th>email</th>\n", | |
" <th>created_on</th>\n", | |
" <th>last_login</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>user1</td>\n", | |
" <td>user11</td>\n", | |
" <td>user1@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>user2</td>\n", | |
" <td>user22</td>\n", | |
" <td>user2@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>user3</td>\n", | |
" <td>user33</td>\n", | |
" <td>user3@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>user4</td>\n", | |
" <td>user44</td>\n", | |
" <td>user4@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>user5</td>\n", | |
" <td>user55</td>\n", | |
" <td>user5@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>96</td>\n", | |
" <td>user96</td>\n", | |
" <td>user9696</td>\n", | |
" <td>user96@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>97</td>\n", | |
" <td>user97</td>\n", | |
" <td>user9797</td>\n", | |
" <td>user97@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>98</td>\n", | |
" <td>user98</td>\n", | |
" <td>user9898</td>\n", | |
" <td>user98@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>99</td>\n", | |
" <td>user99</td>\n", | |
" <td>user9999</td>\n", | |
" <td>user99@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>100</td>\n", | |
" <td>user100</td>\n", | |
" <td>user100100</td>\n", | |
" <td>user100@abc.com</td>\n", | |
" <td>2020-12-06 04:00:00</td>\n", | |
" <td>2020-12-07 13:00:00</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100 rows × 6 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" user_id username password email created_on \\\n", | |
"0 1 user1 user11 user1@abc.com 2020-12-06 04:00:00 \n", | |
"1 2 user2 user22 user2@abc.com 2020-12-06 04:00:00 \n", | |
"2 3 user3 user33 user3@abc.com 2020-12-06 04:00:00 \n", | |
"3 4 user4 user44 user4@abc.com 2020-12-06 04:00:00 \n", | |
"4 5 user5 user55 user5@abc.com 2020-12-06 04:00:00 \n", | |
".. ... ... ... ... ... \n", | |
"95 96 user96 user9696 user96@abc.com 2020-12-06 04:00:00 \n", | |
"96 97 user97 user9797 user97@abc.com 2020-12-06 04:00:00 \n", | |
"97 98 user98 user9898 user98@abc.com 2020-12-06 04:00:00 \n", | |
"98 99 user99 user9999 user99@abc.com 2020-12-06 04:00:00 \n", | |
"99 100 user100 user100100 user100@abc.com 2020-12-06 04:00:00 \n", | |
"\n", | |
" last_login \n", | |
"0 2020-12-07 13:00:00 \n", | |
"1 2020-12-07 13:00:00 \n", | |
"2 2020-12-07 13:00:00 \n", | |
"3 2020-12-07 13:00:00 \n", | |
"4 2020-12-07 13:00:00 \n", | |
".. ... \n", | |
"95 2020-12-07 13:00:00 \n", | |
"96 2020-12-07 13:00:00 \n", | |
"97 2020-12-07 13:00:00 \n", | |
"98 2020-12-07 13:00:00 \n", | |
"99 2020-12-07 13:00:00 \n", | |
"\n", | |
"[100 rows x 6 columns]" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create a CSV file source\n", | |
"csv_source = file_sources.get(FileSource.CSV, file_path=\"data/accounts.csv\")\n", | |
"csv_source.get_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'postgresql+psycopg2://user1:userpwd@localhost:5432/testdb'" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create PostgreSQL target\n", | |
"config = {\n", | |
" 'host': 'localhost',\n", | |
" 'port': 5432,\n", | |
" 'db': 'testdb',\n", | |
" 'user': 'user1',\n", | |
" 'password': 'userpwd'\n", | |
"}\n", | |
"pgsql_target = db_targets.get(DatabaseTarget.PostgreSQL, **config)\n", | |
"pgsql_target.get_conn_str()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-12-12 21:33:38,062 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"Total records in data/accounts.xlsx - 100\n", | |
"user_id - 100\n", | |
"username - 100\n", | |
"password - 100\n", | |
"email - 100\n", | |
"created_on - 1\n", | |
"last_login - 1\n" | |
] | |
} | |
], | |
"source": [ | |
"# Ingest to PostgreSQL\n", | |
"ingest(excel_source, pgsql_target, 'accounts')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-12-12 21:33:38,214 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"Total records in data/accounts.xlsx - 100\n", | |
"user_id - 100\n", | |
"username - 100\n", | |
"password - 100\n", | |
"email - 100\n", | |
"created_on - 1\n", | |
"last_login - 1\n" | |
] | |
} | |
], | |
"source": [ | |
"# Ingest and replace\n", | |
"ingest(excel_source, pgsql_target, 'accounts', if_exists='replace')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'mysql+pymysql://user1:userpwd@localhost:3306/testdb?charset=utf8mb4'" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create MySQL target\n", | |
"config = {\n", | |
" 'host': 'localhost',\n", | |
" 'port': 3306,\n", | |
" 'db': 'testdb',\n", | |
" 'user': 'user1',\n", | |
" 'password': 'userpwd'\n", | |
"}\n", | |
"mysql_target = db_targets.get(DatabaseTarget.MySQL, **config)\n", | |
"mysql_target.get_conn_str()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-12-12 21:33:38,710 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"Total records in data/accounts.xlsx - 100\n", | |
"user_id - 100\n", | |
"username - 100\n", | |
"password - 100\n", | |
"email - 100\n", | |
"created_on - 1\n", | |
"last_login - 1\n" | |
] | |
} | |
], | |
"source": [ | |
"# Ingest to MySQL\n", | |
"ingest(excel_source, mysql_target, 'accounts')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2020-12-12 21:33:39,131 INFO(): {'user_id': INTEGER(), 'username': VARCHAR(length=50), 'password': VARCHAR(length=50), 'email': VARCHAR(length=255), 'created_on': TIMESTAMP(), 'last_login': TIMESTAMP()}\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"Total records in data/accounts.xlsx - 100\n", | |
"user_id - 100\n", | |
"username - 100\n", | |
"password - 100\n", | |
"email - 100\n", | |
"created_on - 1\n", | |
"last_login - 1\n" | |
] | |
} | |
], | |
"source": [ | |
"# Ingest and replace \n", | |
"ingest(excel_source, mysql_target, 'accounts', if_exists='replace')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Export -" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Converted 01_data.loader.ipynb.\n", | |
"Converted index.ipynb.\n" | |
] | |
} | |
], | |
"source": [ | |
"#hide\n", | |
"from nbdev.export import notebook2script\n", | |
"notebook2script()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment