Created
October 30, 2023 21:28
-
-
Save dataders/5f532ad24348e577f40ed0c9017fd47f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'2.1.1'" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"pd.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ID int64[pyarrow]\n", | |
"FIRST_NAME string[pyarrow]\n", | |
"LAST_NAME string[pyarrow]\n", | |
"dtype: object\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>FIRST_NAME</th>\n", | |
" <th>LAST_NAME</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Michael</td>\n", | |
" <td>P.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Shawn</td>\n", | |
" <td>M.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Kathleen</td>\n", | |
" <td>P.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>Jimmy</td>\n", | |
" <td>C.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>Katherine</td>\n", | |
" <td>R.</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID FIRST_NAME LAST_NAME\n", | |
"0 1 Michael P.\n", | |
"1 2 Shawn M.\n", | |
"2 3 Kathleen P.\n", | |
"3 4 Jimmy C.\n", | |
"4 5 Katherine R." | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_custo = pd.read_csv('~/Downloads/jaffle_shop_customers.csv',\n", | |
" dtype_backend='pyarrow'\n", | |
" )\n", | |
"print(df_custo.dtypes)\n", | |
"df_custo.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_custo.to_parquet('jaffle_shop_customers.parquet',\n", | |
" index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## payments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ID int64[pyarrow]\n", | |
"ORDERID int64[pyarrow]\n", | |
"PAYMENTMETHOD string[pyarrow]\n", | |
"STATUS string[pyarrow]\n", | |
"AMOUNT int64[pyarrow]\n", | |
"CREATED datetime64[ns]\n", | |
"dtype: object\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>ORDERID</th>\n", | |
" <th>PAYMENTMETHOD</th>\n", | |
" <th>STATUS</th>\n", | |
" <th>AMOUNT</th>\n", | |
" <th>CREATED</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>credit_card</td>\n", | |
" <td>success</td>\n", | |
" <td>1000</td>\n", | |
" <td>2018-01-01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>credit_card</td>\n", | |
" <td>success</td>\n", | |
" <td>2000</td>\n", | |
" <td>2018-01-02</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>coupon</td>\n", | |
" <td>success</td>\n", | |
" <td>100</td>\n", | |
" <td>2018-01-04</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>4</td>\n", | |
" <td>coupon</td>\n", | |
" <td>success</td>\n", | |
" <td>2500</td>\n", | |
" <td>2018-01-05</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>bank_transfer</td>\n", | |
" <td>fail</td>\n", | |
" <td>1700</td>\n", | |
" <td>2018-01-05</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID ORDERID PAYMENTMETHOD STATUS AMOUNT CREATED\n", | |
"0 1 1 credit_card success 1000 2018-01-01\n", | |
"1 2 2 credit_card success 2000 2018-01-02\n", | |
"2 3 3 coupon success 100 2018-01-04\n", | |
"3 4 4 coupon success 2500 2018-01-05\n", | |
"4 5 5 bank_transfer fail 1700 2018-01-05" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_pay = pd.read_csv('~/Downloads/stripe_payments.csv',\n", | |
" parse_dates=['CREATED'],\n", | |
" dtype_backend='pyarrow'\n", | |
" )\n", | |
"print(df_pay.dtypes)\n", | |
"df_pay.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_pay.to_parquet('stripe_payments.parquet',\n", | |
" engine='pyarrow',\n", | |
" use_deprecated_int96_timestamps=True,\n", | |
" # allow_truncated_timestamps=True,\n", | |
" index=False\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## orders" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"ID int64[pyarrow]\n", | |
"USER_ID int64[pyarrow]\n", | |
"ORDER_DATE datetime64[ns]\n", | |
"STATUS string[pyarrow]\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_orders = pd.read_csv('~/Downloads/jaffle_shop_orders.csv',\n", | |
" parse_dates=['ORDER_DATE'],\n", | |
" dtype_backend='pyarrow'\n", | |
" )\n", | |
"\n", | |
"df_orders.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_orders.to_parquet('jaffle_shop_orders.parquet',\n", | |
" engine='pyarrow',\n", | |
" use_deprecated_int96_timestamps=True,\n", | |
" # allow_truncated_timestamps=True,\n", | |
" index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ID int64[pyarrow]\n", | |
"USER_ID int64[pyarrow]\n", | |
"ORDER_DATE datetime64[ns]\n", | |
"STATUS string[pyarrow]\n", | |
"dtype: object\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>USER_ID</th>\n", | |
" <th>ORDER_DATE</th>\n", | |
" <th>STATUS</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>2018-01-01</td>\n", | |
" <td>returned</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>2018-01-02</td>\n", | |
" <td>completed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>94</td>\n", | |
" <td>2018-01-04</td>\n", | |
" <td>completed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>50</td>\n", | |
" <td>2018-01-05</td>\n", | |
" <td>completed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>64</td>\n", | |
" <td>2018-01-05</td>\n", | |
" <td>completed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94</th>\n", | |
" <td>95</td>\n", | |
" <td>27</td>\n", | |
" <td>2018-04-04</td>\n", | |
" <td>placed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>96</td>\n", | |
" <td>90</td>\n", | |
" <td>2018-04-06</td>\n", | |
" <td>placed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>97</td>\n", | |
" <td>89</td>\n", | |
" <td>2018-04-07</td>\n", | |
" <td>placed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>98</td>\n", | |
" <td>41</td>\n", | |
" <td>2018-04-07</td>\n", | |
" <td>placed</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>99</td>\n", | |
" <td>85</td>\n", | |
" <td>2018-04-09</td>\n", | |
" <td>placed</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>99 rows × 4 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID USER_ID ORDER_DATE STATUS\n", | |
"0 1 1 2018-01-01 returned\n", | |
"1 2 3 2018-01-02 completed\n", | |
"2 3 94 2018-01-04 completed\n", | |
"3 4 50 2018-01-05 completed\n", | |
"4 5 64 2018-01-05 completed\n", | |
".. .. ... ... ...\n", | |
"94 95 27 2018-04-04 placed\n", | |
"95 96 90 2018-04-06 placed\n", | |
"96 97 89 2018-04-07 placed\n", | |
"97 98 41 2018-04-07 placed\n", | |
"98 99 85 2018-04-09 placed\n", | |
"\n", | |
"[99 rows x 4 columns]" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_new = pd.read_parquet('https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_orders.parquet')\n", | |
"print(df_new.dtypes)\n", | |
"df_new" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python (dev)", | |
"language": "python", | |
"name": "dev" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment