Skip to content

Instantly share code, notes, and snippets.

@dataders
Created October 30, 2023 21:28
Show Gist options
  • Save dataders/5f532ad24348e577f40ed0c9017fd47f to your computer and use it in GitHub Desktop.
Save dataders/5f532ad24348e577f40ed0c9017fd47f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.1.1'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID int64[pyarrow]\n",
"FIRST_NAME string[pyarrow]\n",
"LAST_NAME string[pyarrow]\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>FIRST_NAME</th>\n",
" <th>LAST_NAME</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Michael</td>\n",
" <td>P.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Shawn</td>\n",
" <td>M.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Kathleen</td>\n",
" <td>P.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Jimmy</td>\n",
" <td>C.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Katherine</td>\n",
" <td>R.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID FIRST_NAME LAST_NAME\n",
"0 1 Michael P.\n",
"1 2 Shawn M.\n",
"2 3 Kathleen P.\n",
"3 4 Jimmy C.\n",
"4 5 Katherine R."
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_custo = pd.read_csv('~/Downloads/jaffle_shop_customers.csv',\n",
" dtype_backend='pyarrow'\n",
" )\n",
"print(df_custo.dtypes)\n",
"df_custo.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"df_custo.to_parquet('jaffle_shop_customers.parquet',\n",
" index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## payments"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID int64[pyarrow]\n",
"ORDERID int64[pyarrow]\n",
"PAYMENTMETHOD string[pyarrow]\n",
"STATUS string[pyarrow]\n",
"AMOUNT int64[pyarrow]\n",
"CREATED datetime64[ns]\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>ORDERID</th>\n",
" <th>PAYMENTMETHOD</th>\n",
" <th>STATUS</th>\n",
" <th>AMOUNT</th>\n",
" <th>CREATED</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>credit_card</td>\n",
" <td>success</td>\n",
" <td>1000</td>\n",
" <td>2018-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>credit_card</td>\n",
" <td>success</td>\n",
" <td>2000</td>\n",
" <td>2018-01-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>coupon</td>\n",
" <td>success</td>\n",
" <td>100</td>\n",
" <td>2018-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>coupon</td>\n",
" <td>success</td>\n",
" <td>2500</td>\n",
" <td>2018-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>bank_transfer</td>\n",
" <td>fail</td>\n",
" <td>1700</td>\n",
" <td>2018-01-05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID ORDERID PAYMENTMETHOD STATUS AMOUNT CREATED\n",
"0 1 1 credit_card success 1000 2018-01-01\n",
"1 2 2 credit_card success 2000 2018-01-02\n",
"2 3 3 coupon success 100 2018-01-04\n",
"3 4 4 coupon success 2500 2018-01-05\n",
"4 5 5 bank_transfer fail 1700 2018-01-05"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_pay = pd.read_csv('~/Downloads/stripe_payments.csv',\n",
" parse_dates=['CREATED'],\n",
" dtype_backend='pyarrow'\n",
" )\n",
"print(df_pay.dtypes)\n",
"df_pay.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"df_pay.to_parquet('stripe_payments.parquet',\n",
" engine='pyarrow',\n",
" use_deprecated_int96_timestamps=True,\n",
" # allow_truncated_timestamps=True,\n",
" index=False\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## orders"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ID int64[pyarrow]\n",
"USER_ID int64[pyarrow]\n",
"ORDER_DATE datetime64[ns]\n",
"STATUS string[pyarrow]\n",
"dtype: object"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_orders = pd.read_csv('~/Downloads/jaffle_shop_orders.csv',\n",
" parse_dates=['ORDER_DATE'],\n",
" dtype_backend='pyarrow'\n",
" )\n",
"\n",
"df_orders.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"df_orders.to_parquet('jaffle_shop_orders.parquet',\n",
" engine='pyarrow',\n",
" use_deprecated_int96_timestamps=True,\n",
" # allow_truncated_timestamps=True,\n",
" index=False)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID int64[pyarrow]\n",
"USER_ID int64[pyarrow]\n",
"ORDER_DATE datetime64[ns]\n",
"STATUS string[pyarrow]\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>USER_ID</th>\n",
" <th>ORDER_DATE</th>\n",
" <th>STATUS</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2018-01-01</td>\n",
" <td>returned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2018-01-02</td>\n",
" <td>completed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>94</td>\n",
" <td>2018-01-04</td>\n",
" <td>completed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>50</td>\n",
" <td>2018-01-05</td>\n",
" <td>completed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>64</td>\n",
" <td>2018-01-05</td>\n",
" <td>completed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>95</td>\n",
" <td>27</td>\n",
" <td>2018-04-04</td>\n",
" <td>placed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>96</td>\n",
" <td>90</td>\n",
" <td>2018-04-06</td>\n",
" <td>placed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>97</td>\n",
" <td>89</td>\n",
" <td>2018-04-07</td>\n",
" <td>placed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>98</td>\n",
" <td>41</td>\n",
" <td>2018-04-07</td>\n",
" <td>placed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>99</td>\n",
" <td>85</td>\n",
" <td>2018-04-09</td>\n",
" <td>placed</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>99 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" ID USER_ID ORDER_DATE STATUS\n",
"0 1 1 2018-01-01 returned\n",
"1 2 3 2018-01-02 completed\n",
"2 3 94 2018-01-04 completed\n",
"3 4 50 2018-01-05 completed\n",
"4 5 64 2018-01-05 completed\n",
".. .. ... ... ...\n",
"94 95 27 2018-04-04 placed\n",
"95 96 90 2018-04-06 placed\n",
"96 97 89 2018-04-07 placed\n",
"97 98 41 2018-04-07 placed\n",
"98 99 85 2018-04-09 placed\n",
"\n",
"[99 rows x 4 columns]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new = pd.read_parquet('https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_orders.parquet')\n",
"print(df_new.dtypes)\n",
"df_new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (dev)",
"language": "python",
"name": "dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment