Skip to content

Instantly share code, notes, and snippets.

@kadnan
Created December 10, 2019 17:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kadnan/b6e33532e85472467f5a5d8feac75e89 to your computer and use it in GitHub Desktop.
Save kadnan/b6e33532e85472467f5a5d8feac75e89 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# For Ref: https://www.kaggle.com/admond1994/e-commerce-data-eda/data\n",
"# All Imports\n",
"import pandas as pd \n",
"import numpy as np\n",
"import sqlalchemy"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"connection_string = 'mysql+pymysql://root:root@localhost/superset_ecommerce'\n",
"conn = sqlalchemy.create_engine(connection_string)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>InvoiceNo</th>\n",
" <th>StockCode</th>\n",
" <th>Description</th>\n",
" <th>Quantity</th>\n",
" <th>InvoiceDate</th>\n",
" <th>UnitPrice</th>\n",
" <th>CustomerID</th>\n",
" <th>Country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>536365</td>\n",
" <td>85123A</td>\n",
" <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>2.55</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>536365</td>\n",
" <td>71053</td>\n",
" <td>WHITE METAL LANTERN</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>536365</td>\n",
" <td>84406B</td>\n",
" <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
" <td>8</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>2.75</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>536365</td>\n",
" <td>84029G</td>\n",
" <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>536365</td>\n",
" <td>84029E</td>\n",
" <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" InvoiceNo StockCode Description Quantity \\\n",
"0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n",
"1 536365 71053 WHITE METAL LANTERN 6 \n",
"2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n",
"3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n",
"4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n",
"\n",
" InvoiceDate UnitPrice CustomerID Country \n",
"0 12/1/2010 8:26 2.55 17850.0 United Kingdom \n",
"1 12/1/2010 8:26 3.39 17850.0 United Kingdom \n",
"2 12/1/2010 8:26 2.75 17850.0 United Kingdom \n",
"3 12/1/2010 8:26 3.39 17850.0 United Kingdom \n",
"4 12/1/2010 8:26 3.39 17850.0 United Kingdom "
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('data.csv', encoding = 'ISO-8859-1')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"#Rockset Operation\n",
"df.rename(index=str, columns={'InvoiceNo': 'invoice_num',\n",
" 'StockCode' : 'stock_code',\n",
" 'Description' : 'description',\n",
" 'Quantity' : 'quantity',\n",
" 'InvoiceDate' : 'invoice_date',\n",
" 'UnitPrice' : 'unit_price',\n",
" 'CustomerID' : 'customer_id',\n",
" 'Country' : 'country'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>invoice_num</th>\n",
" <th>stock_code</th>\n",
" <th>description</th>\n",
" <th>quantity</th>\n",
" <th>invoice_date</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>536365</td>\n",
" <td>85123A</td>\n",
" <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>2.55</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>536365</td>\n",
" <td>71053</td>\n",
" <td>WHITE METAL LANTERN</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>536365</td>\n",
" <td>84406B</td>\n",
" <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
" <td>8</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>2.75</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>536365</td>\n",
" <td>84029G</td>\n",
" <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>536365</td>\n",
" <td>84029E</td>\n",
" <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
" <td>6</td>\n",
" <td>12/1/2010 8:26</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" invoice_num stock_code description quantity \\\n",
"0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n",
"1 536365 71053 WHITE METAL LANTERN 6 \n",
"2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n",
"3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n",
"4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n",
"\n",
" invoice_date unit_price customer_id country \n",
"0 12/1/2010 8:26 2.55 17850.0 United Kingdom \n",
"1 12/1/2010 8:26 3.39 17850.0 United Kingdom \n",
"2 12/1/2010 8:26 2.75 17850.0 United Kingdom \n",
"3 12/1/2010 8:26 3.39 17850.0 United Kingdom \n",
"4 12/1/2010 8:26 3.39 17850.0 United Kingdom "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 541909 entries, 0 to 541908\n",
"Data columns (total 8 columns):\n",
"invoice_num 541909 non-null object\n",
"stock_code 541909 non-null object\n",
"description 540455 non-null object\n",
"quantity 541909 non-null int64\n",
"invoice_date 541909 non-null object\n",
"unit_price 541909 non-null float64\n",
"customer_id 406829 non-null float64\n",
"country 541909 non-null object\n",
"dtypes: float64(2), int64(1), object(5)\n",
"memory usage: 37.2+ MB\n"
]
}
],
"source": [
"# Data Cleaning\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 135080\n",
"description 1454\n",
"country 0\n",
"unit_price 0\n",
"invoice_date 0\n",
"quantity 0\n",
"stock_code 0\n",
"invoice_num 0\n",
"dtype: int64"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check missing values for each column \n",
"df.isnull().sum().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>invoice_num</th>\n",
" <th>stock_code</th>\n",
" <th>description</th>\n",
" <th>quantity</th>\n",
" <th>invoice_date</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>622</th>\n",
" <td>536414</td>\n",
" <td>22139</td>\n",
" <td>NaN</td>\n",
" <td>56</td>\n",
" <td>12/1/2010 11:52</td>\n",
" <td>0.00</td>\n",
" <td>NaN</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1443</th>\n",
" <td>536544</td>\n",
" <td>21773</td>\n",
" <td>DECORATIVE ROSE BATHROOM BOTTLE</td>\n",
" <td>1</td>\n",
" <td>12/1/2010 14:32</td>\n",
" <td>2.51</td>\n",
" <td>NaN</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1444</th>\n",
" <td>536544</td>\n",
" <td>21774</td>\n",
" <td>DECORATIVE CATS BATHROOM BOTTLE</td>\n",
" <td>2</td>\n",
" <td>12/1/2010 14:32</td>\n",
" <td>2.51</td>\n",
" <td>NaN</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1445</th>\n",
" <td>536544</td>\n",
" <td>21786</td>\n",
" <td>POLKADOT RAIN HAT</td>\n",
" <td>4</td>\n",
" <td>12/1/2010 14:32</td>\n",
" <td>0.85</td>\n",
" <td>NaN</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1446</th>\n",
" <td>536544</td>\n",
" <td>21787</td>\n",
" <td>RAIN PONCHO RETROSPOT</td>\n",
" <td>2</td>\n",
" <td>12/1/2010 14:32</td>\n",
" <td>1.66</td>\n",
" <td>NaN</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" invoice_num stock_code description quantity \\\n",
"622 536414 22139 NaN 56 \n",
"1443 536544 21773 DECORATIVE ROSE BATHROOM BOTTLE 1 \n",
"1444 536544 21774 DECORATIVE CATS BATHROOM BOTTLE 2 \n",
"1445 536544 21786 POLKADOT RAIN HAT 4 \n",
"1446 536544 21787 RAIN PONCHO RETROSPOT 2 \n",
"\n",
" invoice_date unit_price customer_id country \n",
"622 12/1/2010 11:52 0.00 NaN United Kingdom \n",
"1443 12/1/2010 14:32 2.51 NaN United Kingdom \n",
"1444 12/1/2010 14:32 2.51 NaN United Kingdom \n",
"1445 12/1/2010 14:32 0.85 NaN United Kingdom \n",
"1446 12/1/2010 14:32 1.66 NaN United Kingdom "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check out the rows with missing values\n",
"df[df.isnull().any(axis=1)].head()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"# change the invoice_date format - String to Timestamp format\n",
"df['invoice_date'] = pd.to_datetime(df.invoice_date, format='%m/%d/%Y %H:%M')"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"# change description - UPPER case to LOWER case\n",
"df['description'] = df.description.str.lower() #Rockset Operation"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>invoice_num</th>\n",
" <th>stock_code</th>\n",
" <th>description</th>\n",
" <th>quantity</th>\n",
" <th>invoice_date</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>536365</td>\n",
" <td>85123A</td>\n",
" <td>white hanging heart t-light holder</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>2.55</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>536365</td>\n",
" <td>71053</td>\n",
" <td>white metal lantern</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>536365</td>\n",
" <td>84406B</td>\n",
" <td>cream cupid hearts coat hanger</td>\n",
" <td>8</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>2.75</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>536365</td>\n",
" <td>84029G</td>\n",
" <td>knitted union flag hot water bottle</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>536365</td>\n",
" <td>84029E</td>\n",
" <td>red woolly hottie white heart.</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850.0</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" invoice_num stock_code description quantity \\\n",
"0 536365 85123A white hanging heart t-light holder 6 \n",
"1 536365 71053 white metal lantern 6 \n",
"2 536365 84406B cream cupid hearts coat hanger 8 \n",
"3 536365 84029G knitted union flag hot water bottle 6 \n",
"4 536365 84029E red woolly hottie white heart. 6 \n",
"\n",
" invoice_date unit_price customer_id country \n",
"0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n",
"1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n",
"2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n",
"3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n",
"4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom "
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"country 0\n",
"customer_id 0\n",
"unit_price 0\n",
"invoice_date 0\n",
"quantity 0\n",
"description 0\n",
"stock_code 0\n",
"invoice_num 0\n",
"dtype: int64"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# df_new without missing values\n",
"df_new = df.dropna() #Rockset\n",
"# check missing values for each column \n",
"df_new.isnull().sum().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 406829 entries, 0 to 541908\n",
"Data columns (total 8 columns):\n",
"invoice_num 406829 non-null object\n",
"stock_code 406829 non-null object\n",
"description 406829 non-null object\n",
"quantity 406829 non-null int64\n",
"invoice_date 406829 non-null datetime64[ns]\n",
"unit_price 406829 non-null float64\n",
"customer_id 406829 non-null float64\n",
"country 406829 non-null object\n",
"dtypes: datetime64[ns](1), float64(2), int64(1), object(4)\n",
"memory usage: 27.9+ MB\n"
]
}
],
"source": [
"df_new.info()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/AdnanAhmad/Data/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" \n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>invoice_num</th>\n",
" <th>stock_code</th>\n",
" <th>description</th>\n",
" <th>quantity</th>\n",
" <th>invoice_date</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>536365</td>\n",
" <td>85123A</td>\n",
" <td>white hanging heart t-light holder</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>2.55</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>536365</td>\n",
" <td>71053</td>\n",
" <td>white metal lantern</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>536365</td>\n",
" <td>84406B</td>\n",
" <td>cream cupid hearts coat hanger</td>\n",
" <td>8</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>2.75</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>536365</td>\n",
" <td>84029G</td>\n",
" <td>knitted union flag hot water bottle</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>536365</td>\n",
" <td>84029E</td>\n",
" <td>red woolly hottie white heart.</td>\n",
" <td>6</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>3.39</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" invoice_num stock_code description quantity \\\n",
"0 536365 85123A white hanging heart t-light holder 6 \n",
"1 536365 71053 white metal lantern 6 \n",
"2 536365 84406B cream cupid hearts coat hanger 8 \n",
"3 536365 84029G knitted union flag hot water bottle 6 \n",
"4 536365 84029E red woolly hottie white heart. 6 \n",
"\n",
" invoice_date unit_price customer_id country \n",
"0 2010-12-01 08:26:00 2.55 17850 United Kingdom \n",
"1 2010-12-01 08:26:00 3.39 17850 United Kingdom \n",
"2 2010-12-01 08:26:00 2.75 17850 United Kingdom \n",
"3 2010-12-01 08:26:00 3.39 17850 United Kingdom \n",
"4 2010-12-01 08:26:00 3.39 17850 United Kingdom "
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# change columns tyoe - String to Int type \n",
"df_new['customer_id'] = df_new['customer_id'].astype('int64') #Rockset Operation\n",
"df_new.head()"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quantity</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>406829.00</td>\n",
" <td>406829.00</td>\n",
" <td>406829.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>12.06</td>\n",
" <td>3.46</td>\n",
" <td>15287.69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>248.69</td>\n",
" <td>69.32</td>\n",
" <td>1713.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-80995.00</td>\n",
" <td>0.00</td>\n",
" <td>12346.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2.00</td>\n",
" <td>1.25</td>\n",
" <td>13953.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>5.00</td>\n",
" <td>1.95</td>\n",
" <td>15152.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>12.00</td>\n",
" <td>3.75</td>\n",
" <td>16791.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>80995.00</td>\n",
" <td>38970.00</td>\n",
" <td>18287.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quantity unit_price customer_id\n",
"count 406829.00 406829.00 406829.00\n",
"mean 12.06 3.46 15287.69\n",
"std 248.69 69.32 1713.60\n",
"min -80995.00 0.00 12346.00\n",
"25% 2.00 1.25 13953.00\n",
"50% 5.00 1.95 15152.00\n",
"75% 12.00 3.75 16791.00\n",
"max 80995.00 38970.00 18287.00"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.describe().round(2) #Rockset Operation"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quantity</th>\n",
" <th>unit_price</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>397924.00</td>\n",
" <td>397924.00</td>\n",
" <td>397924.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>13.02</td>\n",
" <td>3.12</td>\n",
" <td>15294.32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>180.42</td>\n",
" <td>22.10</td>\n",
" <td>1713.17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>12346.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2.00</td>\n",
" <td>1.25</td>\n",
" <td>13969.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>6.00</td>\n",
" <td>1.95</td>\n",
" <td>15159.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>12.00</td>\n",
" <td>3.75</td>\n",
" <td>16795.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>80995.00</td>\n",
" <td>8142.75</td>\n",
" <td>18287.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quantity unit_price customer_id\n",
"count 397924.00 397924.00 397924.00\n",
"mean 13.02 3.12 15294.32\n",
"std 180.42 22.10 1713.17\n",
"min 1.00 0.00 12346.00\n",
"25% 2.00 1.25 13969.00\n",
"50% 6.00 1.95 15159.00\n",
"75% 12.00 3.75 16795.00\n",
"max 80995.00 8142.75 18287.00"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Remove Quantity with negative values\n",
"df_new = df_new[df_new.quantity > 0] #Rockset Operation\n",
"df_new.describe().round(2)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"#Add the column - amount_spent\n",
"df_new['amount_spent'] = df_new['quantity'] * df_new['unit_price'] #Rockset Operation"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# rearrange all the columns for easy reference\n",
"df_new = df_new[['invoice_num','invoice_date','stock_code',\n",
" 'description','quantity','unit_price','amount_spent',\n",
" 'customer_id','country'\n",
" ]]"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"#Rockset Operation\n",
"df_new.insert(loc=2, column='year_month', value=df_new['invoice_date'].map(lambda x: 100*x.year + x.month))\n",
"df_new.insert(loc=3, column='month', value=df_new.invoice_date.dt.month)\n",
"# +1 to make Monday=1.....until Sunday=7\n",
"df_new.insert(loc=4, column='day', value=(df_new.invoice_date.dt.dayofweek)+1)\n",
"df_new.insert(loc=5, column='hour', value=df_new.invoice_date.dt.hour)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Replace Country Names with text. Needed to feed Supset Column\n",
"def get_country_code(row):\n",
" if row['country'] == 'United Kingdom':\n",
" return 'GBR'\n",
"\n",
" if row['country'] == 'France':\n",
" return 'FRA'\n",
" \n",
" if row['country'] == 'Germany':\n",
" return 'DEU'\n",
" \n",
" if row['country'] == 'EIRE':\n",
" return 'IRL'\n",
" \n",
" if row['country'] == 'Spain':\n",
" return 'ESP'\n",
"\n",
"df_new['country_code'] = df_new.apply (lambda row: get_country_code(row), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>invoice_num</th>\n",
" <th>invoice_date</th>\n",
" <th>year_month</th>\n",
" <th>month</th>\n",
" <th>day</th>\n",
" <th>hour</th>\n",
" <th>stock_code</th>\n",
" <th>description</th>\n",
" <th>quantity</th>\n",
" <th>unit_price</th>\n",
" <th>amount_spent</th>\n",
" <th>customer_id</th>\n",
" <th>country</th>\n",
" <th>country_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>536365</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>201012</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>85123A</td>\n",
" <td>white hanging heart t-light holder</td>\n",
" <td>6</td>\n",
" <td>2.55</td>\n",
" <td>15.30</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" <td>GBR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>536365</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>201012</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>71053</td>\n",
" <td>white metal lantern</td>\n",
" <td>6</td>\n",
" <td>3.39</td>\n",
" <td>20.34</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" <td>GBR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>536365</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>201012</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>84406B</td>\n",
" <td>cream cupid hearts coat hanger</td>\n",
" <td>8</td>\n",
" <td>2.75</td>\n",
" <td>22.00</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" <td>GBR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>536365</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>201012</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>84029G</td>\n",
" <td>knitted union flag hot water bottle</td>\n",
" <td>6</td>\n",
" <td>3.39</td>\n",
" <td>20.34</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" <td>GBR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>536365</td>\n",
" <td>2010-12-01 08:26:00</td>\n",
" <td>201012</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>84029E</td>\n",
" <td>red woolly hottie white heart.</td>\n",
" <td>6</td>\n",
" <td>3.39</td>\n",
" <td>20.34</td>\n",
" <td>17850</td>\n",
" <td>United Kingdom</td>\n",
" <td>GBR</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" invoice_num invoice_date year_month month day hour stock_code \\\n",
"0 536365 2010-12-01 08:26:00 201012 12 3 8 85123A \n",
"1 536365 2010-12-01 08:26:00 201012 12 3 8 71053 \n",
"2 536365 2010-12-01 08:26:00 201012 12 3 8 84406B \n",
"3 536365 2010-12-01 08:26:00 201012 12 3 8 84029G \n",
"4 536365 2010-12-01 08:26:00 201012 12 3 8 84029E \n",
"\n",
" description quantity unit_price amount_spent \\\n",
"0 white hanging heart t-light holder 6 2.55 15.30 \n",
"1 white metal lantern 6 3.39 20.34 \n",
"2 cream cupid hearts coat hanger 8 2.75 22.00 \n",
"3 knitted union flag hot water bottle 6 3.39 20.34 \n",
"4 red woolly hottie white heart. 6 3.39 20.34 \n",
"\n",
" customer_id country country_code \n",
"0 17850 United Kingdom GBR \n",
"1 17850 United Kingdom GBR \n",
"2 17850 United Kingdom GBR \n",
"3 17850 United Kingdom GBR \n",
"4 17850 United Kingdom GBR "
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.head()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# Dump Data into Table\n",
"#df_new.to_sql('sales_new', conn, if_exists='replace', index = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment