Skip to content

Instantly share code, notes, and snippets.

@krokrob
Created May 18, 2020 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krokrob/0894875dc4b2fdfdc92c35cc574657e0 to your computer and use it in GitHub Desktop.
Save krokrob/0894875dc4b2fdfdc92c35cc574657e0 to your computer and use it in GitHub Desktop.
import os
import pandas as pd
class Olist:
    """Loader for the Olist CSV datasets and tables derived from them."""

    def get_data(self, csv_path=None) -> dict:
        """Return every CSV file of a folder as a DataFrame, keyed by file name.

        Args:
            csv_path: Folder to read the CSV files from. Defaults to the
                ``data/csv`` folder located one level above the directory
                containing this module.

        Returns:
            A dict mapping each file name (without its ``.csv`` suffix) to
            the DataFrame read from that file.
        """
        if csv_path is None:
            # Anchor the default on this module's absolute location so the
            # method can be called from any working directory.
            module_dir = os.path.dirname(os.path.abspath(__file__))
            csv_path = os.path.join(module_dir, '..', 'data', 'csv')

        suffix = '.csv'
        data = {}
        # os.listdir returns entries in arbitrary order; sort for stable keys.
        for filename in sorted(os.listdir(csv_path)):
            if not filename.endswith(suffix):
                continue
            # Strip only the trailing suffix: str.replace would also remove
            # any '.csv' appearing in the middle of the name and the file
            # could then no longer be found.
            key = filename[:-len(suffix)]
            data[key] = pd.read_csv(os.path.join(csv_path, filename))
        return data

    def get_matching_table(self) -> pd.DataFrame:
        """Return the id matching table for delivered orders.

        Returns:
            A DataFrame with the columns ``order_id``, ``customer_id``,
            ``review_id``, ``product_id`` and ``seller_id``. Only orders
            whose ``order_status`` is ``'delivered'`` are present; a
            delivered order without a review or without items keeps its row
            with missing values in the corresponding columns.
        """
        data = self.get_data()

        # Keep delivered orders only, then narrow each table to its id columns.
        orders = data['olist_orders_dataset']
        orders = orders[orders['order_status'] == 'delivered']
        orders = orders[['order_id', 'customer_id']]
        reviews = data['olist_order_reviews_dataset'][['review_id', 'order_id']]
        order_items = data['olist_order_items_dataset'][
            ['order_id', 'product_id', 'seller_id']
        ]

        # Left merges keep every delivered order. Outer merges would bring
        # back the order_ids of non-delivered orders found in the reviews or
        # items tables (with a missing customer_id), undoing the filter above.
        matching_table = orders.merge(reviews, on='order_id', how='left')
        matching_table = matching_table.merge(
            order_items, on='order_id', how='left'
        )
        return matching_table

    def ping(self) -> None:
        """Print ``pong``."""
        print('pong')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment