Skip to content

Instantly share code, notes, and snippets.

@adamcone
Created April 26, 2016 21:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamcone/47b1c9bac969e1621e3173057c46a3c1 to your computer and use it in GitHub Desktop.
```{r, echo = TRUE, warning = FALSE, message = FALSE}
# Set working directory first!
# NOTE(review): setwd() inside a script is fragile (breaks on any other
# machine) — prefer running R from the data directory; kept so the original
# workflow still runs unchanged.
setwd("/Users/adamcone/Desktop/Visualization Project/data/filtered data (processed)")
library(dplyr)
library(chron)

# Read the raw sales export (headerless, whitespace-separated) as a tbl.
sales_tbl <- tbl_df(read.csv("sales.txt",
                             header = FALSE,
                             sep = "",
                             stringsAsFactors = FALSE))

# Name the columns (roughly...); order follows the export layout.
names(sales_tbl) <- c("Sale_ID",
                      "Clerk_ID",
                      "Customer_ID",
                      "Payment_Code",
                      "Is_Void",
                      "Sale_Total",
                      "Date_Started",
                      "Time_Started",
                      "Date_Ended",
                      "Time_Ended",
                      "CC_Trans",
                      "CC_Last4",
                      "CC_Name",
                      "CC_PNref",
                      "CC_Auth",
                      "CC_Brand",
                      "extra_column")

# Keep only non-empty rows: Payment_Code "0" is a voided transaction; empty
# fields and "VISA" values indicate conversion artifacts.
sales_tbl <- filter(sales_tbl,
                    !Payment_Code %in% c("0", "", "VISA"))
# Drop voided transactions (Is_Void == 1).
sales_tbl <- filter(sales_tbl, Is_Void == 0)
# Drop the single transaction with Sale_Total >= $20,000 (2e4). It
# corresponds to a Steven Lucy accounting transaction independent of
# standard commerce.
# NOTE(review): the original comment said "$2,000", but the code threshold
# is 2e4 = 20,000 — the code value is kept; confirm which was intended.
sales_tbl <- filter(sales_tbl, Sale_Total < 2e4)
# Drop all rows with Sale_Total < $0. There were 477 of them after the
# other filters, totalling -2,204.66. According to Andy, these are aberrant
# transactions and should be disregarded for this analysis. (Andy also
# recommended letting Steven know about them, as they may indicate theft.)
sales_tbl <- filter(sales_tbl, Sale_Total >= 0)
# Drop all rows with Sale_Total <= $0.01, per conversations with Andy: these
# transactions mostly re-run credit/debit cards, since card companies may
# not allow the same card to be run twice for the same transaction.
sales_tbl <- filter(sales_tbl, Sale_Total > 0.01)
# Retain only the columns needed downstream, then coerce Sale_ID to
# character for data hygiene (a no-op if the read already produced
# character, but harmless either way).
sales_tbl <- sales_tbl %>%
  select(Sale_ID,
         Payment_Code,
         Sale_Total,
         Date_Started,
         Time_Started) %>%
  mutate(Sale_ID = as.character(Sale_ID))
# Payment_Code is a numeric code; map it to a human-readable Payment_Type
# via this lookup table:
Payment_Type_tbl <- tbl_df(
  data.frame(
    Payment_Code = as.character(1:5),
    Payment_Type = as.factor(c("Cash",
                               "Check",
                               "Credit/Debit",
                               "Tab",
                               "Food Stamps")),
    stringsAsFactors = FALSE
  )
)
# Align the join key types (character on both sides), attach the
# Payment_Type label, and drop the now-redundant numeric code.
sales_tbl <- sales_tbl %>%
  mutate(Payment_Code = as.character(Payment_Code)) %>%
  left_join(Payment_Type_tbl, by = "Payment_Code") %>%
  select(-Payment_Code)
# The lookup table is no longer needed.
rm(Payment_Type_tbl)
# next, I'll add a column with the combined date and time using chron
# chron() parses Date_Started (as "y-m-d") plus Time_Started (as "h:m:s")
# into a single date-time value; out.format only controls how the value is
# printed, not how it is stored.
sales_tbl = mutate(sales_tbl,
DateTime = chron(Date_Started,
Time_Started,
format = c(dates = "y-m-d", times = "h:m:s"),
out.format = c(dates = "year-m-d", times = "h:m:s")
)
)
# now, get rid of Date_Started and Time_Started columns
# (their information now lives entirely in DateTime)
sales_tbl = select(sales_tbl, -Date_Started, -Time_Started)
# Next, get the data for the individual sale items.
# Read the raw sale_items export (headerless, whitespace-separated) as a
# tbl; stringsAsFactors = FALSE for consistency with the sales read above
# (the ID columns here are numeric, so the result is unchanged).
sale_items_tbl <- tbl_df(read.csv("sale_items.txt",
                                  header = FALSE,
                                  sep = "",
                                  stringsAsFactors = FALSE))
# Name the columns per the export layout.
names(sale_items_tbl) <- c("ID",
                           "Sale_ID",
                           "Item_ID",
                           "Quantity",
                           "Unit_Cost",
                           "Cost",
                           "Tax",
                           "Item_Total",
                           "Is_Refund")
# Keep only relevant columns, drop Sale_ID == 0 (corresponds to no
# documented sale), and coerce the ID columns to character for hygiene.
sale_items_tbl <- sale_items_tbl %>%
  select(Sale_ID, Item_ID, Quantity, Unit_Cost, Item_Total) %>%
  filter(Sale_ID != 0) %>%
  mutate(Sale_ID = as.character(Sale_ID),
         Item_ID = as.character(Item_ID))
# Load the item catalog (tab-separated, headerless) to recover item names.
item_names_tbl <- tbl_df(read.csv("items.txt",
                                  sep = "\t",
                                  header = FALSE,
                                  stringsAsFactors = FALSE))
# Schema from the items table definition.
names(item_names_tbl) <- c("Item_ID",                     #int(11)
                           "Item_Name",                   #varchar(255)
                           "Price_ID",                    #int(11)
                           "Tax_Category_ID",             #int(11)
                           "PLU",                         #varchar(5)
                           "Size",                        #decimal(8,4)
                           "Size_Unit_ID",                #int(11)
                           "Count",                       #int(11)
                           "Count_Timestamp",             #datetime
                           "Last_Manual_Count",           #int(11)
                           "Last_Manual_Count_Timestamp", #datetime
                           "Is_Discontinued",             #tinyint(1)
                           "Notes")                       #text
# Keep only the ID -> name mapping, dropping unnamed (irrelevant) items.
item_names_tbl <- item_names_tbl %>%
  select(Item_ID, Item_Name) %>%
  filter(Item_Name != "")
# Build items_tbl by attaching item names to the sale items.
items_tbl <- left_join(sale_items_tbl, item_names_tbl, by = "Item_ID")
# Item_ID 807 turns out to be "tab payment". Interesting, but not relevant
# to this data set, so purge it now:
items_tbl <- filter(items_tbl, Item_ID != "807")
# The two source tables are no longer needed.
rm(sale_items_tbl, item_names_tbl)
# Combine the per-sale metadata with the per-item rows for the full sales
# data table, keeping only the columns of interest.
items_tbl <- sales_tbl %>%
  left_join(items_tbl, by = "Sale_ID") %>%
  select(DateTime,
         Sale_ID,
         Sale_Total,
         Payment_Type,
         Item_Name,
         Quantity,
         Unit_Cost,
         Item_Total) %>%
  # Drop Item_Name == "bulk order" (947 such sales). Open Produce has
  # separate business arrangements with nearby co-ops, an artifact of being
  # in the UC catchment. These sales are not broken down by item, follow a
  # different pricing scheme than standard retail sales, and constitute a
  # functionally separate business for Open Produce.
  filter(Item_Name != "bulk order")
# Finally, build a data frame with one row per independent sale:
sales_tbl <- items_tbl %>%
  select(DateTime,
         Sale_ID,
         Sale_Total,
         Payment_Type) %>%
  distinct()
```
After generating this "clean" data, I wondered — given that I spent maybe 90% of my available time in my quest for this elusive data hygiene — how much difference my efforts had actually made. So, I generated the following comparison table:
```{r, echo = TRUE, warning = FALSE, message = FALSE, eval = TRUE}
# Load the saved pre-/post-cleaning objects: sales_u/items_u (uncleaned)
# and sales_f/items_f (cleaned).
load("/Users/adamcone/Desktop/Visualization Project/analysis/opdata.RData")
# Summary statistics side by side for the raw and cleaned data.
# (Fixed a stray comma in the "total revenue ($)" row label.)
comparison_df <- data.frame(
  Quantity = c("time covered (days)",
               "total revenue ($)",
               "# sales",
               "# items sold",
               "sales range ($)"),
  Uncleaned_Data = c(
    # sales_u$DateTime needs re-parsing via chron() before date arithmetic.
    max(chron(sales_u$DateTime), na.rm = TRUE) -
      min(chron(sales_u$DateTime), na.rm = TRUE),
    sum(sales_u$Sale_Total, na.rm = TRUE),
    nrow(sales_u),
    nrow(items_u),
    max(sales_u$Sale_Total, na.rm = TRUE) -
      min(sales_u$Sale_Total, na.rm = TRUE)
  ),
  Cleaned_Data = c(
    max(sales_f$DateTime) - min(sales_f$DateTime),
    sum(sales_f$Sale_Total),
    nrow(sales_f),
    nrow(items_f),
    max(sales_f$Sale_Total) - min(sales_f$Sale_Total)
  )
)
# Percent change from uncleaned to cleaned. (Simplified the original
# -round(-(x)) double negation: round() is symmetric, so round(x) is
# identical.)
comparison_df <- mutate(comparison_df,
  Percent_Change = round((Cleaned_Data - Uncleaned_Data) / Uncleaned_Data * 100)
)
comparison_df
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment