Last active
February 18, 2021 20:12
-
-
Save tradingbills/6de54c66ea825e1b253584ef98b39ed7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Data Understanding (1-6) | |
#1. Industry the data belongs to; 2. Observation and variable counts; 3. Data types
#4. Levels; 5. Any negatives (for e-commerce data); 6. Find nulls
#1 Industry data belongs to: | |
#2 Observations & Variable Count | |
```{r} | |
# Print three overviews of df99 (observation/variable counts and per-column
# summaries), one from each of skimr, psych and base R.
library(skimr)
library(psych)

skim(df99)
describe(df99)
summary(df99)
``` | |
#3. Find datatypes | |
```{r} | |
# Show the structure of the whole data frame, then of its factor columns only.
glimpse(df99)
df99 %>%
  dplyr::select(where(is.factor)) %>%
  glimpse()
# Report the class of every column in a data frame.
#
# df: a data frame (or any list of columns).
# Returns a named character vector with one entry per column. A column that
# carries several classes (e.g. POSIXct, ordered factors) gets them joined
# with "/", so the result is always a flat character vector rather than a
# ragged list.
find_datatype <- function(df) {
  vapply(
    df,
    FUN = function(col) paste(class(col), collapse = "/"),
    FUN.VALUE = character(1)
  )
}
find_datatype(df99) | |
``` | |
#4.a Find Candidate Keys (columns whose values are all unique)
```{r} | |
# Flag columns that could serve as keys, i.e. whose values are all distinct.
#
# df: a data frame.
# Returns a data frame with one row per column of `df` and the columns
#   field      - column name
#   all_unique - TRUE when the distinct count equals nrow(df)
#   cnt        - number of distinct values (NA counts as one value)
# Uses base R only, so it works without dplyr/tibble being attached.
find_keys <- function(df) {
  # get count of distincts per each column
  distinct_counts <- vapply(
    df,
    FUN = function(col) length(unique(col)),
    FUN.VALUE = integer(1)
  )
  # one row per column; unname() keeps the default 1..n row names
  full_count <- data.frame(
    field = names(df),
    cnt = unname(distinct_counts),
    stringsAsFactors = FALSE
  )
  # is a column's unique count the same as the df row count
  full_count$all_unique <- full_count$cnt == nrow(df)
  # reorder by name rather than by position
  full_count[, c("field", "all_unique", "cnt")]
}
df_uniques <- find_keys(df99) | |
``` | |
#4.b Find Non-numeric Uniques | |
```{r} | |
# List the sorted distinct values of character and integer columns.
#
# df: a data frame (or any list of columns).
# Returns a named list with one element per column: the sorted unique values
# for character/integer columns, and the column's class vector otherwise.
# sort() drops NA, so missing values never appear among the listed values.
find_unique <- function(df) {
  lapply(df, function(col) {
    # inherits() stays a single TRUE/FALSE even when a column has several
    # classes (ordered factor, POSIXct), where comparing class(col) with ==
    # would produce a length > 1 condition.
    if (inherits(col, c("character", "integer"))) {
      sort(unique(col))
    } else {
      class(col)
    }
  })
}
find_unique(df99) | |
``` | |
#4.c Find All Uniques & Coerce to factor | |
```{r} | |
#4.c.1 Selected variables | |
# List the sorted distinct values of every column, whatever its type.
#
# df: a data frame (or any list of columns).
# Returns a named list with one element per column. A list is returned even
# when all columns have the same number of distinct values, so values are
# never collapsed into a single coerced matrix.
# sort() drops NA, so missing values are not listed.
find_all_unique <- function(df) {
  lapply(df, function(col) sort(unique(col)))
}
# get names of df
names(df99)
# build a vector for re-use
cols_in_question <- c("Survived", "Sex", "SibSp", "Embarked")
# list every distinct value of the selected columns, whatever their type
find_all_unique(df99[cols_in_question])
# convert to factors
df99[cols_in_question] <- lapply(df99[cols_in_question], factor)
#4.c.2 Convert all characters to factors [optional]
# the data frame must be passed in and the result assigned back, otherwise
# nothing is converted
df99 <- dplyr::mutate(df99, dplyr::across(where(is.character), factor))
# check
sapply(df99, class)
skim(df99)
``` | |
#6 Find Nulls | |
```{r} | |
#def count_missing: | |
# Count missing values per column.
#
# df: a data frame (or any list of columns).
# Returns a named integer vector giving the number of NA entries in each
# column. The result is an integer vector even for a zero-column input.
count_missing <- function(df) {
  vapply(
    df,
    FUN = function(col) sum(is.na(col)),
    FUN.VALUE = integer(1)
  )
}
# apply count_missing func to df
nacounts <- count_missing(df99)
# positions of the columns that have at least one missing value
hasNA <- which(nacounts > 0)
# see results
nacounts[hasNA]
``` | |
#99 Optional Rename | |
```{r} | |
# reassign name: replace `new_name` with the name the data should keep
new_name <- df99
# remove generic name
rm(df99)
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment