tradingbills/R Data Understanding

## R Data Understanding
Data Understanding (1-6)
#1. Industry the data belongs to; 2. Observation and variable counts; 3. Data types
#4. Levels; 5. Any negatives (for e-commerce data); 6. Find nulls

#1 Industry data belongs to:
#2 Observations & Variable Count
```{r}
# Per-column overview via skimr.
library(skimr)
skim(df99)

# Descriptive statistics via psych.
library(psych)
describe(df99)

# Base R summary of every column.
summary(df99)
```
#3. Find datatypes
```{r}
# Compact overview of every column in df99.
dplyr::glimpse(df99)
# Same overview limited to the factor columns. The calls are namespace-
# qualified (like dplyr::select) so this chunk does not rely on dplyr or the
# magrittr pipe having been attached earlier in the session.
dplyr::glimpse(dplyr::select(df99, where(is.factor)))

# Report the class (or classes) of every column in `df`.
# Returns whatever sapply() assembles from the per-column class vectors.
find_datatype <- function(df) {
  sapply(df, class)
}
# Show the class of each column in df99.
find_datatype(df99)
```
#4.a Find candidate keys (distinct-value count per column)
```{r}
#' Report how many distinct values each column holds
#'
#' Helps spot candidate key columns: a column whose distinct count equals the
#' number of rows has no repeated values.
#'
#' @param df A data frame.
#' @return A data frame with one row per column of `df` and the columns
#'   `field` (column name), `all_unique` (`TRUE` when the distinct count equals
#'   `nrow(df)`) and `cnt` (number of distinct values; `NA` counts as a value).
find_keys <- function(df) {
  # Base R distinct count, so the helper does not need dplyr/tibble attached.
  # vapply() keeps the result an integer vector even when `df` has no columns.
  distinct_counts <- vapply(df, function(col) NROW(unique(col)), integer(1))
  # Drop the names so data.frame() does not turn them into row names.
  distinct_counts <- unname(distinct_counts)

  data.frame(
    field = names(df),
    all_unique = distinct_counts == nrow(df),
    cnt = distinct_counts,
    stringsAsFactors = FALSE
  )
}
# Keep the per-column distinct-count report for df99; print df_uniques to view it.
df_uniques <- find_keys(df99)
```
#4.b Find Non-numeric Uniques
```{r}
#' List the distinct values of character and integer columns
#'
#' @param df A data frame (or list of columns).
#' @return The result of `sapply()` over the columns: for a character or
#'   integer column, its sorted unique values (`sort()` drops `NA`); for any
#'   other column, its class vector.
find_unique <- function(df) {
  sapply(df, FUN = function(col) {
    # inherits() yields a single TRUE/FALSE even when class(col) has several
    # entries (e.g. ordered factors); `class(col) == "x" || ...` would hand
    # `||` a length-2 operand, which is an error in R >= 4.3.
    if (inherits(col, c("character", "integer"))) {
      sort(unique(col))
    } else {
      class(col)
    }
  })
}
# Show distinct values for df99's character/integer columns and the class of the rest.
find_unique(df99)
```
#4.c Find All Uniques & Coerce to factor
```{r}
#4.c.1 Selected variables
# Collect the sorted distinct values of every column in `df`.
# sort() runs after unique(), so NA values do not appear in the output.
find_all_unique <- function(df) {
  sorted_distinct <- function(values) {
    distinct_values <- unique(values)
    sort(distinct_values)
  }
  sapply(df, sorted_distinct)
}
# List the column names to pick candidates from.
names(df99)
# Columns to inspect and convert; kept in a vector for re-use.
cols_in_question <- c("Survived", "Sex", "SibSp", "Embarked")
# Show every distinct value of the selected columns, whatever their type.
find_all_unique(df99[cols_in_question])
# Convert the selected columns to factors.
df99[cols_in_question] <- lapply(df99[cols_in_question], factor)
#4.c.2 Convert all characters to factors [optional]
# The data frame has to be passed in and the result assigned back; a bare
# mutate_if(is.character, factor) has no data argument, so df99 is never converted.
df99 <- dplyr::mutate(df99, dplyr::across(where(is.character), factor))
# Check the resulting column classes.
sapply(df99, class)
skim(df99)
```
#6 Find Nulls
```{r}
#' Count missing values per column
#'
#' @param df A data frame (or list of columns).
#' @return An integer vector, named by column, holding the number of `NA`
#'   entries in each column.
count_missing <- function(df) {
  # vapply() pins the result to one integer per column, so a zero-column input
  # yields an empty integer vector rather than the empty list sapply() returns.
  vapply(df, function(col) sum(is.na(col)), integer(1))
}
# Per-column NA counts for df99.
nacounts <- count_missing(df99)
# Indices of the columns that hold at least one NA.
hasNA <- which(nacounts > 0)
# Show the counts for just those columns.
nacounts[hasNA]
```
  #99 Optional Rename
  ```{r}
  # Copy the data to a descriptive name. `{new_name}` is a template
  # placeholder: replace it (braces included) with the desired object name
  # before running this chunk.
  {new_name} <- df99
  # Drop the generic df99 binding once the data lives under the new name.
  remove(df99)
  ```
	Data Understanding (1-6)
	#1. Industry the data belongs to; 2. Observation and variable counts; 3. Data types
	#4. Levels; 5. Any negatives (for e-commerce data); 6. Find nulls

	#1 Industry data belongs to:
	#2 Observations & Variable Count
	```{r}
	# Per-column overview via skimr.
	library(skimr)
	skim(df99)

	# Descriptive statistics via psych.
	library(psych)
	describe(df99)

	# Base R summary of every column.
	summary(df99)
	```
	#3. Find datatypes
	```{r}
	# Compact overview of every column in df99.
	dplyr::glimpse(df99)
	# Same overview limited to the factor columns. The calls are namespace-
	# qualified (like dplyr::select) so this chunk does not rely on dplyr or the
	# magrittr pipe having been attached earlier in the session.
	dplyr::glimpse(dplyr::select(df99, where(is.factor)))

	# Report the class (or classes) of every column in `df`.
	# Returns whatever sapply() assembles from the per-column class vectors.
	find_datatype <- function(df) {
	  sapply(df, class)
	}
	# Show the class of each column in df99.
	find_datatype(df99)
	```
	#4.a Find candidate keys (distinct-value count per column)
	```{r}
	#' Report how many distinct values each column holds
	#'
	#' Helps spot candidate key columns: a column whose distinct count equals the
	#' number of rows has no repeated values.
	#'
	#' @param df A data frame.
	#' @return A data frame with one row per column of `df` and the columns
	#'   `field` (column name), `all_unique` (`TRUE` when the distinct count equals
	#'   `nrow(df)`) and `cnt` (number of distinct values; `NA` counts as a value).
	find_keys <- function(df) {
	  # Base R distinct count, so the helper does not need dplyr/tibble attached.
	  # vapply() keeps the result an integer vector even when `df` has no columns.
	  distinct_counts <- vapply(df, function(col) NROW(unique(col)), integer(1))
	  # Drop the names so data.frame() does not turn them into row names.
	  distinct_counts <- unname(distinct_counts)

	  data.frame(
	    field = names(df),
	    all_unique = distinct_counts == nrow(df),
	    cnt = distinct_counts,
	    stringsAsFactors = FALSE
	  )
	}
	# Keep the per-column distinct-count report for df99; print df_uniques to view it.
	df_uniques <- find_keys(df99)
	```
	#4.b Find Non-numeric Uniques
	```{r}
	#' List the distinct values of character and integer columns
	#'
	#' @param df A data frame (or list of columns).
	#' @return The result of `sapply()` over the columns: for a character or
	#'   integer column, its sorted unique values (`sort()` drops `NA`); for any
	#'   other column, its class vector.
	find_unique <- function(df) {
	  sapply(df, FUN = function(col) {
	    # inherits() yields a single TRUE/FALSE even when class(col) has several
	    # entries (e.g. ordered factors). It also replaces the backslash-escaped
	    # "or" operator of the earlier text, which R cannot parse.
	    if (inherits(col, c("character", "integer"))) {
	      sort(unique(col))
	    } else {
	      class(col)
	    }
	  })
	}
	# Show distinct values for df99's character/integer columns and the class of the rest.
	find_unique(df99)
	```
	#4.c Find All Uniques & Coerce to factor
	```{r}
	#4.c.1 Selected variables
	# Collect the sorted distinct values of every column in `df`.
	# sort() runs after unique(), so NA values do not appear in the output.
	find_all_unique <- function(df) {
	  sorted_distinct <- function(values) {
	    distinct_values <- unique(values)
	    sort(distinct_values)
	  }
	  sapply(df, sorted_distinct)
	}
	# List the column names to pick candidates from.
	names(df99)
	# Columns to inspect and convert; kept in a vector for re-use.
	cols_in_question <- c("Survived", "Sex", "SibSp", "Embarked")
	# Show every distinct value of the selected columns, whatever their type.
	find_all_unique(df99[cols_in_question])
	# Convert the selected columns to factors.
	df99[cols_in_question] <- lapply(df99[cols_in_question], factor)
	#4.c.2 Convert all characters to factors [optional]
	# The data frame has to be passed in and the result assigned back; a bare
	# mutate_if(is.character, factor) has no data argument, so df99 is never converted.
	df99 <- dplyr::mutate(df99, dplyr::across(where(is.character), factor))
	# Check the resulting column classes.
	sapply(df99, class)
	skim(df99)
	```
	#6 Find Nulls
	```{r}
	#' Count missing values per column
	#'
	#' @param df A data frame (or list of columns).
	#' @return An integer vector, named by column, holding the number of `NA`
	#'   entries in each column.
	count_missing <- function(df) {
	  # vapply() pins the result to one integer per column, so a zero-column input
	  # yields an empty integer vector rather than the empty list sapply() returns.
	  vapply(df, function(col) sum(is.na(col)), integer(1))
	}
	# Per-column NA counts for df99.
	nacounts <- count_missing(df99)
	# Indices of the columns that hold at least one NA.
	hasNA <- which(nacounts > 0)
	# Show the counts for just those columns.
	nacounts[hasNA]
	```
	#99 Optional Rename
	```{r}
	# Copy the data to a descriptive name. `{new_name}` is a template
	# placeholder: replace it (braces included) with the desired object name
	# before running this chunk.
	{new_name} <- df99
	# Drop the generic df99 binding once the data lives under the new name.
	remove(df99)
	```