GeorgeOduor/descriptive stats

## descriptive stats
---
title: "Basic Statistics"
author: "George"
date: "8/2/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Introduction

### Data

```{r libs}
suppressWarnings(suppressPackageStartupMessages(library(tidyverse)))
```

```{r data}
# Load the mpg data set into the workspace.
data(mpg)
```

### Descriptive Statistics.

```{r}
# Per-column summary of mpg.
summary(mpg)
```

```{r}
# Column names; for a data frame names() and colnames() return the same values.
names(mpg)
colnames(mpg)
```

```{r}
# Compact overview: one line per column showing its type and first values.
mpg %>% glimpse()
```

```{r}
# Number of rows and number of columns.
dim(mpg)
```

```{r}
# kableExtra supplies kable_styling() for nicely formatted tables.
library(kableExtra)

# Plain printout of the first rows of mpg.
head(mpg)

# The same rows rendered as a styled table.
mpg %>%
  head() %>%
  kable() %>%
  kable_styling()
```


### Univariate Analysis

Which manufacturer has the highest number of cars?

```{r}
# Count the cars of each manufacturer (largest first) and draw the counts as
# horizontal bars ordered by count.
mpg %>%
  count(manufacturer, sort = TRUE, name = "Count") %>%
  ggplot(aes(x = reorder(manufacturer, Count), y = Count)) +
  geom_col() +
  coord_flip()
```
What is the model of the latest car? Which company produced it?

```{r}
# Keep only the rows whose year equals the most recent year in mpg.
subset1 <- mpg %>% filter(year == max(year))
# table(subset1$manufacturer, subset1$model)
# Attach vcd here; the chunks that follow use its Arthritis data.
library(vcd)
```

```{r}
# One-way frequency table of the Improved column of Arthritis.
mytable = table(Arthritis$Improved)
mytable
```


```{r}
# Express the counts as percentages of the total, rounded to 2 decimals.
prop = round(prop.table(mytable)*100,2)
prop
```


```{r}
# Two-way table: Sex in rows, Improved in columns.
mytable2 <- table(Arthritis$Sex,Arthritis$Improved)
mytable2
```

```{r propotions}
# props (TODO: this chunk does not compute anything yet)

```

```{r}
# The same Sex-by-Improved cross-tabulation, built with the formula interface.
xtabs(~ Sex+Improved,data = Arthritis)
```

```{r}
# Append row and column totals to the two-way table.
addmargins(mytable2)
```


```{r}

```


## ggplot2
# List the objects currently in the workspace.
ls()
# NOTE(review): this removes every object from the workspace, including
# anything created by code run earlier in the session - confirm it is intended.
rm(list = ls())

# Dataframe

# Data Visualisations
# Attach the tidyverse packages (ggplot2, dplyr and the %>% pipe are used below).
library(tidyverse)
# ggplot syntax
# Load the built-in mtcars data set into the workspace.
data("mtcars")


# Bar chart of the number of cars per cylinder count in mtcars.
mtcars %>%
  # Turn cyl into a readable axis label such as "4 cylinders".
  mutate(cyl = paste(cyl, "cylinders")) %>%
  ggplot(aes(x = cyl)) +
  # A constant colour is set on the layer, not inside aes(): inside aes() the
  # string "red" is treated as a data value, so the bars are drawn in a default
  # palette colour and a legend is produced.
  geom_bar(fill = "red") +
  labs(
    title = "Mtcars Cylinders", x = "Cylinders", y = "Count",
    subtitle = "Nigeria", caption = "Source:Analytics Department 2021"
  ) +
  theme_gray() +
  # Text sizes are given as numbers rather than strings.
  theme(
    plot.title = element_text(colour = "orange", size = 15, hjust = 0.5),
    plot.subtitle = element_text(colour = "orange", size = 10, hjust = 0.5),
    plot.caption = element_text(face = "italic", hjust = 1, size = 5)
  )


# Distribution chart: histogram of mpg in 20 bins, filled by vs, with one
# panel per cyl value.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(aes(fill = as.character(vs)), bins = 20) +
  facet_grid(. ~ cyl)


# Correlation: scatter plot of wt against drat, coloured by vs, with one
# panel per cyl value.
ggplot(mtcars, aes(x = drat, y = wt)) +
  geom_point(aes(colour = as.character(vs))) +
  facet_wrap(. ~ cyl)

## r basiscs

# Scalars
age <- 29
height <- 1.7
age.height <- age / height

# Vectors: every element of an atomic vector shares one type.
student1 <- c(age, height)
student2 <- c(78, 1.9)

# Mixing numbers and strings in c() coerces everything to character.
large_vector <- c(student1, student2, "Nairobi", "Kampala")

# A list keeps each element as it is, so the numeric vectors stay numeric.
large_vector2 <- list(student1, student2, "Nairobi", "Kampala")

# Accessing items: first value of the first list element, then the first
# element of the character vector.
large_vector2[[1]][1]
large_vector[1]

# Data types:
#   numeric, character, boolean, Date

# Coercion: compare the storage type of the vector with that of the list.
typeof(large_vector)
typeof(large_vector2)

# Basic string operations
toupper("George")
tolower("George")
paste("My name is", "George", sep = "_")
paste0("My name is", "George")
sprintf("Working with str %s", 3445345)

# Control structures
# Comparisons
2 == 3
"hello" != "hell"
23 < 56
# Report how age compares with 29. Equality gets its own branch; a bare
# `else` after `age < 29` would also report "greater" when age is exactly 29.
if (age < 29) {
  print("Age is less than 29")
} else if (age > 29) {
  print("Age is greater  than 29")
} else {
  print("Age is exactly 29")
}
num <- 12
# Two independent checks: both messages print when both conditions hold.
if (num > 5) {
  print("Bigger than 5")
}
if (num > 10) {
  print("Bigger than 10")
}
# Label num by the largest threshold it exceeds. The stricter test comes first
# so it is reachable, and values of 5 or less get their own label. A plain
# if/else is used because num is a single value, not a vector.
myvar <- if (num > 6) {
  "Bigger than 6"
} else if (num > 5) {
  "Bigger than 5"
} else {
  "Not bigger than 5"
}

# Print each element of student1 in turn.
for (i in student1) print(i)

# Starting from 56, print and add 10 until x has moved past 56.
x <- 56
repeat {
  if (x > 56) {
    break
  }
  print("ddsff")
  x <- x + 10
}

## r basiscs3

golem::detach_all_attached()

library(tidyverse)
library(openxlsx)

# files and directories
# Create the class directory only when it is missing, so re-running the script
# does not raise an "already exists" warning.
if (!dir.exists("E:/Classes")) {
  dir.create("E:/Classes")
}
dir.exists("E:/Classes")
# NOTE(review): this changes the working directory for the rest of the
# session; the relative file names below are resolved against it.
setwd("E:/Classes")
# write excel file
# The built-in EuStockMarkets is a multivariate time series (a matrix) and
# mutate() only works on data frames, so convert it before adding the column.
EuStockMarkets <- EuStockMarkets %>%
  as.data.frame() %>%
  mutate(savedate = Sys.Date())
write.xlsx(x = EuStockMarkets, file = "EuStockMarkets.xlsx")
# Read the workbook back twice: once with openxlsx (keeping the result) and
# once with readxl (printed only).
from_folder <- read.xlsx("EuStockMarkets.xlsx", detectDates = TRUE)
readxl::read_excel("EuStockMarkets.xlsx")

# Interactive three-operation calculator.
#
# Prints a menu, reads the chosen operation and two numbers, and returns a
# sentence describing the result.
#
# Args:
#   read_input: function that is called with a prompt string and returns the
#     answer as a string. Defaults to readline(); pass another function to
#     drive the calculator without a console.
#
# Returns:
#   A single string, e.g. "The sum of  2 and 3 is 5".
#
# Errors:
#   Stops with a descriptive message when the selection is not 1, 2 or 3, or
#   when either value is not a number.
my_simple_calc <- function(read_input = readline) {
  cat(
    "What do u want to do ?\n",
    "Select :\n",
    "1. to add\n",
    "2. to subtract\n",
    "3. to multiply\n",
    sep = ""
  )

  # as.numeric() gives NA (plus a warning) for non-numeric text; the NA is
  # checked explicitly below, so the warning itself is suppressed.
  selection <- suppressWarnings(as.numeric(read_input("Selection:")))
  if (is.na(selection) || !selection %in% 1:3) {
    stop("Selection must be 1, 2 or 3.", call. = FALSE)
  }

  num1 <- suppressWarnings(as.numeric(read_input("Value 1:")))
  num2 <- suppressWarnings(as.numeric(read_input("Value 2:")))
  if (is.na(num1) || is.na(num2)) {
    stop("Both values must be numbers.", call. = FALSE)
  }

  # selection is known to be 1, 2 or 3 here, so positional switch() always
  # matches one of the three alternatives.
  switch(
    selection,
    paste("The sum of ", num1, "and", num2, "is", num1 + num2),
    paste("The difference between ", num1, "and", num2, "is", num1 - num2),
    paste("The product of ", num1, "and", num2, "is", num1 * num2)
  )
}
my_simple_calc()

# DEBUGGING

## r_basics_data_frame
# create a simple calculator simulation

# Dataframe

# rectangular format

# - rows: observations
# - columns: variables (measurable quantities; numeric, categorical, or ordinal)

# Two-row example data frame with a first-name and a last-name column.
sampledataframe <- data.frame(
  fname = c("George", "Oduor"),
  lname = c("William", "Jane")
)
# Built-in data frame: print it in full.
mtcars

# Basic operations
# Shape: number of rows and columns.
dim(mtcars)

# Column names (names() and colnames() agree for a data frame), then row names.
names(mtcars)
colnames(mtcars)
rownames(mtcars)

# Filtering rows with a logical condition evaluated on the columns.
hp_above_200 <- mtcars[with(mtcars, hp > 200), ]
hp_above_100_less_6 <- mtcars[with(mtcars, hp > 100 & cyl < 6), ]

library(tidyverse)
# Convert to a tibble, moving the row names into a "Cars" column.
mtcars <- as_tibble(mtcars, rownames = "Cars")

# Selecting columns
colnames(mtcars)
x_m_c <- mtcars %>% select(Cars, mpg, disp, gear)
no_gears <- mtcars %>% select(-gear)

# mutate: the result is only printed here; x_m_c itself is not modified.
# mpg_gear is built from the already re-labelled gear column.
x_m_c %>%
  mutate(
    gear = paste(gear, "gears"),
    dispsq = disp^2,
    disp_mpg = disp + mpg,
    mpg_gear = paste(mpg, gear, sep = "_")
  )

# Base R way of adding the squared-displacement column; this one is stored.
x_m_c[["dispsq"]] <- x_m_c$disp^2
# mutate_at(x_m_c, .vars = c('mpg', 'disp'), .funs = function(x) sqrt(x))

x_m_c

x_m_c %>% mutate(test = 5 + 5)

# Add a derived column, then keep cars with mpg above 20 whose name does not
# contain "Toyota" (case-insensitive).
x_m_c %>%
  mutate(test = 5 + mpg) %>%
  filter(mpg > 20, !grepl("Toyota", Cars, ignore.case = TRUE))
	---
	title: "Basic Statistics"
	author: "George"
	date: "8/2/2021"
	output: html_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	```

	## Introduction

	### Data

	```{r libs}
	suppressWarnings(suppressPackageStartupMessages(library(tidyverse)))
	```

	```{r data}
	# Load the mpg data set into the workspace.
	data(mpg)
	```

	### Descriptive Statistics.

	```{r}
	# Per-column summary of mpg.
	summary(mpg)
	```

	```{r}
	# Column names; for a data frame names() and colnames() return the same values.
	names(mpg)
	colnames(mpg)
	```

	```{r}
	# Compact overview: one line per column showing its type and first values.
	mpg %>% glimpse()
	```

	```{r}
	# Number of rows and number of columns.
	dim(mpg)
	```

	```{r}
	# kableExtra supplies kable_styling() for nicely formatted tables.
	library(kableExtra)

	# Plain printout of the first rows of mpg.
	head(mpg)

	# The same rows rendered as a styled table.
	mpg %>%
	  head() %>%
	  kable() %>%
	  kable_styling()
	```


	### Univariate Analysis

	Which manufacturer has the highest number of cars?

	```{r}
	# Count the cars of each manufacturer (largest first) and draw the counts as
	# horizontal bars ordered by count.
	mpg %>%
	  count(manufacturer, sort = TRUE, name = "Count") %>%
	  ggplot(aes(x = reorder(manufacturer, Count), y = Count)) +
	  geom_col() +
	  coord_flip()
	```
	What is the model of the latest car? Which company produced it?

	```{r}
	# Keep only the rows whose year equals the most recent year in mpg.
	subset1 <- mpg %>% filter(year == max(year))
	# table(subset1$manufacturer, subset1$model)
	# Attach vcd here; the chunks that follow use its Arthritis data.
	library(vcd)
	```

	```{r}
	# One-way frequency table of the Improved column of Arthritis.
	mytable = table(Arthritis$Improved)
	mytable
	```


	```{r}
	# Express the counts as percentages of the total, rounded to 2 decimals.
	prop = round(prop.table(mytable)*100,2)
	prop
	```


	```{r}
	# Two-way table: Sex in rows, Improved in columns.
	mytable2 <- table(Arthritis$Sex,Arthritis$Improved)
	mytable2
	```

	```{r propotions}
	# props (TODO: this chunk does not compute anything yet)

	```

	```{r}
	# The same Sex-by-Improved cross-tabulation, built with the formula interface.
	xtabs(~ Sex+Improved,data = Arthritis)
	```

	```{r}
	# Append row and column totals to the two-way table.
	addmargins(mytable2)
	```


	```{r}

	```
	# List the objects currently in the workspace.
	ls()
	# NOTE(review): this removes every object from the workspace, including
	# anything created by code run earlier in the session - confirm it is intended.
	rm(list = ls())

	# Dataframe

	# Data Visualisations
	# Attach the tidyverse packages (ggplot2, dplyr and the %>% pipe are used below).
	library(tidyverse)
	# ggplot syntax
	# Load the built-in mtcars data set into the workspace.
	data("mtcars")


	# Bar chart of the number of cars per cylinder count in mtcars.
	mtcars %>%
	  # Turn cyl into a readable axis label such as "4 cylinders".
	  mutate(cyl = paste(cyl, "cylinders")) %>%
	  ggplot(aes(x = cyl)) +
	  # A constant colour is set on the layer, not inside aes(): inside aes() the
	  # string "red" is treated as a data value, so the bars are drawn in a default
	  # palette colour and a legend is produced.
	  geom_bar(fill = "red") +
	  labs(
	    title = "Mtcars Cylinders", x = "Cylinders", y = "Count",
	    subtitle = "Nigeria", caption = "Source:Analytics Department 2021"
	  ) +
	  theme_gray() +
	  # Text sizes are given as numbers rather than strings.
	  theme(
	    plot.title = element_text(colour = "orange", size = 15, hjust = 0.5),
	    plot.subtitle = element_text(colour = "orange", size = 10, hjust = 0.5),
	    plot.caption = element_text(face = "italic", hjust = 1, size = 5)
	  )



	# Distribution chart: histogram of mpg in 20 bins, filled by vs, with one
	# panel per cyl value.
	ggplot(mtcars, aes(x = mpg)) +
	  geom_histogram(aes(fill = as.character(vs)), bins = 20) +
	  facet_grid(. ~ cyl)


	# Correlation: scatter plot of wt against drat, coloured by vs, with one
	# panel per cyl value.
	ggplot(mtcars, aes(x = drat, y = wt)) +
	  geom_point(aes(colour = as.character(vs))) +
	  facet_wrap(. ~ cyl)

	# Scalars
	age <- 29
	height <- 1.7
	age.height <- age / height

	# Vectors: every element of an atomic vector shares one type.
	student1 <- c(age, height)
	student2 <- c(78, 1.9)

	# Mixing numbers and strings in c() coerces everything to character.
	large_vector <- c(student1, student2, "Nairobi", "Kampala")

	# A list keeps each element as it is, so the numeric vectors stay numeric.
	large_vector2 <- list(student1, student2, "Nairobi", "Kampala")

	# Accessing items: first value of the first list element, then the first
	# element of the character vector.
	large_vector2[[1]][1]
	large_vector[1]

	# Data types:
	#   numeric, character, boolean, Date

	# Coercion: compare the storage type of the vector with that of the list.
	typeof(large_vector)
	typeof(large_vector2)

	# Basic string operations
	toupper("George")
	tolower("George")
	paste("My name is", "George", sep = "_")
	paste0("My name is", "George")
	sprintf("Working with str %s", 3445345)

	# Control structures
	# Comparisons
	2 == 3
	"hello" != "hell"
	23 < 56
	# Report how age compares with 29. Equality gets its own branch; a bare
	# `else` after `age < 29` would also report "greater" when age is exactly 29.
	if (age < 29) {
	  print("Age is less than 29")
	} else if (age > 29) {
	  print("Age is greater than 29")
	} else {
	  print("Age is exactly 29")
	}
	num <- 12
	# Two independent checks: both messages print when both conditions hold.
	if (num > 5) {
	  print("Bigger than 5")
	}
	if (num > 10) {
	  print("Bigger than 10")
	}
	# Label num by the largest threshold it exceeds. The stricter test comes first
	# so it is reachable, and values of 5 or less get their own label. A plain
	# if/else is used because num is a single value, not a vector.
	myvar <- if (num > 6) {
	  "Bigger than 6"
	} else if (num > 5) {
	  "Bigger than 5"
	} else {
	  "Not bigger than 5"
	}

	# Print each element of student1 in turn.
	for (i in student1) print(i)

	# Starting from 56, print and add 10 until x has moved past 56.
	x <- 56
	repeat {
	  if (x > 56) {
	    break
	  }
	  print("ddsff")
	  x <- x + 10
	}

	golem::detach_all_attached()

	library(tidyverse)
	library(openxlsx)

	# files and directories
	# Create the class directory only when it is missing, so re-running the script
	# does not raise an "already exists" warning.
	if (!dir.exists("E:/Classes")) {
	  dir.create("E:/Classes")
	}
	dir.exists("E:/Classes")
	# NOTE(review): this changes the working directory for the rest of the
	# session; the relative file names below are resolved against it.
	setwd("E:/Classes")
	# write excel file
	# The built-in EuStockMarkets is a multivariate time series (a matrix) and
	# mutate() only works on data frames, so convert it before adding the column.
	EuStockMarkets <- EuStockMarkets %>%
	  as.data.frame() %>%
	  mutate(savedate = Sys.Date())
	write.xlsx(x = EuStockMarkets, file = "EuStockMarkets.xlsx")
	# Read the workbook back twice: once with openxlsx (keeping the result) and
	# once with readxl (printed only).
	from_folder <- read.xlsx("EuStockMarkets.xlsx", detectDates = TRUE)
	readxl::read_excel("EuStockMarkets.xlsx")

	# Interactive three-operation calculator.
	#
	# Prints a menu, reads the chosen operation and two numbers, and returns a
	# sentence describing the result.
	#
	# Args:
	#   read_input: function that is called with a prompt string and returns the
	#     answer as a string. Defaults to readline(); pass another function to
	#     drive the calculator without a console.
	#
	# Returns:
	#   A single string, e.g. "The sum of  2 and 3 is 5".
	#
	# Errors:
	#   Stops with a descriptive message when the selection is not 1, 2 or 3, or
	#   when either value is not a number.
	my_simple_calc <- function(read_input = readline) {
	  cat(
	    "What do u want to do ?\n",
	    "Select :\n",
	    "1. to add\n",
	    "2. to subtract\n",
	    "3. to multiply\n",
	    sep = ""
	  )

	  # as.numeric() gives NA (plus a warning) for non-numeric text; the NA is
	  # checked explicitly below, so the warning itself is suppressed.
	  selection <- suppressWarnings(as.numeric(read_input("Selection:")))
	  if (is.na(selection) || !selection %in% 1:3) {
	    stop("Selection must be 1, 2 or 3.", call. = FALSE)
	  }

	  num1 <- suppressWarnings(as.numeric(read_input("Value 1:")))
	  num2 <- suppressWarnings(as.numeric(read_input("Value 2:")))
	  if (is.na(num1) || is.na(num2)) {
	    stop("Both values must be numbers.", call. = FALSE)
	  }

	  # selection is known to be 1, 2 or 3 here, so positional switch() always
	  # matches one of the three alternatives.
	  switch(
	    selection,
	    paste("The sum of ", num1, "and", num2, "is", num1 + num2),
	    paste("The difference between ", num1, "and", num2, "is", num1 - num2),
	    paste("The product of ", num1, "and", num2, "is", num1 * num2)
	  )
	}
	my_simple_calc()

	# DEBUGGING
	# create a simple calculator simulation

	# Dataframe

	# rectangular format

	# - rows: observations
	# - columns: variables (measurable quantities; numeric, categorical, or ordinal)

	# Two-row example data frame with a first-name and a last-name column.
	sampledataframe <- data.frame(
	  fname = c("George", "Oduor"),
	  lname = c("William", "Jane")
	)
	# Built-in data frame: print it in full.
	mtcars

	# Basic operations
	# Shape: number of rows and columns.
	dim(mtcars)

	# Column names (names() and colnames() agree for a data frame), then row names.
	names(mtcars)
	colnames(mtcars)
	rownames(mtcars)

	# Filtering rows with a logical condition evaluated on the columns.
	hp_above_200 <- mtcars[with(mtcars, hp > 200), ]
	hp_above_100_less_6 <- mtcars[with(mtcars, hp > 100 & cyl < 6), ]

	library(tidyverse)
	# Convert to a tibble, moving the row names into a "Cars" column.
	mtcars <- as_tibble(mtcars, rownames = "Cars")

	# Selecting columns
	colnames(mtcars)
	x_m_c <- mtcars %>% select(Cars, mpg, disp, gear)
	no_gears <- mtcars %>% select(-gear)

	# mutate: the result is only printed here; x_m_c itself is not modified.
	# mpg_gear is built from the already re-labelled gear column.
	x_m_c %>%
	  mutate(
	    gear = paste(gear, "gears"),
	    dispsq = disp^2,
	    disp_mpg = disp + mpg,
	    mpg_gear = paste(mpg, gear, sep = "_")
	  )

	# Base R way of adding the squared-displacement column; this one is stored.
	x_m_c[["dispsq"]] <- x_m_c$disp^2
	# mutate_at(x_m_c, .vars = c('mpg', 'disp'), .funs = function(x) sqrt(x))

	x_m_c

	x_m_c %>% mutate(test = 5 + 5)

	# Add a derived column, then keep cars with mpg above 20 whose name does not
	# contain "Toyota" (case-insensitive).
	x_m_c %>%
	  mutate(test = 5 + mpg) %>%
	  filter(mpg > 20, !grepl("Toyota", Cars, ignore.case = TRUE))