graebnerc/#S12 - Recap script

## #S12 - Recap script
This contains the script developed during the recap session.

## recap.R
here::i_am("R/recap.R")
library(dplyr)
library(data.table)
library(here)
library(tidyr)
library(ggplot2)

# How to deal with large files - some hints-----------
# Location of the raw GDP per capita CSV, built with here().
gdp_file <- here("data/raw/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_5359165.csv")
# First import with fread()'s defaults; only used for the inspection below.
gdp_data_raw <- data.table::fread(file = gdp_file)

# Use overview functions:
# (`gdp_data_raw` is the object created above; this script never defines a
# `gdp_data`, so the calls must refer to the raw import.)
head(gdp_data_raw, n = 2)
names(gdp_data_raw)
str(gdp_data_raw)
dplyr::glimpse(gdp_data_raw)

# Check the unique values of the columns:
unique(gdp_data_raw$`Indicator Name`)
unique(gdp_data_raw$V67)

# The overview suggests refining the import call. Starting from the file path:
#   1. read the CSV with header = TRUE so the column names are correct,
#   2. convert to a tibble, which facilitates printing,
#   3. remove the redundant columns.
gdp_data_raw <- gdp_file %>%
  data.table::fread(file = ., header = TRUE) %>%
  tibble::as_tibble() %>%
  select(-c("Country Name", "Indicator Name", "Indicator Code", "V67"))

# Then continue working with the data:
# every column except `Country Code` is stacked into name/value pairs, with
# the former column names stored in `year` and the values in `GDP_percapita`.
gdp_data_tidy <- tidyr::pivot_longer(
  data = gdp_data_raw,
  cols = -"Country Code",
  names_to = "year",
  values_to = "GDP_percapita"
)

# Country codes--------------
library(countrycode)
gdp_data_countrynames <- dplyr::mutate(
  gdp_data_tidy,
  # Convert the iso3c codes in `Country Code` to country names.
  countryname = countrycode::countrycode(
    sourcevar = `Country Code`,
    origin = "iso3c",
    destination = "country.name"
  ),
  # For manual correction do, e.g.:
  countryname = ifelse(`Country Code` == "WLD", "World", countryname)
)
head(gdp_data_countrynames)
# Reminder: check the result for potential duplicates!

# Scatter plot---------------
wine_data <- DataScienceExercises::wine2dine

# Alcohol on the x-axis, residual sugar on the y-axis, points colored by kind.
ggplot(wine_data, aes(x = alcohol, y = `residual sugar`, color = kind)) +
  geom_point() +
  theme_bw()
	here::i_am("R/recap.R")
	library(dplyr)
	library(data.table)
	library(here)
	library(tidyr)
	library(ggplot2)

	# How to deal with large files - some hints-----------
	# NOTE(review): this tab-indented part repeats the script further up line for
	# line (looks like a paste artifact) - confirm whether it can be dropped.
	gdp_file <- here("data/raw/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_5359165.csv")
	# First import with fread()'s defaults; only used for the inspection below.
	gdp_data_raw <- data.table::fread(file = gdp_file)

	# Use overview functions:
	# (`gdp_data_raw` is the object created above; this script never defines a
	# `gdp_data`, so the calls must refer to the raw import.)
	head(gdp_data_raw, n = 2)
	names(gdp_data_raw)
	str(gdp_data_raw)
	dplyr::glimpse(gdp_data_raw)

	# Check the unique values of the columns:
	unique(gdp_data_raw$`Indicator Name`)
	unique(gdp_data_raw$V67)

	# The overview suggests refining the import call:
	gdp_data_raw <- data.table::fread(
	  file = gdp_file,
	  header = TRUE # To ensure the column names are correct
	  ) %>%
	  tibble::as_tibble(.) %>% # Facilitates printing
	  select( # Remove redundant columns
	    -c("Country Name",
	       "Indicator Name", "Indicator Code",
	       "V67")
	  )

	# Then continue working with the data:
	# every column except `Country Code` is stacked into `year`/`GDP_percapita`.
	gdp_data_tidy <- gdp_data_raw %>%
	  tidyr::pivot_longer(
	    cols = -"Country Code",
	    names_to = "year",
	    values_to = "GDP_percapita")

	# Country codes--------------
	library(countrycode)
	gdp_data_countrynames <- gdp_data_tidy %>%
	  dplyr::mutate(
	    countryname = countrycode::countrycode(
	      `Country Code`, origin = "iso3c", destination = "country.name")
	    ) %>%
	  dplyr::mutate(# For manual correction do, e.g.:
	    countryname = ifelse(`Country Code` == "WLD", "World", countryname)
	  )
	head(gdp_data_countrynames)
	# Check potential duplicates!

	# Scatter plot---------------
	wine_data <- DataScienceExercises::wine2dine

	ggplot(
	  data = wine_data,
	  mapping = aes(
	    y = `residual sugar`,
	    x = alcohol,
	    color = kind)
	  ) +
	  geom_point() +
	  theme_bw()