# erikgregorywebb/stack-overflow-scrape-clean.R

## stack-overflow-scrape-clean.R
# Cleans so-salaries.csv and writes a long-format copy, so-salaries-clean.csv,
# holding one row per input row and percentile column.
#
# Both files live in `data_dir`. The workspace is not cleared and the working
# directory is not changed, so running this script leaves the caller's session
# intact apart from the objects it creates.
data_dir <- "~/Documents/Python/so"

library(tidyverse)

# import
so <- read.csv(
  file.path(data_dir, "so-salaries.csv"),
  stringsAsFactors = FALSE
)

# clean
percentile_cols <- c("25th_Percentile", "50th_Percentile", "75th_Percentile")
colnames(so) <- c(
  "Location", "Education", "Years of Experience",
  percentile_cols
)

# Education arrives as the codes 1-4. Any other non-missing value is labelled
# "None"; missing values stay NA.
education_labels <- c(
  "Less than Bachelors",
  "Bachelors Degree",
  "Graduate Degree",
  "Post-Graduate Degree"
)
education_idx <- match(so$Education, seq_along(education_labels))
education <- education_labels[education_idx]
education[is.na(education_idx) & !is.na(so$Education)] <- "None"
so$Education <- education

# Strip thousands separators so the salary columns parse as numbers.
so[percentile_cols] <- lapply(
  so[percentile_cols],
  function(x) as.numeric(gsub(",", "", x, fixed = TRUE))
)

# Keep only the text before the first comma in Location.
so$Location <- gsub(",.*$", "", so$Location)

# reshape
# tidyr::gather() is superseded by pivot_longer(); it is kept here so the row
# order of the exported file is unchanged.
so <- so %>%
  gather(
    "Percentile", "Salary",
    -Location, -Education, -`Years of Experience`
  )

# Shorten the former column names to their first four characters,
# e.g. "25th_Percentile" becomes "25th".
so$Percentile <- substr(so$Percentile, 1, 4)

# export
write.csv(
  so,
  file.path(data_dir, "so-salaries-clean.csv"),
  row.names = FALSE
)