# swuyts/Parse_wiki_World_Cup18.R Secret

## Parse_wiki_World_Cup18.R
library(tidyverse)
library(rvest)

# Download and parse the Wikipedia page; every later step queries `site`
site <- "https://en.wikipedia.org/wiki/2018_FIFA_World_Cup_squads" %>%
  read_html()

# Parse website for player tables.
# html_table() is applied to the whole document, and the result is indexed as
# a list of tables; only the first 32 (one per team, per the original author)
# are kept.
players <- site %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned
  html_table(fill = TRUE) %>%
  .[1:32] # Keep only the tables related to the 32 teams

# Team names: text of the ".mw-headline" elements found inside <h3> headings,
# limited to the first 32 matches (one per team)
teams <- html_text(html_nodes(site, "h3 .mw-headline"))[seq_len(32)]

# Coach names: take the text of the paragraphs that directly follow an <h3>
# heading, keep the first 32 matches, drop the literal "Coach: " label and
# trim whitespace from both ends of each string
coaches <- str_trim(
  str_replace_all(
    html_text(html_nodes(site, "h3+ p"))[seq_len(32)],
    "Coach: ",
    ""
  )
)

# Parse website to figure out in which group each team competes.
# The first 8 ".mw-headline" texts inside <h2> headings are taken as the group
# names. Each name is repeated 4 times in place so the vector has one entry
# per team (8 * 4 = 32). Using `each = 4` keeps the headings in page order
# instead of relying on an alphabetical sort to restore it, and does not drop
# NA entries the way sort() does.
# NOTE(review): assumes the teams are listed group by group, four per group,
# in the same order as the group headings -- confirm against the page.
group <- site %>%
  html_nodes("h2 .mw-headline") %>%
  html_text() %>%
  .[1:8] %>% # Keep only the first 8 hits
  rep(each = 4) # One group label per team, aligned with the team vector

# Now that we have all of the pieces separately, combine them into one table.
# The tibble starts with one row per team, the squad being a nested data frame
# in the `player` list-column; unnesting expands that column's tables into
# rows, repeating team, coach and group alongside them.
table <- tibble(
  team = teams,
  coach = coaches,
  group = group,
  player = players
) %>%
  # Name the list-column explicitly: calling unnest() without columns is
  # deprecated since tidyr 1.0.0 and emits a warning
  unnest(player) %>%
  rename(
    position = `Pos.`,
    age = `Date of birth (age)`
  ) %>%
  mutate(
    # Keep characters 2-3 of the parsed position (original note: fixes a
    # parsing error; presumably the cell text carries a one-character prefix
    # before the two-letter position code -- verify against the page)
    position = str_sub(position, 2, 3),
    # Take the 4th-last through 2nd-last characters of the date-of-birth text
    # and convert them to integer (presumably the text ends in "(aged NN)" --
    # verify against the page)
    age = as.integer(str_sub(age, -4, -2))
  )
	library(tidyverse)
	library(rvest)

	# Download and parse the Wikipedia page; every later step queries `site`
	site <- "https://en.wikipedia.org/wiki/2018_FIFA_World_Cup_squads" %>%
	  read_html()

	# Parse website for player tables.
	# html_table() is applied to the whole document, and the result is indexed as
	# a list of tables; only the first 32 (one per team, per the original author)
	# are kept.
	players <- site %>%
	  # Spell out TRUE: `T` is an ordinary variable that can be reassigned
	  html_table(fill = TRUE) %>%
	  .[1:32] # Keep only the tables related to the 32 teams

	# Team names: text of the ".mw-headline" elements found inside <h3> headings,
	# limited to the first 32 matches (one per team)
	teams <- html_text(html_nodes(site, "h3 .mw-headline"))[seq_len(32)]

	# Coach names: take the text of the paragraphs that directly follow an <h3>
	# heading, keep the first 32 matches, drop the literal "Coach: " label and
	# trim whitespace from both ends of each string
	coaches <- str_trim(
	  str_replace_all(
	    html_text(html_nodes(site, "h3+ p"))[seq_len(32)],
	    "Coach: ",
	    ""
	  )
	)

	# Parse website to figure out in which group each team competes.
	# The first 8 ".mw-headline" texts inside <h2> headings are taken as the group
	# names. Each name is repeated 4 times in place so the vector has one entry
	# per team (8 * 4 = 32). Using `each = 4` keeps the headings in page order
	# instead of relying on an alphabetical sort to restore it, and does not drop
	# NA entries the way sort() does.
	# NOTE(review): assumes the teams are listed group by group, four per group,
	# in the same order as the group headings -- confirm against the page.
	group <- site %>%
	  html_nodes("h2 .mw-headline") %>%
	  html_text() %>%
	  .[1:8] %>% # Keep only the first 8 hits
	  rep(each = 4) # One group label per team, aligned with the team vector

	# Now that we have all of the pieces separately, combine them into one table.
	# The tibble starts with one row per team, the squad being a nested data frame
	# in the `player` list-column; unnesting expands that column's tables into
	# rows, repeating team, coach and group alongside them.
	table <- tibble(
	  team = teams,
	  coach = coaches,
	  group = group,
	  player = players
	) %>%
	  # Name the list-column explicitly: calling unnest() without columns is
	  # deprecated since tidyr 1.0.0 and emits a warning
	  unnest(player) %>%
	  rename(
	    position = `Pos.`,
	    age = `Date of birth (age)`
	  ) %>%
	  mutate(
	    # Keep characters 2-3 of the parsed position (original note: fixes a
	    # parsing error; presumably the cell text carries a one-character prefix
	    # before the two-letter position code -- verify against the page)
	    position = str_sub(position, 2, 3),
	    # Take the 4th-last through 2nd-last characters of the date-of-birth text
	    # and convert them to integer (presumably the text ends in "(aged NN)" --
	    # verify against the page)
	    age = as.integer(str_sub(age, -4, -2))
	  )