Skip to content

Instantly share code, notes, and snippets.

@irudnyts
Last active September 8, 2017 15:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save irudnyts/9919fd110dabeea41c12894f2275adf9 to your computer and use it in GitHub Desktop.
Save irudnyts/9919fd110dabeea41c12894f2275adf9 to your computer and use it in GitHub Desktop.
# Session setup ----
# Keep strings as character vectors in data.frame(); this is already the
# default from R 4.0 on, the option only matters for older R versions.
options(stringsAsFactors = FALSE)
library("rvest")
library("stringr")
# Deliberately no `rm(list = ls())` here: it would delete every object in the
# caller's global environment, while still not giving a clean session (options
# and attached packages survive). Run the script in a fresh R session instead.
#' Check whether all arguments have the same length
#'
#' @param ... Any number of R objects (vectors, lists, ...).
#' @return A single `TRUE` if every argument has the same `length()`, `FALSE`
#'   otherwise. Calling it with no arguments, or with one argument, gives
#'   `TRUE`.
equal_lengths <- function(...) {
  # lengths() always returns an integer vector, unlike sapply(), whose return
  # type depends on its input (e.g. an empty list for no arguments).
  sizes <- lengths(list(...))
  all(sizes == sizes[1L])
}
# Result-page URLs ----
# Each URL is url_part1, followed by the page number, followed by url_part2.
url_part1 <- "https://www.immobilienscout24.de/Suche/S-T/P-"
url_part2 <- "/Wohnung-Miete/Nordrhein-Westfalen/Dortmund?pagerReporting=true"
# Page numbers 1 to 45, one URL per page.
pages <- seq_len(45L)
urls <- sprintf("%s%d%s", url_part1, pages, url_part2)
# Scrape the result pages ----
# Empty accumulator with one character column per scraped field. The columns
# are converted to numbers later; stringsAsFactors is set explicitly so the
# result does not depend on a global option.
property <- data.frame(price = character(),
                       area = character(),
                       rooms = character(),
                       address = character(),
                       stringsAsFactors = FALSE)

# Upper bound on fetches per page. Without it, a page whose four fields never
# line up (or that cannot be downloaded) would be requested forever.
max_attempts <- 5L
attempts <- 0L
# One data frame per successfully scraped page; combined once after the loop
# instead of growing `property` with rbind() on every iteration.
page_tables <- list()

while (length(urls) > 0) {
  link <- urls[1]
  print(link)
  attempts <- attempts + 1L

  # A failed download counts as an unsuccessful attempt instead of aborting
  # the whole run and losing the pages scraped so far.
  immo <- tryCatch(
    read_html(link),
    error = function(e) {
      warning("Could not read ", link, ": ", conditionMessage(e),
              call. = FALSE, immediate. = TRUE)
      NULL
    }
  )

  page_ok <- FALSE
  if (!is.null(immo)) {
    price <- immo %>%
      html_nodes(".result-list-entry__primary-criterion:nth-child(1) .font-line-xs") %>%
      html_text()
    area <- immo %>%
      html_nodes(".result-list-entry__primary-criterion:nth-child(2) .font-line-xs") %>%
      html_text()
    rooms <- immo %>%
      html_nodes(".result-list-entry__primary-criterion:nth-child(3) .font-line-xs") %>%
      html_text()
    # add <- immo %>%
    # html_nodes(".margin-bottom-xs") %>%
    # html_text()
    address <- immo %>%
      html_nodes("#listings .link-underline") %>%
      html_text()
    # The four vectors are bound column-wise, so they are only usable when
    # every selector matched the same number of nodes.
    page_ok <- equal_lengths(price, area, rooms, address)
  }

  if (page_ok) {
    page_tables[[length(page_tables) + 1L]] <-
      data.frame(price, area, rooms, address, stringsAsFactors = FALSE)
  } else if (attempts < max_attempts) {
    # Leave the URL at the front of the queue and fetch it again.
    next
  } else {
    warning("Skipping ", link, " after ", max_attempts,
            " unsuccessful attempts", call. = FALSE, immediate. = TRUE)
  }

  # This URL is finished (scraped or given up on): move on to the next one.
  urls <- urls[-1]
  attempts <- 0L
}

# Stack the per-page frames under the empty accumulator in a single call.
property <- do.call(rbind, c(list(property), page_tables))
# Tidy the scraped text columns ----

# price: remove the " €" marker, remove "." (presumably thousands separators),
# turn the decimal comma into a point, then convert to numeric.
property$price <- as.numeric(
  gsub(",", ".",
       gsub(".", "",
            gsub(" \u20AC", "", property$price, fixed = TRUE),
            fixed = TRUE),
       fixed = TRUE)
)

# area: remove the " m²" marker, decimal comma -> point, convert to numeric.
property$area <- as.numeric(
  gsub(",", ".",
       gsub(" m\u00B2", "", property$area, fixed = TRUE),
       fixed = TRUE)
)

# rooms: decimal comma -> point, convert to numeric.
property$rooms <- as.numeric(gsub(",", ".", property$rooms, fixed = TRUE))

# part: the text between the first ", " and the following "," of the address
# (capture group 1 of the pattern, i.e. column 2 of the match matrix).
address_match <- str_match(property$address, "(?<=, )(.+?),")
property$part <- address_match[, 2]

# Addresses where the pattern did not match: use the address itself with
# ", Dortmund" removed.
part_missing <- is.na(property$part)
property$part[part_missing] <-
  gsub(", Dortmund", "", property$address[part_missing])
# Check for duplicates and NAs, then export ----
# The "# >" comments are the counts recorded next to the original code.
first_occurrence <- !duplicated(property)
complete_row <- complete.cases(property)

# Total number of scraped rows.
nrow(property)
# > 900
# Number of distinct rows.
sum(first_occurrence)
# > 889
# Number of rows without any NA.
sum(complete_row)
# > 889

# Keep rows that are both the first occurrence of their values and complete.
property <- property[first_occurrence & complete_row, ]
nrow(property)
# > 888

write.csv(property, file = "dortmund.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment