Scraping Google Places API

Setup.

library(googleway)
library(dplyr)
library(tibble)

DATA_DIR <- "path/to/dir"
dir.create(DATA_DIR, showWarnings = FALSE, recursive = TRUE)

## Expects the key in .Renviron or the shell environment
API_KEY <- Sys.getenv("GOOGLE_PLACES_API_KEY")

raw_my_location_pairs <- read.csv(file.path(DATA_DIR, "google-buffer-coordinates.csv"))
## googleway expects location as c(latitude, longitude), hence Y before X
my_location_pairs <- lapply(
  1:nrow(raw_my_location_pairs),
  function(i) {
    c(
      raw_my_location_pairs[i, "Y"],
      raw_my_location_pairs[i, "X"]
    )
  }
)
my_radius <- 1557 ## Radius in meters
my_types <- c("grocery_or_supermarket", "convenience_store")
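
The coordinate pairing above and the join in the clean-up step assume the CSV has OID_ (an id), X (longitude), and Y (latitude) columns; a quick guard makes that assumption explicit.

stopifnot(all(c("OID_", "X", "Y") %in% names(raw_my_location_pairs)))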

Ugly loops so that it's easier to follow for non-purrr people. (A rough purrr sketch of the same iteration appears after the loop.)

## tibble instead of plain data.frame because 0-row data.frames have awkward behavior
places <- tibble::tibble()


for (j in seq_along(my_location_pairs)) {
  location_pair <- my_location_pairs[[j]]
  for (type in my_types) {
    last_page_token <- NULL ## NULL on the first request; set from each response afterwards
    keep_going <- TRUE
    i <- 1 ## page counter, used in the cache file name
    while (isTRUE(keep_going)) {
      ## Save each response to disk so we don't have to re-scrape everything if
      ## something breaks in the middle of the run.
      hash <- paste0(sprintf('%03.0f', j), "-", type, "-", i)
      cached_file_path <- file.path(DATA_DIR, paste0(hash, ".rds"))
      if (file.exists(cached_file_path)) {
        res <- readRDS(cached_file_path)
      } else {
        message(paste0("Loop ", i, " for type ", type, " and location pair ", j, "."))
        ## Randomly wait 3 to 5 seconds to avoid rate limiting; this also covers
        ## the short delay before a freshly issued next_page_token becomes valid.
        Sys.sleep(runif(1, 3, 5))
        res <- googleway::google_places(
          location = location_pair,
          radius = my_radius,
          place_type = type,
          key = API_KEY,
          page_token = last_page_token
        )
        saveRDS(res, cached_file_path)
      }
      
      if (res$status == 'ZERO_RESULTS') {
        ## Nothing to bind and no next_page_token to follow, so stop here.
        break
      } else if (nrow(res$results) < 20) {
        ## A partial page (the API returns at most 20 results per page) means
        ## this was the last page.
        keep_going <- FALSE
      }
      
      i <- i + 1
      res$results$OID_ <- j
      res$results$type <- type
      places <- dplyr::bind_rows(places, res$results)
      ## Termination check: once every place for this type/location pair has
      ## shown up at least twice, we've cycled through all of the pages. The
      ## response keeps including a next_page_token, so if we don't end the
      ## loop ourselves it will keep going forever.
      place_counts <- places |>
        dplyr::filter(type == .env$type, OID_ == j) |>
        dplyr::count(place_id)
      
      n_dups <- place_counts |>
        dplyr::filter(n >= 2) |>
        nrow()
      
      if (n_dups == nrow(place_counts)) {
        keep_going <- FALSE
      }
      
      ## Feed the token back in to request the next page of results.
      last_page_token <- res$next_page_token
    }
  }
}
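
For purrr people, a rough sketch of the same iteration (untested; fetch_places_for() is a hypothetical wrapper around the cached pagination while-loop above, returning one tibble per location/type):

grid <- expand.grid(
  j = seq_along(my_location_pairs),
  type = my_types,
  stringsAsFactors = FALSE
)
places <- purrr::pmap(grid, \(j, type) {
  fetch_places_for(my_location_pairs[[j]], type, j) ## hypothetical helper
}) |>
  dplyr::bind_rows()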

Clean up.

distinct_places <- places |>
  dplyr::distinct(
    place_id,
    .keep_all = TRUE
  ) |>
  dplyr::left_join(
    raw_my_location_pairs |> 
      dplyr::select(
        OID_,
        X,
        Y
      ),
    by = dplyr::join_by(OID_)
  ) |> 
  tidyr::unnest_wider(geometry) |> 
  tidyr::unnest_wider(location) |> 
  dplyr::transmute(
    OID_,
    X,
    Y,
    lng,
    lat,
    type,
    place_id,
    name,
    vicinity,
    price_level,
    rating,
    user_ratings_total,
    all_api_types = purrr::map_chr(types, \(.x) paste0(sort(unique(.x)), collapse = ',')),
    dplyr::across(all_api_types, \(.x) ifelse(.x == '', NA_character_, .x))
  )
readr::write_csv(distinct_places, file.path(DATA_DIR, "all_places.csv"), na = "")

## Wide 0/1 indicator table of API types per place
distinct_places |> 
  dplyr::select(
    place_id,
    all_api_types
  ) |> 
  ## sep must be explicit: the default would also split on the underscores in type names
  tidyr::separate_rows(all_api_types, sep = ",") |> 
  dplyr::distinct(
    place_id,
    all_api_types
  ) |> 
  dplyr::mutate(value = 1L) |> 
  tidyr::pivot_wider(
    names_from = all_api_types,
    values_from = value,
    values_fill = 0L
  ) |>
  readr::write_csv(file.path(DATA_DIR, "all_api_types.csv"), na = "")
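
As a toy illustration with hypothetical values, the mutate-then-pivot step turns one row per place/type into 0/1 indicator columns:

toy <- tibble::tibble(
  place_id = c("a", "a", "b"),
  all_api_types = c("grocery_or_supermarket", "store", "convenience_store")
)
toy |>
  dplyr::mutate(value = 1L) |>
  tidyr::pivot_wider(names_from = all_api_types, values_from = value, values_fill = 0L)
#> place_id grocery_or_supermarket store convenience_store
#> a                             1     1                 0
#> b                             0     0                 1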