Skip to content

Instantly share code, notes, and snippets.

@uribo
Created September 7, 2019 00:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uribo/932fda5c5713c486cd8ef8935a060ff2 to your computer and use it in GitHub Desktop.
Save uribo/932fda5c5713c486cd8ef8935a060ff2 to your computer and use it in GitHub Desktop.
キャッシュレス消費者還元事業登録リストのデータフレーム化
library(tidyverse)
library(tabulizer)
#' Final clean-up shared by all member-store tables
#'
#' Drops rows whose `No.` is missing, tags the `還元率` column with the
#' unit "%", orders the rows by `No.` and returns the result as a tibble.
#'
#' @param data A data frame with at least the columns `No.` and `還元率`.
#' @return A tibble sorted by `No.` whose `還元率` column carries units.
tweak_kameiten_df <- function(data) {
  # Rows without a running number are dropped first.
  numbered <- dplyr::filter(data, !is.na(`No.`))
  with_units <- dplyr::mutate(
    numbered,
    還元率 = units::set_units(還元率, "%")
  )
  ordered <- dplyr::arrange(with_units, `No.`)
  tibble::as_tibble(ordered)
}
#' One-row, all-NA placeholder with the fixed-store column layout
#'
#' Used by `extract_kameiten()` in place of a table layout that produced no
#' data, so that the two layouts can still be combined with `rbind()`.
#'
#' @return A one-row tibble with seven columns: `No.` (integer), five
#'   character columns and `還元率` (double), every value missing.
kameiten_na <- function() {
  tibble::tibble(
    `No.` = NA_integer_,
    `都道府県` = NA_character_,
    `市区町村` = NA_character_,
    `事業所名_屋号` = NA_character_,
    `業種_大` = NA_character_,
    `業種_小` = NA_character_,
    `還元率` = NA_real_
  )
}
#' Extract the member-store list from a range of PDF pages
#'
#' Reads every table on pages `start_page` to `end_page` with
#' `tabulizer::extract_tables()`, removes empty columns, and maps the two
#' table layouts handled here onto one seven-column schema (`No.`, `都道府県`,
#' `市区町村`, `事業所名_屋号`, `業種_大`, `業種_小`, `還元率`) before passing
#' the combined rows to `tweak_kameiten_df()`.
#'
#' * 6-column tables: the first column holds `No.` and `都道府県` joined by
#'   whitespace and is split in two.
#' * 5-column tables: as above, and in addition the column `事業所名.屋号.`
#'   is split at its last whitespace character into `事業所名_屋号` and
#'   `業種_大` (if it contains no whitespace, both receive the whole string).
#'
#' Tables with any other column count cannot be mapped onto the schema and
#' are skipped; a warning reports how many were skipped.
#'
#' @param path Path to the PDF file.
#' @param start_page,end_page First and last page to read; the page sequence
#'   is built with `seq.int(start_page, end_page)`.
#' @return The result of `tweak_kameiten_df()` applied to the combined rows.
extract_kameiten <- function(path, start_page, end_page) {
  col_names <- c("No.", "都道府県", "市区町村", "事業所名_屋号",
                 "業種_大", "業種_小", "還元率")

  tables <-
    tabulizer::extract_tables(file = path,
                              pages = seq.int(start_page, end_page),
                              output = "data.frame") %>%
    purrr::map(~ janitor::remove_empty(.x, "cols"))

  n_cols <- vapply(tables, ncol, integer(1))

  # Previously these tables vanished without a trace; make the loss visible.
  skipped <- !(n_cols %in% c(5L, 6L))
  if (any(skipped)) {
    warning(
      sum(skipped),
      " table(s) skipped: column count after removing empty columns was ",
      "not 5 or 6 (found: ",
      paste(sort(unique(n_cols[skipped])), collapse = ", "),
      ")",
      call. = FALSE
    )
  }

  df_a <- tables[n_cols == 6L]
  df_b <- tables[n_cols == 5L]

  if (length(df_a) > 0) {
    pages_a <-
      purrr::map(
        df_a,
        ~ tibble::as_tibble(.x) %>%
          purrr::set_names(paste0("tmp", seq_len(6))) %>%
          tidyr::separate(col = tmp1,
                          into = c("No.", "都道府県"),
                          sep = "[[:space:]]+")
      )
    # One rbind over all pages instead of a pairwise reduce, which re-copies
    # the growing result on every step.
    df_a <-
      do.call(rbind, pages_a) %>%
      purrr::set_names(col_names) %>%
      readr::type_convert(col_types = "icccccn")
  } else {
    df_a <- kameiten_na()
  }

  if (length(df_b) > 0) {
    pages_b <-
      purrr::map(
        df_b,
        ~ tibble::as_tibble(.x) %>%
          tidyr::separate(col = `No..都道府県`,
                          into = c("No.", "都道府県"),
                          sep = "[[:space:]]+") %>%
          dplyr::mutate(
            # Greedy match: everything before the last whitespace character.
            `事業所名_屋号` = stringr::str_replace(`事業所名.屋号.`, "(.*)[[:space:]](.+)", "\\1"),
            # Everything after the last whitespace character.
            `業種_大` = stringr::str_replace(`事業所名.屋号.`, ".*[[:space:]](.+)", "\\1")
          )
      )
    df_b <-
      do.call(rbind, pages_b) %>%
      dplyr::rename(業種_小 = 業種) %>%
      dplyr::select(`No.`, 都道府県, 市区町村, 事業所名_屋号, 業種_大, 業種_小, 還元率) %>%
      readr::type_convert(col_types = "icccccn")
  } else {
    df_b <- kameiten_na()
  }

  # The all-NA placeholder row (if any) is removed again by
  # tweak_kameiten_df(), which drops rows with a missing `No.`.
  rbind(df_a, df_b) %>%
    tweak_kameiten_df()
}
#' Extract an e-commerce member list from a range of PDF pages
#'
#' Reads every table on pages `start_page` to `end_page` with
#' `tabulizer::extract_tables()`, stacks them, names the columns `No.`,
#' `事業所名_屋号` and `還元率`, converts column types and passes the result
#' to `tweak_kameiten_df()`.
#'
#' @param path Path to the PDF file.
#' @param start_page,end_page First and last page to read; the page sequence
#'   is built with `seq.int(start_page, end_page)`.
#' @return The result of `tweak_kameiten_df()` applied to the stacked tables.
#'   An error is raised if no table could be extracted.
extract_kameiten_ec <- function(path, start_page, end_page) {
  tables <-
    tabulizer::extract_tables(path,
                              pages = seq.int(start_page, end_page),
                              output = "data.frame")

  # Fail with a clear message instead of an opaque error further down.
  if (length(tables) == 0) {
    stop(
      "No tables were extracted from pages ", start_page, "-", end_page,
      " of ", path,
      call. = FALSE
    )
  }

  # One rbind over all pages instead of a pairwise reduce, which re-copies
  # the growing result on every step.
  do.call(rbind, tables) %>%
    purrr::set_names(c("No.", "事業所名_屋号", "還元率")) %>%
    readr::type_convert(col_types = "icn") %>%
    tweak_kameiten_df()
}
# Source PDF; each section below reads one contiguous page range from it.
pdf_path <- "kameiten_touroku_list.pdf"

# 1 Fixed (physical) stores: pages 3 - 5536 ----
d01_kotei_tenpo <- extract_kameiten(
  path = pdf_path,
  start_page = 3,
  end_page = 5536
)

# 2 Rakuten Ichiba: pages 5537 - 5904 ----
d02_rakuten <- extract_kameiten_ec(
  path = pdf_path,
  start_page = 5537,
  end_page = 5904
)

# 3 Yahoo! Shopping: pages 5905 - 6245 ----
d03_yahoo <- extract_kameiten_ec(
  path = pdf_path,
  start_page = 5905,
  end_page = 6245
)

# 4 Other e-commerce sites: pages 6246 - 6360 ----
d04_other <- extract_kameiten_ec(
  path = pdf_path,
  start_page = 6246,
  end_page = 6360
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment