Created
September 7, 2019 00:13
-
-
Save uribo/932fda5c5713c486cd8ef8935a060ff2 to your computer and use it in GitHub Desktop.
キャッシュレス消費者還元事業登録リストのデータフレーム化
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(tabulizer) | |
#' Final clean-up shared by the merchant-list extractors
#'
#' Drops rows whose `No.` is missing, tags the `還元率` (rebate rate) column
#' with the percent unit, sorts by `No.`, and returns a tibble.
#'
#' @param data A data frame containing at least the columns `No.` and `還元率`.
#' @return A tibble sorted by `No.` with `還元率` stored as a `units` vector.
tweak_kameiten_df <- function(data) {
  numbered <- dplyr::filter(data, !is.na(`No.`))
  # The "%" literal must stay inline: set_units() captures this argument
  # unevaluated in its default mode, so a variable would not work here.
  with_units <- dplyr::mutate(numbered, 還元率 = units::set_units(還元率, "%"))
  ordered <- dplyr::arrange(with_units, `No.`)
  tibble::as_tibble(ordered)
}
#' Placeholder merchant table
#'
#' Builds a one-row tibble with the seven merchant-list columns, every value
#' being a typed `NA` (integer `No.`, five character columns, double `還元率`).
#'
#' @return A 1 x 7 tibble containing only missing values.
kameiten_na <- function() {
  tibble::tibble(
    `No.` = NA_integer_,
    都道府県 = NA_character_,
    市区町村 = NA_character_,
    事業所名_屋号 = NA_character_,
    業種_大 = NA_character_,
    業種_小 = NA_character_,
    還元率 = NA_real_
  )
}
#' Extract the merchant table from a range of PDF pages
#'
#' Reads every table on the requested pages with `tabulizer`, removes empty
#' columns, and normalises two observed layouts into the same seven columns:
#' `No.`, `都道府県` (prefecture), `市区町村` (municipality),
#' `事業所名_屋号` (business name), `業種_大` / `業種_小` (major / minor
#' industry category) and `還元率` (rebate rate).
#'
#' * 6-column tables: the first column holds `No.` and the prefecture joined
#'   by whitespace and is split in two.
#' * 5-column tables: in addition, the business-name column also carries the
#'   major category after its last whitespace and is split there.
#'
#' Tables with any other column count cannot be mapped and are skipped with a
#' warning.
#'
#' @param path Path to the PDF file.
#' @param start_page,end_page First and last page to read (passed to
#'   `seq.int()`).
#' @return A tibble as produced by `tweak_kameiten_df()`.
extract_kameiten <- function(path, start_page, end_page) {
  col_names <- c("No.", "都道府県", "市区町村", "事業所名_屋号",
                 "業種_大", "業種_小", "還元率")

  # Bind all pages in one rbind() call. Folding pairwise re-copies the
  # accumulated rows on every step, which is quadratic over thousands of pages.
  bind_pages <- function(tables) {
    do.call(rbind, unname(tables))
  }

  tables <-
    tabulizer::extract_tables(file = path,
                              pages = seq.int(start_page, end_page),
                              output = "data.frame") %>%
    purrr::map(~ janitor::remove_empty(.x, "cols"))

  n_cols <- vapply(tables, ncol, integer(1))
  tables_6 <- tables[n_cols == 6L]
  tables_5 <- tables[n_cols == 5L]

  # Surface tables that match neither layout instead of losing them silently.
  n_skipped <- sum(!n_cols %in% c(5L, 6L))
  if (n_skipped > 0) {
    warning(
      n_skipped, " table(s) on pages ", start_page, "-", end_page,
      " had neither 5 nor 6 columns and were skipped.",
      call. = FALSE
    )
  }

  if (length(tables_6) > 0) {
    df_a <-
      tables_6 %>%
      purrr::map(function(tbl) {
        tibble::as_tibble(tbl) %>%
          purrr::set_names(paste0("tmp", seq_len(6))) %>%
          tidyr::separate(col = tmp1,
                          into = c("No.", "都道府県"),
                          sep = "[[:space:]]+")
      }) %>%
      bind_pages() %>%
      purrr::set_names(col_names) %>%
      readr::type_convert(col_types = "icccccn")
  } else {
    # Placeholder row; removed again by tweak_kameiten_df().
    df_a <- kameiten_na()
  }

  if (length(tables_5) > 0) {
    df_b <-
      tables_5 %>%
      purrr::map(function(tbl) {
        tibble::as_tibble(tbl) %>%
          tidyr::separate(col = `No..都道府県`,
                          into = c("No.", "都道府県"),
                          sep = "[[:space:]]+") %>%
          # Greedy match: text before the last whitespace is the business
          # name, text after it is the major category.
          # NOTE(review): a value without whitespace is copied unchanged into
          # both columns - confirm that is intended.
          dplyr::mutate(
            `事業所名_屋号` = stringr::str_replace(`事業所名.屋号.`, "(.*)[[:space:]](.+)", "\\1"),
            `業種_大` = stringr::str_replace(`事業所名.屋号.`, ".*[[:space:]](.+)", "\\1")
          )
      }) %>%
      bind_pages() %>%
      dplyr::rename(業種_小 = 業種) %>%
      dplyr::select(`No.`, 都道府県, 市区町村, 事業所名_屋号, 業種_大, 業種_小, 還元率) %>%
      readr::type_convert(col_types = "icccccn")
  } else {
    df_b <- kameiten_na()
  }

  rbind(df_a, df_b) %>%
    tweak_kameiten_df()
}
#' Extract a three-column merchant table from a range of PDF pages
#'
#' Reads every table on the requested pages with `tabulizer`, stacks them,
#' names the columns `No.`, `事業所名_屋号` (business name) and `還元率`
#' (rebate rate), converts the types, and applies `tweak_kameiten_df()`.
#'
#' @param path Path to the PDF file.
#' @param start_page,end_page First and last page to read (passed to
#'   `seq.int()`).
#' @return A tibble as produced by `tweak_kameiten_df()`.
extract_kameiten_ec <- function(path, start_page, end_page) {
  tables <-
    tabulizer::extract_tables(path,
                              pages = seq.int(start_page, end_page),
                              output = "data.frame")

  # Fail with an actionable message rather than an opaque binding error.
  if (length(tables) == 0) {
    stop(
      "No tables were extracted from pages ", start_page, "-", end_page,
      " of ", path, ".",
      call. = FALSE
    )
  }

  # One rbind() call over all pages; a pairwise fold re-copies the accumulated
  # rows at every step.
  do.call(rbind, unname(tables)) %>%
    purrr::set_names(c("No.", "事業所名_屋号", "還元率")) %>%
    readr::type_convert(col_types = "icn") %>%
    tweak_kameiten_df()
}
# Input PDF read by every extraction below.
pdf_path <- "kameiten_touroku_list.pdf"

# 1 Fixed (physical) stores: pages 3 - 5536 ----
d01_kotei_tenpo <- extract_kameiten(pdf_path, start_page = 3, end_page = 5536)

# 2 Rakuten Ichiba: pages 5537 - 5904 ----
d02_rakuten <- extract_kameiten_ec(pdf_path, start_page = 5537, end_page = 5904)

# 3 Yahoo! Shopping: pages 5905 - 6245 ----
d03_yahoo <- extract_kameiten_ec(pdf_path, start_page = 5905, end_page = 6245)

# 4 Other e-commerce sites: pages 6246 - 6360 ----
d04_other <- extract_kameiten_ec(pdf_path, start_page = 6246, end_page = 6360)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment