# Gist created March 21, 2023 12:34.
# Source: GitHub gist tukachev/62d4947e3025e3ae9b8664d1bbb20694.
# Note: this file contains bidirectional Unicode text that may be interpreted
# or compiled differently than what appears below. To review, open the file in
# an editor that reveals hidden Unicode characters.
# Packages ----
library(tidyverse)
# remotes::install_github(c("ropensci/tabulizerjars", "ropensci/tabulizer"), INSTALL_opts = "--no-multiarch")
library(tabulizer)
library(rvest)
library(googlesheets4)

# Locate and download the registry PDF ----
# Page whose description section links to the current registry PDF.
url <- "https://minjust.gov.ru/ru/activity/directions/942/"

# href attribute of every link found inside the section description.
links <- read_html(url) %>%
  html_elements("#section-description > div a") %>%
  html_attr("href")

# Build the absolute URL of the registry file. Exactly one distinct matching
# link is required: zero or several matches would otherwise hand
# download.file() a zero-length or multi-element URL.
pdf_url <- local({
  registry_links <- unique(
    links[str_which(links, "reestr-inostrannyih-agentov")]
  )
  if (length(registry_links) != 1L) {
    stop(
      "Expected exactly one registry PDF link on ", url, ", found ",
      length(registry_links), ".",
      call. = FALSE
    )
  }
  glue::glue("https://minjust.gov.ru{registry_links}")
})

# Binary download; a browser-like User-Agent header is sent with the request.
download.file(
  pdf_url,
  destfile = "reestr-inostrannyih-agentov.pdf",
  mode = "wb",
  headers = c("User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36")
)
# Read PDF metadata and derive the registry update date ----
get_n_pages("reestr-inostrannyih-agentov.pdf")
metadata <- extract_metadata("reestr-inostrannyih-agentov.pdf")

# %a and %b below need English weekday/month names.
Sys.setlocale("LC_TIME", 'en_US.UTF-8')

# The creation stamp presumably looks like "Tue Mar 21 12:34:00 YEKT 2023"
# (TODO confirm against tabulizer output). The zone token between the time and
# the year depends on the machine's time zone, so drop whatever token is there
# instead of hard-coding "YEKT".
date_update <- lubridate::as_date(strptime(
  sub(
    "^(\\S+ +\\S+ +\\S+ +\\S+) +\\S+ +([0-9]{4})$",
    "\\1 \\2",
    metadata$created
  ),
  format = "%a %b %d %T %Y"
))

# Fail early: an NA date would silently produce files named "reestr_NA...".
if (length(date_update) != 1L || is.na(date_update)) {
  stop(
    "Could not parse the PDF creation date from its metadata.",
    call. = FALSE
  )
}
# Extract every table from the PDF into CSV files under reestr_temp/ ----
extract_tables(
  # glue::glue("reestr_{format(date_update, '%d_%m_%Y')}.pdf"),
  "reestr-inostrannyih-agentov.pdf",
  # pages = 1,
  output = "csv",
  method = "lattice",
  outdir = here::here("reestr_temp")
)

# Read the extracted CSV files back and stack them into one table ----
df <- local({
  # Anchor the pattern so only names ending in ".csv" are picked up; the
  # original ".csv" was a regex matching any character followed by "csv".
  csv_paths <- list.files(
    path = here::here("reestr_temp"),
    pattern = "\\.csv$",
    full.names = TRUE
  )
  if (length(csv_paths) == 0L) {
    stop("No CSV tables were extracted into 'reestr_temp'.", call. = FALSE)
  }

  csv_paths %>%
    # Every column is read as text; files are decoded as cp1251.
    map(function(path) {
      read_csv(
        path,
        col_types = cols(.default = "c"),
        locale = locale(encoding = "cp1251")
      )
    }) %>%
    bind_rows() %>%
    # Replace carriage returns inside cell values with spaces. A lambda is
    # used because passing extra arguments through across() is deprecated.
    mutate(across(where(is.character), ~ str_replace_all(.x, "[\r]", " "))) %>%
    # Order rows by the numeric row-number column.
    mutate(`No п/п` = as.numeric(`No п/п`)) %>%
    arrange(`No п/п`)
})
# Delete the temporary CSV files ----
# Only names ending in ".csv" are selected (anchored regex).
files <- list.files(here::here("reestr_temp"), pattern = "\\.csv$")
# file.path() yields a zero-length vector when there is nothing to delete,
# whereas paste0() would collapse to the directory path itself.
unlink(file.path(here::here("reestr_temp"), files))
# Print the structure of the combined table as a quick sanity check.
str(df)

# Column headers can contain carriage returns too; swap each one for a space.
col_names <- gsub("[\r]", " ", names(df))
df <- setNames(df, col_names)
# Save the raw registry, stamped with the update date (dd_mm_yyyy) ----
saveRDS(
  df,
  glue::glue("reestr_{format(date_update, '%d_%m_%Y')}.Rds")
)

# write_csv2() does not create missing directories, so make sure csv/ exists.
dir.create("csv", showWarnings = FALSE)
write_csv2(
  df,
  glue::glue("csv/reestr_{format(date_update, '%d_%m_%Y')}.csv"),
  na = ""
)
# Authorize via a service account.
# Link to the Google Sheet: https://docs.google.com/spreadsheets/d/161WggLLXWbdkIXohlGK0dorOiRR8ULrKVdjOtWSNjVg/edit?usp=sharing
# gs4_auth(path = "gsheets4r-4c0a163aaf22.json")
# write_sheet(df, ss = "161WggLLXWbdkIXohlGK0dorOiRR8ULrKVdjOtWSNjVg",
#             sheet = "reestr_source")
# df <- readRDS("reestr.Rds")
# Turn the dd.mm.yyyy text columns into Date values, then flag the rows that
# carry a birth date (1) versus those that do not (0).
df <- df %>%
  mutate(
    across(
      all_of(c(
        "Дата рождения",
        "Дата принятия решения о включении",
        "Дата принятия решения об исключении"
      )),
      ~ as.Date(.x, "%d.%m.%Y")
    ),
    `Физлицо` = ifelse(is.na(`Дата рождения`), 0, 1)
  )
# Completed calendar years between two dates.
#
# Dividing a day count by 365.25 is off by one on or near birthdays (born
# 2000-03-01, as of 2023-03-01 gives 22 instead of 23), so the year, month and
# day fields are compared directly instead.
#
# from: Date vector of start dates (birth dates).
# to:   Date vector of end dates, recycled against `from`.
# Returns a numeric vector of whole years; NA wherever either date is NA.
age_in_years <- function(from, to) {
  from <- as.POSIXlt(from)
  to <- as.POSIXlt(to)
  # TRUE when the anniversary has not yet been reached in the year of `to`.
  birthday_pending <-
    to$mon < from$mon | (to$mon == from$mon & to$mday < from$mday)
  as.numeric(to$year - from$year - birthday_pending)
}

# Age on the date the registry file was last updated.
df$`Возраст ФЛ на дату обновления данных` <-
  age_in_years(df$`Дата рождения`, date_update)

# Age on the date of the inclusion decision.
df$`Возраст ФЛ на дату включения` <-
  age_in_years(df$`Дата рождения`, df$`Дата принятия решения о включении`)

# Age on the date of the exclusion decision (NA when there is none).
df$`Возраст ФЛ на дату исключения` <-
  age_in_years(df$`Дата рождения`, df$`Дата принятия решения об исключении`)
# Days spent in the registry: counted from the inclusion date up to the
# exclusion date when there is one, otherwise up to the data update date.
df$`Дней в реестре` <- local({
  included_on <- df$`Дата принятия решения о включении`
  excluded_on <- df$`Дата принятия решения об исключении`

  days_to_update <- as.numeric(
    difftime(date_update, included_on, units = "days")
  )
  days_to_exclusion <- as.numeric(
    difftime(excluded_on, included_on, units = "days")
  )

  ifelse(is.na(excluded_on), days_to_update, days_to_exclusion)
})
# Flag rows that have an exclusion date (1) versus none (0), and pull the
# calendar year out of the inclusion and exclusion dates.
df <- df %>%
  mutate(
    `Исключен` = ifelse(is.na(`Дата принятия решения об исключении`), 0, 1),
    `Год включения` = lubridate::year(`Дата принятия решения о включении`),
    `Год исключения` = lubridate::year(`Дата принятия решения об исключении`)
  )
# Save the extended table (with derived columns) ----
# write_csv2() does not create missing directories, so make sure csv/ exists.
dir.create("csv", showWarnings = FALSE)
write_csv2(
  df,
  glue::glue("csv/reestr_{format(date_update, '%d_%m_%Y')}_ext.csv"),
  na = ""
)
saveRDS(
  df,
  glue::glue("reestr_{format(date_update, '%d_%m_%Y')}_ext.Rds")
)
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.")