Last active
August 29, 2019 00:48
-
-
Save iangow/82be926b307f8ceb5b7432da020a912a to your computer and use it in GitHub Desktop.
Code to rearrange NAICS codes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setwd("C:/Users/mjyje/OneDrive/RA work/vertically related industries") | |
library(readxl) | |
library(dplyr, warn.conflicts = FALSE) | |
library(tidyr) | |
library(stringr) | |
library(readr) | |
# Import input-output matrix ---- | |
temp <- read_xlsx("IOUse_After_Redefinitions_PUR_2007_Detail.xlsx", | |
sheet = '2007', skip = 5) | |
descs <- | |
temp %>% | |
select(1:2) | |
io_table <- | |
temp %>% | |
rename(input_code = Code) %>% | |
select(-2) %>% | |
gather(key = "output_code", value = "amount", -input_code) %>% | |
filter(!is.na(amount)) | |
io_table <- | |
io_table %>% | |
arrange(input_code) | |
io_table_total_output <- | |
io_table %>% | |
filter(output_code == "T001") %>% | |
select(input_code, amount) %>% | |
rename(total_output_amount = amount) | |
io_table_total_input <- | |
io_table %>% | |
filter(input_code == "T005") %>% | |
select(output_code, amount) %>% | |
rename(total_input_amount = amount) | |
# Don't use mixed case | |
# Skip intermediate variables | |
# Use boolean values (not 1/0) | |
# Use if_else(), not ifelse() [though neither needed here] | |
merged_io_table <- | |
io_table %>% | |
inner_join(io_table_total_output, by = "input_code") %>% | |
inner_join(io_table_total_input, by = "output_code") %>% | |
mutate(onepd = (amount/total_output_amount + amount/total_input_amount) > 0.01, | |
fivepd = (amount/total_output_amount + amount/total_input_amount) > 0.05, | |
tenpd = (amount/total_output_amount + amount/total_input_amount) > 0.1) %>% | |
select(input_code, output_code, onepd, fivepd, tenpd) | |
# Creating lines of code like this creates risk | |
# of errors (such as the one in your code) | |
merged_io_table_1 <- | |
merged_io_table %>% | |
filter(!grepl("^[TF]", output_code), | |
(!grepl("^[TV]", input_code))) | |
# Create separate tables for onepd, fivepd, tenpd) | |
onepd <- | |
merged_io_table_1 %>% | |
filter(onepd==TRUE) %>% | |
select(input_code, output_code, onepd) | |
fivepd <- | |
merged_io_table %>% | |
filter(fivepd==TRUE) %>% | |
select(input_code, output_code, fivepd) | |
tenpd <- | |
merged_io_table %>% | |
filter(tenpd==TRUE) %>% | |
select(input_code, output_code, tenpd) | |
# Import NAICS mapping ---- | |
temp <- read_xlsx("IOUse_After_Redefinitions_PUR_2007_Detail.xlsx", | |
sheet = 'NAICS codes') | |
name_cols <- function(df) { | |
names(df) <- c("bea_code", "naics_code") | |
return(df) | |
} | |
naics_codes <- | |
temp %>% | |
select(3, 6) %>% | |
name_cols() %>% | |
filter(!is.na(bea_code), !is.na(naics_code)) %>% | |
separate_rows(naics_code, sep = ",\\s*") | |
convert_range <- function(str) { | |
# Function that takes, say, "1120-3" and turns it | |
# into 1120, 1121, 1122, 1123 as a vector. | |
split <- str_split(str, "-") | |
rhs_end <- split[[1]][2] | |
lhs_val <- split[[1]][1] | |
rhs_val <- paste0(str_sub(lhs_val, 1, str_length(lhs_val)-1), split[[1]][2]) | |
rhs_val | |
return(list(seq(lhs_val, rhs_val))) | |
} | |
range_values <- | |
naics_codes %>% | |
filter(grepl("-", naics_code)) %>% | |
rowwise() %>% | |
mutate(naics_values = convert_range(naics_code)) %>% | |
unnest(naics_values) %>% | |
mutate(naics_values = as.character(naics_values)) | |
single_values | |
single_values <- | |
naics_codes %>% | |
filter(!grepl("-", naics_code), naics_code != "n/a") %>% | |
mutate(naics_values = naics_code) | |
max_digits <- 6L | |
expand_range <- function(str) { | |
# Function that takes "1120", turns into a | |
# range of 6-digit values 112000-112099. | |
n_digits <- str_length(str) | |
if (n_digits == max_digits) { | |
return(as.integer(str)) | |
} else { | |
multiple <- 10^(max_digits - n_digits) | |
return(list(seq(as.integer(str)*multiple, | |
(as.integer(str)+1)*multiple-1))) | |
} | |
} | |
# Code below takes rows with "range_values" (e.g, "1113-1115") and | |
# combines them with rows with "single_values" (e.g., "1123"). | |
# It then takes, say, "1123", which is a four-digit code and | |
# expands it to the range 112300-112399. | |
naics_values <- | |
range_values %>% | |
dplyr::union(single_values) %>% | |
# ungroup() %>% | |
rowwise() %>% | |
mutate(naics_value = expand_range(naics_values)) %>% | |
unnest(naics_value) %>% | |
select(bea_code, naics_value) | |
### To load data from Melb Uni database | |
Sys.setenv(PGUSER="far", PGPASSWORD="honours_2019") | |
Sys.setenv(PGHOST="10.101.13.99", PGDATABASE="crsp") | |
library(DBI) | |
library(dplyr, warn.conflicts = FALSE) | |
library(lubridate) | |
pg <- dbConnect(RPostgres::Postgres()) | |
company <- tbl(pg, sql("SELECT * FROM comp.company")) | |
company_downloaded <- | |
company %>% | |
filter(!is.na(naics)) %>% | |
mutate(naics = as.integer(naics)) %>% | |
select(naics) %>% | |
distinct() %>% | |
collect() | |
company_data <- | |
company_downloaded %>% | |
rowwise() %>% | |
mutate(naics = expand_range(naics)) %>% | |
unnest(naics) %>% | |
distinct() %>% | |
select(naics) | |
write_csv(company_data1, path = "company_data1.csv") | |
# Use inner_join, not merge. | |
# In other words, use tidyverse/dplyr functions, which | |
# will work on other data sources (such as databases) | |
merged_io_input_onepd_naics <- | |
onepd %>% | |
inner_join(naics_values, | |
by = c("input_code"="bea_code")) %>% | |
mutate(naics_value = as.integer(naics_value), naics_input = naics_value) %>% | |
select(output_code, naics_input) | |
merged_io_output_onepd_naics <- | |
onepd %>% | |
inner_join(naics_values, | |
by = c("output_code"="bea_code")) %>% | |
mutate(naics_value = as.integer(naics_value), naics_output = naics_value) %>% | |
select(output_code, naics_output) | |
merged_compustat_io_input_onepd_naics <- | |
company_data %>% | |
inner_join(merged_io_input_onepd_naics, | |
by = c("naics"="naics_input")) %>% | |
mutate(naics_input = naics) %>% | |
distinct() %>% | |
select(naics_input, output_code) | |
merged_compustat_io_output_onepd_naics <- | |
company_data %>% | |
inner_join(merged_io_output_onepd_naics, | |
by = c("naics"="naics_output")) %>% | |
mutate(naics_output = naics) %>% | |
distinct() %>% | |
select(naics_output, output_code) | |
merged_compustat_io_input_output_onepd_naics <- | |
merged_compustat_io_output_onepd_naics %>% | |
inner_join(merged_compustat_io_input_onepd_naics, | |
by = c("output_code"="output_code")) %>% | |
distinct() %>% | |
select(naics_input, naics_output) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment