Skip to content

Instantly share code, notes, and snippets.

@iangow
Last active August 29, 2019 00:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iangow/82be926b307f8ceb5b7432da020a912a to your computer and use it in GitHub Desktop.
Save iangow/82be926b307f8ceb5b7432da020a912a to your computer and use it in GitHub Desktop.
Code to rearrange NAICS codes
setwd("C:/Users/mjyje/OneDrive/RA work/vertically related industries")
library(readxl)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(stringr)
library(readr)
# Import input-output matrix ----
temp <- read_xlsx("IOUse_After_Redefinitions_PUR_2007_Detail.xlsx",
sheet = '2007', skip = 5)
descs <-
temp %>%
select(1:2)
io_table <-
temp %>%
rename(input_code = Code) %>%
select(-2) %>%
gather(key = "output_code", value = "amount", -input_code) %>%
filter(!is.na(amount))
io_table <-
io_table %>%
arrange(input_code)
io_table_total_output <-
io_table %>%
filter(output_code == "T001") %>%
select(input_code, amount) %>%
rename(total_output_amount = amount)
io_table_total_input <-
io_table %>%
filter(input_code == "T005") %>%
select(output_code, amount) %>%
rename(total_input_amount = amount)
# Don't use mixed case
# Skip intermediate variables
# Use boolean values (not 1/0)
# Use if_else(), not ifelse() [though neither needed here]
merged_io_table <-
io_table %>%
inner_join(io_table_total_output, by = "input_code") %>%
inner_join(io_table_total_input, by = "output_code") %>%
mutate(onepd = (amount/total_output_amount + amount/total_input_amount) > 0.01,
fivepd = (amount/total_output_amount + amount/total_input_amount) > 0.05,
tenpd = (amount/total_output_amount + amount/total_input_amount) > 0.1) %>%
select(input_code, output_code, onepd, fivepd, tenpd)
# Creating lines of code like this creates risk
# of errors (such as the one in your code)
merged_io_table_1 <-
merged_io_table %>%
filter(!grepl("^[TF]", output_code),
(!grepl("^[TV]", input_code)))
# Create separate tables for onepd, fivepd, tenpd)
onepd <-
merged_io_table_1 %>%
filter(onepd==TRUE) %>%
select(input_code, output_code, onepd)
fivepd <-
merged_io_table %>%
filter(fivepd==TRUE) %>%
select(input_code, output_code, fivepd)
tenpd <-
merged_io_table %>%
filter(tenpd==TRUE) %>%
select(input_code, output_code, tenpd)
# Import NAICS mapping ----
temp <- read_xlsx("IOUse_After_Redefinitions_PUR_2007_Detail.xlsx",
sheet = 'NAICS codes')
name_cols <- function(df) {
names(df) <- c("bea_code", "naics_code")
return(df)
}
naics_codes <-
temp %>%
select(3, 6) %>%
name_cols() %>%
filter(!is.na(bea_code), !is.na(naics_code)) %>%
separate_rows(naics_code, sep = ",\\s*")
convert_range <- function(str) {
# Function that takes, say, "1120-3" and turns it
# into 1120, 1121, 1122, 1123 as a vector.
split <- str_split(str, "-")
rhs_end <- split[[1]][2]
lhs_val <- split[[1]][1]
rhs_val <- paste0(str_sub(lhs_val, 1, str_length(lhs_val)-1), split[[1]][2])
rhs_val
return(list(seq(lhs_val, rhs_val)))
}
range_values <-
naics_codes %>%
filter(grepl("-", naics_code)) %>%
rowwise() %>%
mutate(naics_values = convert_range(naics_code)) %>%
unnest(naics_values) %>%
mutate(naics_values = as.character(naics_values))
single_values
single_values <-
naics_codes %>%
filter(!grepl("-", naics_code), naics_code != "n/a") %>%
mutate(naics_values = naics_code)
max_digits <- 6L
expand_range <- function(str) {
# Function that takes "1120", turns into a
# range of 6-digit values 112000-112099.
n_digits <- str_length(str)
if (n_digits == max_digits) {
return(as.integer(str))
} else {
multiple <- 10^(max_digits - n_digits)
return(list(seq(as.integer(str)*multiple,
(as.integer(str)+1)*multiple-1)))
}
}
# Code below takes rows with "range_values" (e.g, "1113-1115") and
# combines them with rows with "single_values" (e.g., "1123").
# It then takes, say, "1123", which is a four-digit code and
# expands it to the range 112300-112399.
naics_values <-
range_values %>%
dplyr::union(single_values) %>%
# ungroup() %>%
rowwise() %>%
mutate(naics_value = expand_range(naics_values)) %>%
unnest(naics_value) %>%
select(bea_code, naics_value)
### To load data from Melb Uni database
Sys.setenv(PGUSER="far", PGPASSWORD="honours_2019")
Sys.setenv(PGHOST="10.101.13.99", PGDATABASE="crsp")
library(DBI)
library(dplyr, warn.conflicts = FALSE)
library(lubridate)
pg <- dbConnect(RPostgres::Postgres())
company <- tbl(pg, sql("SELECT * FROM comp.company"))
company_downloaded <-
company %>%
filter(!is.na(naics)) %>%
mutate(naics = as.integer(naics)) %>%
select(naics) %>%
distinct() %>%
collect()
company_data <-
company_downloaded %>%
rowwise() %>%
mutate(naics = expand_range(naics)) %>%
unnest(naics) %>%
distinct() %>%
select(naics)
write_csv(company_data1, path = "company_data1.csv")
# Use inner_join, not merge.
# In other words, use tidyverse/dplyr functions, which
# will work on other data sources (such as databases)
merged_io_input_onepd_naics <-
onepd %>%
inner_join(naics_values,
by = c("input_code"="bea_code")) %>%
mutate(naics_value = as.integer(naics_value), naics_input = naics_value) %>%
select(output_code, naics_input)
merged_io_output_onepd_naics <-
onepd %>%
inner_join(naics_values,
by = c("output_code"="bea_code")) %>%
mutate(naics_value = as.integer(naics_value), naics_output = naics_value) %>%
select(output_code, naics_output)
merged_compustat_io_input_onepd_naics <-
company_data %>%
inner_join(merged_io_input_onepd_naics,
by = c("naics"="naics_input")) %>%
mutate(naics_input = naics) %>%
distinct() %>%
select(naics_input, output_code)
merged_compustat_io_output_onepd_naics <-
company_data %>%
inner_join(merged_io_output_onepd_naics,
by = c("naics"="naics_output")) %>%
mutate(naics_output = naics) %>%
distinct() %>%
select(naics_output, output_code)
merged_compustat_io_input_output_onepd_naics <-
merged_compustat_io_output_onepd_naics %>%
inner_join(merged_compustat_io_input_onepd_naics,
by = c("output_code"="output_code")) %>%
distinct() %>%
select(naics_input, naics_output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment