Skip to content

Instantly share code, notes, and snippets.

@justgrimes
Last active August 29, 2015 14:21
Show Gist options
  • Save justgrimes/8af11053d3b13ff68825 to your computer and use it in GitHub Desktop.
Save justgrimes/8af11053d3b13ff68825 to your computer and use it in GitHub Desktop.
Public library broadband data - Loads, cleans, and merges Public Library Survey data with National Broadband Map Community Anchor Institution data
# Script to load public public library data from FY 2012 Public Library Survey (PLS) from Institute of Museum and Library Services
# and merge it with most recent broadband data from National Broadband Map Community Anchor Institution data
#
# FOR REFERENCE
# CAIID - Unique federal ID's associated with the CAICAT
# CAICAT -these are the unique federal ID's associated with the CAICAT
# MAXADDOWN - Maximum Advertised Download Speed subscribed to at this CAI
# MAXADUP - Maximum Advertised Upload Speed subscribed to at this CAI
# BBSERVICE - Does institution subscribe to broadband service at location (ex. Y=Yes; N=No; U=Unknown),
# PUBLICWIFI - Whether or not they offer free wifi to the public (ex. Y=Yes; N=No; I or U=Unknown),
###########TransTech###########
#Code Description
# 10 Asymmetric xDSL
# 20 Symmetric xDSL
# 30 Other Copper Wire
# 40 Cable Modem - DOCSIS 3.0 Down
# 41 Cable Model - Other
# 50 Optical Carrier/Fiber to the End User
# 60 Satellite
# 70 Terrestrial Fixed - Unlicensed
# 71 Terrestrial Fixed - Licensed
# 80 Terrestrial Mobile Wireless
# 90 Electric Power Line
# 0 All Other
###########MaxAdDown###########
# Code Description
# 2 Greater than 200 kbps and less than 768 kbps
# 3 Greater than 768 kbps and less than 1.5 mbps
# 4 Greater than 1.5 mbps and less than 3 mbps
# 5 Greater than 3 mbps and less than 6 mbps
# 6 Greater than 6 mbps and less than 10 mbps
# 7 Greater than 10 mbps and less than 25 mbps
# 8 Greater than 25 mbps and less than 50 mbps
# 9 Greater than 50 mbps and less than 100 mbps
# 10 Greater than 100 mbps and less than 1 gbps
# 11 Greater than 1 gbps
###########MaxAdDown###########
# Code Description
# 2 Greater than 200 kbps and less than 768 kbps
# 3 Greater than 768 kbps and less than 1.5 mbps
# 4 Greater than 1.5 mbps and less than 3 mbps
# 5 Greater than 3 mbps and less than 6 mbps
# 6 Greater than 6 mbps and less than 10 mbps
# 7 Greater than 10 mbps and less than 25 mbps
# 8 Greater than 25 mbps and less than 50 mbps
# 9 Greater than 50 mbps and less than 100 mbps
# 10 Greater than 100 mbps and less than 1 gbps
# 11 Greater than 1 gbps
library(stringr)
library(stringdist)
library(sqldf)
# Load Public Library Survey FY2012 data directly from the web
temp <- tempfile()
download.file("http://www.imls.gov/assets/1/AssetManager/pupld12a_csv.zip", temp, mode="wb")
unzip(temp, "Pupld12a.csv") #public library administrative entities (9000+); most data points are here at this level
unzip(temp, "Puout12a.csv") #public library outlets (17,000)
unzip(temp, "Pusum12a.csv") #public library state summaries
pupld12a <- read.csv("Pupld12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
puout12a <- read.csv("Puout12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
pusum12a <- read.csv("Pusum12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
# Load NTIA National Broadband Map data directly from the web
temp <- tempfile()
download.file("http://www2.ntia.doc.gov/files/broadband-data/All-NBM-CAI-June-2014.zip", temp, mode="wb")
unzip(temp, "All-NBM-CAI-June-2014/All-NBM-CAI-June-2014.csv") #NTIA CAI broadband data
cai <- read.csv("All-NBM-CAI-June-2014/All-NBM-CAI-June-2014.csv", header = TRUE, sep="|", quote = "\"")
#keep only libraries from NBM CAI dataset (i.e. caicat=2)
cailib <- cai[ which(cai$caicat==2), ]
#clean up Community Anchor Institute ids (CAIID) aka PLS FSCS IDs; remove partials and invalids
cailib$fscsid <- sub("\\-", "", cailib$caiid) #remove break
for (i in 1:nrow(cailib)) {
if (str_length(cailib$fscsid[i])==9) {
cailib$fscsid[i] <- str_c(substr(cailib$fscsid[i], 1, 6),"-",substr(cailib$fscsid[i], 7, 10))
} else {cailib$fscsid[i]= ""}
}
# blindly deduplicate
cailib <- x[order(x$fscsid, decreasing=TRUE),]
cailib <- x[ !duplicated( x[,c('fscsid')] ), ]
#reformat FSCS ID in PLS
pls12 <- puout12a
for (i in 1:nrow(pls12)) {
pls12$fscsid[i] <- str_c(pls12$FSCSKEY[i],"-",sprintf("%03d", as.numeric(pls12$FSCS_SEQ[i])))
}
#merge datasets on PLS FSCS ID
pls12cai14 <- sqldf("select pls12.fscsid as FSCS_ID, pls12.libname, pls12.address, pls12.city, pls12.stabr as state,
pls12.zip, pls12.C_OUT_TY, pls12.longitud as LONGITUDE, pls12.latitude,
cailib.bbservice, cailib.transtech, cailib.maxaddown, cailib.maxadup, cailib.PUBLICWIFI
from pls12 left join cailib on pls12.fscsid=cailib.fscsid")
# clean missing/invalid data
pls12cai14$maxaddown[pls12cai14$maxaddown=="ZZ"] <- NA
pls12cai14$maxaddown[pls12cai14$maxaddown=="99"] <- NA
pls12cai14$maxaddown[pls12cai14$maxaddown=="55"] <- NA
pls12cai14$maxaddown[pls12cai14$maxaddown=="0"] <- NA
pls12cai14$maxaddown[pls12cai14$maxaddown=="1"] <- NA
pls12cai14$maxaddown[pls12cai14$maxaddown==""] <- NA
pls12cai14$maxadup[pls12cai14$maxadup=="ZZ"] <- NA
pls12cai14$maxadup[pls12cai14$maxadup=="99"] <- NA
pls12cai14$maxadup[pls12cai14$maxadup=="12"] <- NA
pls12cai14$maxadup[pls12cai14$maxadup=="0"] <- NA
pls12cai14$maxadup[pls12cai14$maxadup=="1"] <- NA
pls12cai14$maxadup[pls12cai14$maxadup==""] <- NA
pls12cai14$transtech[pls12cai14$transtech=="-999"] <- NA
pls12cai14$bbservice[pls12cai14$bbservice==NA] <- "U"
pls12cai14$publicwifi[pls12cai14$publicwifi==NA] <- "U"
pls12cai14$publicwifi[pls12cai14$publicwifi=="No"] <- "N"
pls12cai14$BBFCC <- "N"
pls12cai14$BBFCC[is.na(pls12cai14$maxadup) == TRUE | is.na(pls12cai14$maxaddown) == TRUE] <- NA
pls12cai14$BBFCC[pls12cai14$maxadup >= 5 & pls12cai14$maxaddown >= 8] <- "Y"
# export data file
write.csv(pls12cai14, "pls12cai14.csv", row.names=FALSE)
# export data file
write.csv(pls12cai14, "pls12cai14.csv", row.names=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment