Last active
August 29, 2015 14:21
-
-
Save justgrimes/8af11053d3b13ff68825 to your computer and use it in GitHub Desktop.
Public library broadband data - Loads, cleans, and merges Public Library Survey data with National Broadband Map Community Anchor Institution data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to load public public library data from FY 2012 Public Library Survey (PLS) from Institute of Museum and Library Services | |
# and merge it with most recent broadband data from National Broadband Map Community Anchor Institution data | |
# | |
# FOR REFERENCE | |
# CAIID - Unique federal ID's associated with the CAICAT | |
# CAICAT -these are the unique federal ID's associated with the CAICAT | |
# MAXADDOWN - Maximum Advertised Download Speed subscribed to at this CAI | |
# MAXADUP - Maximum Advertised Upload Speed subscribed to at this CAI | |
# BBSERVICE - Does institution subscribe to broadband service at location (ex. Y=Yes; N=No; U=Unknown), | |
# PUBLICWIFI - Whether or not they offer free wifi to the public (ex. Y=Yes; N=No; I or U=Unknown), | |
###########TransTech########### | |
#Code Description | |
# 10 Asymmetric xDSL | |
# 20 Symmetric xDSL | |
# 30 Other Copper Wire | |
# 40 Cable Modem - DOCSIS 3.0 Down | |
# 41 Cable Model - Other | |
# 50 Optical Carrier/Fiber to the End User | |
# 60 Satellite | |
# 70 Terrestrial Fixed - Unlicensed | |
# 71 Terrestrial Fixed - Licensed | |
# 80 Terrestrial Mobile Wireless | |
# 90 Electric Power Line | |
# 0 All Other | |
###########MaxAdDown########### | |
# Code Description | |
# 2 Greater than 200 kbps and less than 768 kbps | |
# 3 Greater than 768 kbps and less than 1.5 mbps | |
# 4 Greater than 1.5 mbps and less than 3 mbps | |
# 5 Greater than 3 mbps and less than 6 mbps | |
# 6 Greater than 6 mbps and less than 10 mbps | |
# 7 Greater than 10 mbps and less than 25 mbps | |
# 8 Greater than 25 mbps and less than 50 mbps | |
# 9 Greater than 50 mbps and less than 100 mbps | |
# 10 Greater than 100 mbps and less than 1 gbps | |
# 11 Greater than 1 gbps | |
###########MaxAdDown########### | |
# Code Description | |
# 2 Greater than 200 kbps and less than 768 kbps | |
# 3 Greater than 768 kbps and less than 1.5 mbps | |
# 4 Greater than 1.5 mbps and less than 3 mbps | |
# 5 Greater than 3 mbps and less than 6 mbps | |
# 6 Greater than 6 mbps and less than 10 mbps | |
# 7 Greater than 10 mbps and less than 25 mbps | |
# 8 Greater than 25 mbps and less than 50 mbps | |
# 9 Greater than 50 mbps and less than 100 mbps | |
# 10 Greater than 100 mbps and less than 1 gbps | |
# 11 Greater than 1 gbps | |
library(stringr) | |
library(stringdist) | |
library(sqldf) | |
# Load Public Library Survey FY2012 data directly from the web | |
temp <- tempfile() | |
download.file("http://www.imls.gov/assets/1/AssetManager/pupld12a_csv.zip", temp, mode="wb") | |
unzip(temp, "Pupld12a.csv") #public library administrative entities (9000+); most data points are here at this level | |
unzip(temp, "Puout12a.csv") #public library outlets (17,000) | |
unzip(temp, "Pusum12a.csv") #public library state summaries | |
pupld12a <- read.csv("Pupld12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3")) | |
puout12a <- read.csv("Puout12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3")) | |
pusum12a <- read.csv("Pusum12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3")) | |
# Load NTIA National Broadband Map data directly from the web | |
temp <- tempfile() | |
download.file("http://www2.ntia.doc.gov/files/broadband-data/All-NBM-CAI-June-2014.zip", temp, mode="wb") | |
unzip(temp, "All-NBM-CAI-June-2014/All-NBM-CAI-June-2014.csv") #NTIA CAI broadband data | |
cai <- read.csv("All-NBM-CAI-June-2014/All-NBM-CAI-June-2014.csv", header = TRUE, sep="|", quote = "\"") | |
#keep only libraries from NBM CAI dataset (i.e. caicat=2) | |
cailib <- cai[ which(cai$caicat==2), ] | |
#clean up Community Anchor Institute ids (CAIID) aka PLS FSCS IDs; remove partials and invalids | |
cailib$fscsid <- sub("\\-", "", cailib$caiid) #remove break | |
for (i in 1:nrow(cailib)) { | |
if (str_length(cailib$fscsid[i])==9) { | |
cailib$fscsid[i] <- str_c(substr(cailib$fscsid[i], 1, 6),"-",substr(cailib$fscsid[i], 7, 10)) | |
} else {cailib$fscsid[i]= ""} | |
} | |
# blindly deduplicate | |
cailib <- x[order(x$fscsid, decreasing=TRUE),] | |
cailib <- x[ !duplicated( x[,c('fscsid')] ), ] | |
#reformat FSCS ID in PLS | |
pls12 <- puout12a | |
for (i in 1:nrow(pls12)) { | |
pls12$fscsid[i] <- str_c(pls12$FSCSKEY[i],"-",sprintf("%03d", as.numeric(pls12$FSCS_SEQ[i]))) | |
} | |
#merge datasets on PLS FSCS ID | |
pls12cai14 <- sqldf("select pls12.fscsid as FSCS_ID, pls12.libname, pls12.address, pls12.city, pls12.stabr as state, | |
pls12.zip, pls12.C_OUT_TY, pls12.longitud as LONGITUDE, pls12.latitude, | |
cailib.bbservice, cailib.transtech, cailib.maxaddown, cailib.maxadup, cailib.PUBLICWIFI | |
from pls12 left join cailib on pls12.fscsid=cailib.fscsid") | |
# clean missing/invalid data | |
pls12cai14$maxaddown[pls12cai14$maxaddown=="ZZ"] <- NA | |
pls12cai14$maxaddown[pls12cai14$maxaddown=="99"] <- NA | |
pls12cai14$maxaddown[pls12cai14$maxaddown=="55"] <- NA | |
pls12cai14$maxaddown[pls12cai14$maxaddown=="0"] <- NA | |
pls12cai14$maxaddown[pls12cai14$maxaddown=="1"] <- NA | |
pls12cai14$maxaddown[pls12cai14$maxaddown==""] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup=="ZZ"] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup=="99"] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup=="12"] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup=="0"] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup=="1"] <- NA | |
pls12cai14$maxadup[pls12cai14$maxadup==""] <- NA | |
pls12cai14$transtech[pls12cai14$transtech=="-999"] <- NA | |
pls12cai14$bbservice[pls12cai14$bbservice==NA] <- "U" | |
pls12cai14$publicwifi[pls12cai14$publicwifi==NA] <- "U" | |
pls12cai14$publicwifi[pls12cai14$publicwifi=="No"] <- "N" | |
pls12cai14$BBFCC <- "N" | |
pls12cai14$BBFCC[is.na(pls12cai14$maxadup) == TRUE | is.na(pls12cai14$maxaddown) == TRUE] <- NA | |
pls12cai14$BBFCC[pls12cai14$maxadup >= 5 & pls12cai14$maxaddown >= 8] <- "Y" | |
# export data file | |
write.csv(pls12cai14, "pls12cai14.csv", row.names=FALSE) | |
# export data file | |
write.csv(pls12cai14, "pls12cai14.csv", row.names=FALSE) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment