Last active
February 21, 2017 22:06
-
-
Save justgrimes/542b4b94d9d2e592ea649955715e76e3 to your computer and use it in GitHub Desktop.
OPM Fedscope Data - Cleaning and Loading
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## OPM Workforce Data | |
## FedScope Employment Cube RAW Data Sets for general public consumption | |
## Justin Grimes | |
# Load necesary packages | |
#install.packages("curl") | |
#library(curl) | |
#install.packages("RCurl") | |
#library(RCurl) | |
#install.packages("sqldf") | |
#library(sqldf) | |
#install.packages("plyr") | |
#library(plyr) | |
# Download all relevant OPM data files # | |
MAR2016 <- "https://www.opm.gov/Data/Files/425/fdb8db0d-ce56-4966-9572-9692e6e0fa5e.zip" | |
MAR2015 <- "https://www.opm.gov/Data/Files/400/69b1781a-c3d3-4e91-a463-c7c9010fa763.zip" | |
MAR2014 <- "https://www.opm.gov/Data/Files/347/452c9a73-81d1-4ee1-a4b1-9b8bdc528b02.zip" | |
MAR2013 <- "https://www.opm.gov/Data/Files/321/52eda196-c039-43cb-a0c0-00c530814df6.zip" | |
MAR2012 <- "https://www.opm.gov/Data/Files/251/f7b6a1bf-8289-4538-a389-6d5aaa242e14.zip" | |
MAR2011 <- "https://www.opm.gov/Data/Files/241/2c0379b4-74d4-4a14-a4a5-37591d2b9c5f.zip" | |
MAR2010 <- "https://www.opm.gov/Data/Files/178/2a8f51c9-c0ba-4349-8c3d-055074b8794d.zip" | |
SEP2009 <- "https://www.opm.gov/Data/Files/26/f0a8eef6-a0b5-4015-a2f4-6597f1ca3ae7.zip" | |
SEP2008 <- "https://www.opm.gov/Data/Files/38/3653d805-eb0a-4e70-b96d-39b7c58347f0.zip" | |
SEP2007 <- "https://www.opm.gov/Data/Files/50/7b7655fd-b4d0-4e15-9e97-33956b8aca09.zip" | |
SEP2006 <- "https://www.opm.gov/Data/Files/53/b9945224-54e1-45e3-9780-b6b36b845b4b.zip" | |
SEP2005 <- "https://www.opm.gov/Data/Files/56/f00a1a8f-d865-4214-8051-049f66d322be.zip" | |
# Extract zip and load data as frame | |
MAR2016 <- download.file(MAR2015,destfile = temp, mode="wb") | |
FACTDATA_MAR2016 <- read.table(unzip(temp, "FACTDATA_MAR2016.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2015 <- download.file(MAR2015,destfile = temp, mode="wb") | |
FACTDATA_MAR2015 <- read.table(unzip(temp, "FACTDATA_MAR2015.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2014 <- download.file(MAR2014,destfile = temp, mode="wb") | |
FACTDATA_MAR2014 <- read.table(unzip(temp, "FACTDATA_MAR2014.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2013 <- download.file(MAR2013,destfile = temp, mode="wb") | |
FACTDATA_MAR2013 <- read.table(unzip(temp, "FACTDATA_MAR2013.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2012 <- download.file(MAR2012,destfile = temp, mode="wb") | |
FACTDATA_MAR2012 <- read.table(unzip(temp, "FACTDATA_MAR2012.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2011 <- download.file(MAR2011,destfile = temp, mode="wb") | |
FACTDATA_MAR2011 <- read.table(unzip(temp, "FACTDATA_MAR2011.TXT"), header= TRUE, sep = ",", quote = "\"") | |
MAR2010 <- download.file(MAR2010,destfile = temp, mode="wb") | |
FACTDATA_MAR2010 <- read.table(unzip(temp, "FACTDATA_MAR2010.TXT"), header= TRUE, sep = ",", quote = "\"") | |
SEP2009 <- download.file(SEP2009,destfile = temp, mode="wb") | |
FACTDATA_SEP2009 <- read.table(unzip(temp, "FACTDATA_SEP2009.TXT"), header= TRUE, sep = ",", quote = "\"") | |
SEP2008 <- download.file(SEP2008,destfile = temp, mode="wb") | |
FACTDATA_SEP2008 <- read.table(unzip(temp, "FACTDATA_SEP2008.TXT"), header= TRUE, sep = ",", quote = "\"") | |
SEP2007 <- download.file(SEP2007,destfile = temp, mode="wb") | |
FACTDATA_SEP2007 <- read.table(unzip(temp, "FACTDATA_SEP2007.TXT"), header= TRUE, sep = ",", quote = "\"") | |
SEP2006 <- download.file(SEP2006,destfile = temp, mode="wb") | |
FACTDATA_SEP2006 <- read.table(unzip(temp, "FACTDATA_SEP2006.TXT"), header= TRUE, sep = ",", quote = "\"") | |
SEP2005 <- download.file(SEP2005,destfile = temp, mode="wb") | |
FACTDATA_SEP2005 <- read.table(unzip(temp, "FACTDATA_SEP2005.TXT"), header= TRUE, sep = ",", quote = "\"") | |
# Mass merge data file | |
FACTDATA_05to16 <- rbind(FACTDATA_SEP2005, FACTDATA_SEP2006, FACTDATA_SEP2007, FACTDATA_SEP2008, FACTDATA_SEP2009, | |
FACTDATA_MAR2010, FACTDATA_MAR2011, FACTDATA_MAR2012, FACTDATA_MAR2013, FACTDATA_MAR2014, | |
FACTDATA_MAR2015, FACTDATA_MAR2016) | |
DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table | |
DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table | |
# Export trendfile | |
write.csv(FACTDATA_MAR2016, "FACTDATA_MAR2016.csv") | |
write.csv(FACTDATA_05to16, "FACTDATA_05to16.csv") | |
## Download and load recent FedScope Employment Cube (March 2016) | |
#temp = tempfile() | |
#URL <- "https://www.opm.gov/Data/Files/425/fdb8db0d-ce56-4966-9572-9692e6e0fa5e.zip" | |
#FACTDATA_MAR2016 <- read.table(unzip(temp, "FACTDATA_MAR2016.TXT"), header= TRUE, sep = ",", quote = "\"") | |
#DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table | |
# Append relevant agency info to main data table | |
#fd <- sqldf("SELECT * FROM FACTDATA_MAR2016 LEFT JOIN DTagy ON FACTDATA_MAR2016.AGYSUB = DTagy.AGYSUB") | |
# filter out and keep only 2210s; OCC = occupation code | |
#fd2210 <- sqldf("SELECT * FROM FACTDATA_MAR2016 WHERE occ='2210'") | |
## File Lookup Ref ## | |
# FACTDATA.txt - main data file | |
# DTloc.txt - location lookup | |
# DTagelvl.txt - Age | |
# DTagy.txt - Agency info | |
# DTdate.txt - Date | |
# DTedlvl.txt - Education | |
# DTgsegrd.txt - GS | |
# DTloslvl.txt - Length of service | |
# DTocc.txt - occupation | |
# DTpatco.txt - | |
# DTppgrd.txt - Pay Plan | |
# DTsallvl.txt - Salary | |
# DTstemocc.txt - STEM | |
# DTsuper.txt - Supervisory | |
# DTtoa.txt - Appointment (perm vs nonperm) | |
# DTwkstat.txt - Workstatus (seasonal vs nonseasonal) | |
# DTwrksch.txt - Work schedule (seasonal vs nonseasonal) | |
## Common IT Workforce Occupational Series ## | |
# 2210 - Information Technology Management | |
# 0301 - Miscellaneous Administration and Program | |
# 0340 - Program Management | |
# 0343 - Management and Program Analysis | |
# 0332 - Computer Operation | |
# 0334 - Computer Specialist (replaced by 2210) | |
# 0335 - Computer Clerk and Assistant Series | |
# 0390 - Telecommunications Processing Series | |
# 0391 - Telecommunications Series | |
# 0392 - General Telecommunication Series | |
# 0394 - Communications Clerical Series | |
# 0854 - Computer Engineering | |
# 0855 - Electronics Engineering | |
# 1410 - Librarian Series | |
# 1411 - Library Technician Series | |
# 1412 - Technical Information Services Series | |
# 1420 - Archivist Series | |
# 1421 - Archives Technician Series | |
# 1550 - Computer Science Series | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment