Skip to content

Instantly share code, notes, and snippets.

@justgrimes
Last active February 21, 2017 22:06
Show Gist options
  • Save justgrimes/542b4b94d9d2e592ea649955715e76e3 to your computer and use it in GitHub Desktop.
Save justgrimes/542b4b94d9d2e592ea649955715e76e3 to your computer and use it in GitHub Desktop.
OPM Fedscope Data - Cleaning and Loading
## OPM Workforce Data
## FedScope Employment Cube RAW Data Sets for general public consumption
## Justin Grimes
# Load necesary packages
#install.packages("curl")
#library(curl)
#install.packages("RCurl")
#library(RCurl)
#install.packages("sqldf")
#library(sqldf)
#install.packages("plyr")
#library(plyr)
# Download all relevant OPM data files #
MAR2016 <- "https://www.opm.gov/Data/Files/425/fdb8db0d-ce56-4966-9572-9692e6e0fa5e.zip"
MAR2015 <- "https://www.opm.gov/Data/Files/400/69b1781a-c3d3-4e91-a463-c7c9010fa763.zip"
MAR2014 <- "https://www.opm.gov/Data/Files/347/452c9a73-81d1-4ee1-a4b1-9b8bdc528b02.zip"
MAR2013 <- "https://www.opm.gov/Data/Files/321/52eda196-c039-43cb-a0c0-00c530814df6.zip"
MAR2012 <- "https://www.opm.gov/Data/Files/251/f7b6a1bf-8289-4538-a389-6d5aaa242e14.zip"
MAR2011 <- "https://www.opm.gov/Data/Files/241/2c0379b4-74d4-4a14-a4a5-37591d2b9c5f.zip"
MAR2010 <- "https://www.opm.gov/Data/Files/178/2a8f51c9-c0ba-4349-8c3d-055074b8794d.zip"
SEP2009 <- "https://www.opm.gov/Data/Files/26/f0a8eef6-a0b5-4015-a2f4-6597f1ca3ae7.zip"
SEP2008 <- "https://www.opm.gov/Data/Files/38/3653d805-eb0a-4e70-b96d-39b7c58347f0.zip"
SEP2007 <- "https://www.opm.gov/Data/Files/50/7b7655fd-b4d0-4e15-9e97-33956b8aca09.zip"
SEP2006 <- "https://www.opm.gov/Data/Files/53/b9945224-54e1-45e3-9780-b6b36b845b4b.zip"
SEP2005 <- "https://www.opm.gov/Data/Files/56/f00a1a8f-d865-4214-8051-049f66d322be.zip"
# Extract zip and load data as frame
MAR2016 <- download.file(MAR2015,destfile = temp, mode="wb")
FACTDATA_MAR2016 <- read.table(unzip(temp, "FACTDATA_MAR2016.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2015 <- download.file(MAR2015,destfile = temp, mode="wb")
FACTDATA_MAR2015 <- read.table(unzip(temp, "FACTDATA_MAR2015.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2014 <- download.file(MAR2014,destfile = temp, mode="wb")
FACTDATA_MAR2014 <- read.table(unzip(temp, "FACTDATA_MAR2014.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2013 <- download.file(MAR2013,destfile = temp, mode="wb")
FACTDATA_MAR2013 <- read.table(unzip(temp, "FACTDATA_MAR2013.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2012 <- download.file(MAR2012,destfile = temp, mode="wb")
FACTDATA_MAR2012 <- read.table(unzip(temp, "FACTDATA_MAR2012.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2011 <- download.file(MAR2011,destfile = temp, mode="wb")
FACTDATA_MAR2011 <- read.table(unzip(temp, "FACTDATA_MAR2011.TXT"), header= TRUE, sep = ",", quote = "\"")
MAR2010 <- download.file(MAR2010,destfile = temp, mode="wb")
FACTDATA_MAR2010 <- read.table(unzip(temp, "FACTDATA_MAR2010.TXT"), header= TRUE, sep = ",", quote = "\"")
SEP2009 <- download.file(SEP2009,destfile = temp, mode="wb")
FACTDATA_SEP2009 <- read.table(unzip(temp, "FACTDATA_SEP2009.TXT"), header= TRUE, sep = ",", quote = "\"")
SEP2008 <- download.file(SEP2008,destfile = temp, mode="wb")
FACTDATA_SEP2008 <- read.table(unzip(temp, "FACTDATA_SEP2008.TXT"), header= TRUE, sep = ",", quote = "\"")
SEP2007 <- download.file(SEP2007,destfile = temp, mode="wb")
FACTDATA_SEP2007 <- read.table(unzip(temp, "FACTDATA_SEP2007.TXT"), header= TRUE, sep = ",", quote = "\"")
SEP2006 <- download.file(SEP2006,destfile = temp, mode="wb")
FACTDATA_SEP2006 <- read.table(unzip(temp, "FACTDATA_SEP2006.TXT"), header= TRUE, sep = ",", quote = "\"")
SEP2005 <- download.file(SEP2005,destfile = temp, mode="wb")
FACTDATA_SEP2005 <- read.table(unzip(temp, "FACTDATA_SEP2005.TXT"), header= TRUE, sep = ",", quote = "\"")
# Mass merge data file
FACTDATA_05to16 <- rbind(FACTDATA_SEP2005, FACTDATA_SEP2006, FACTDATA_SEP2007, FACTDATA_SEP2008, FACTDATA_SEP2009,
FACTDATA_MAR2010, FACTDATA_MAR2011, FACTDATA_MAR2012, FACTDATA_MAR2013, FACTDATA_MAR2014,
FACTDATA_MAR2015, FACTDATA_MAR2016)
DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table
DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table
# Export trendfile
write.csv(FACTDATA_MAR2016, "FACTDATA_MAR2016.csv")
write.csv(FACTDATA_05to16, "FACTDATA_05to16.csv")
## Download and load recent FedScope Employment Cube (March 2016)
#temp = tempfile()
#URL <- "https://www.opm.gov/Data/Files/425/fdb8db0d-ce56-4966-9572-9692e6e0fa5e.zip"
#FACTDATA_MAR2016 <- read.table(unzip(temp, "FACTDATA_MAR2016.TXT"), header= TRUE, sep = ",", quote = "\"")
#DTagy <- read.table(unzip(temp, "DTagy.txt"), header= TRUE, sep = ",", quote = "\"") #agency info table
# Append relevant agency info to main data table
#fd <- sqldf("SELECT * FROM FACTDATA_MAR2016 LEFT JOIN DTagy ON FACTDATA_MAR2016.AGYSUB = DTagy.AGYSUB")
# filter out and keep only 2210s; OCC = occupation code
#fd2210 <- sqldf("SELECT * FROM FACTDATA_MAR2016 WHERE occ='2210'")
## File Lookup Ref ##
# FACTDATA.txt - main data file
# DTloc.txt - location lookup
# DTagelvl.txt - Age
# DTagy.txt - Agency info
# DTdate.txt - Date
# DTedlvl.txt - Education
# DTgsegrd.txt - GS
# DTloslvl.txt - Length of service
# DTocc.txt - occupation
# DTpatco.txt -
# DTppgrd.txt - Pay Plan
# DTsallvl.txt - Salary
# DTstemocc.txt - STEM
# DTsuper.txt - Supervisory
# DTtoa.txt - Appointment (perm vs nonperm)
# DTwkstat.txt - Workstatus (seasonal vs nonseasonal)
# DTwrksch.txt - Work schedule (seasonal vs nonseasonal)
## Common IT Workforce Occupational Series ##
# 2210 - Information Technology Management
# 0301 - Miscellaneous Administration and Program
# 0340 - Program Management
# 0343 - Management and Program Analysis
# 0332 - Computer Operation
# 0334 - Computer Specialist (replaced by 2210)
# 0335 - Computer Clerk and Assistant Series
# 0390 - Telecommunications Processing Series
# 0391 - Telecommunications Series
# 0392 - General Telecommunication Series
# 0394 - Communications Clerical Series
# 0854 - Computer Engineering
# 0855 - Electronics Engineering
# 1410 - Librarian Series
# 1411 - Library Technician Series
# 1412 - Technical Information Services Series
# 1420 - Archivist Series
# 1421 - Archives Technician Series
# 1550 - Computer Science Series
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment