jclopeztavera/parse_curp.R

## parse_curp.R
#' ---
#' title: "parse_curp"
#' author: "Juan C. López Tavera"
#' date: "10/6/2017"
#' output:
#'   html_document:
#'     keep_md: yes
#'   pdf_document: default
#' ---
#'
#' # CURP Parser
#'
#' ## Description
#' Use `parse_curp` to parse one or more CURP keys (`curp`), choosing which element (`element`) of the CURP to extract (see **Arguments**).
#'
#' ## Usage
#' `parse_curp(curp, element = "all")`, where `element` is one of `"all"`, `"gender"`, `"bdate"`, `"pob"`
#'
#'
#' ## Arguments
#'
#' * `curp` - a syntactically valid CURP key
#' * `element` - a demographic attribute to extract from the CURP key
#'
#' ## Details
#'
#' CURP (Clave Única de Registro de Población) is the unique key used by the Mexican government to identify Mexican citizens and residents in Mexico.
#'
#' CURP keys are 18-character alphanumeric strings. The characters in the CURP encode:
#'
#' * Last name
#' * First name
#' * Birth date
#' * Binary gender
#' * State of birth
#'
#' The last two characters in the CURP key ensure its uniqueness.
#'
#' Consider the fictitious CURP: `"AAPR630321HDFLRC09"`
#'
#' AA - first last name initial and first vowel in first last name (second vowel if the first last name initial is a vowel)
#' P - second last name initial letter, X if there's only one last name
#' R - first name initial letter
#' 63 - birth year (%y)
#' 03 - birth month (%m)
#' 21 - birth day (%d)
#' H - binary gender. H, for male; M, for female
#' DF - two-letter state-of-birth abbreviation (NE for people born outside of Mexico)
#' LRC - first consonant letter of first last name, second last name, and first name (X, if there's no element)
#' 09 - two digits to ensure uniqueness of CURP key
#'
## ----parse_curp----------------------------------------------------------
# Parse demographic attributes out of CURP keys.
#
# Arguments:
#   curp    - vector of CURP keys. Any entry that is NA, is not exactly 18
#             characters long, or contains a non-alphanumeric character is
#             replaced by NA before parsing, so its parsed fields are NA.
#   element - which attribute to return: "all" (default), "gender", "bdate"
#             or "pob". Any other value is an error.
#
# Value:
#   "gender" - "Male" for H, "Female" for M, NA for any other character
#              (position 11).
#   "bdate"  - Date built from characters 5-10 (yymmdd); see parse_bdate
#              below for how the century is chosen.
#   "pob"    - state name for the code in characters 12-13; NA when the code
#              is not in the lookup table (e.g. "NE").
#   "all"    - list with elements "Gender", "Birth Date", "Place of Birth".
parse_curp <- function(curp, element = "all") {
  # Fail loudly on an unknown element; switch() alone would return NULL.
  element <- match.arg(element, c("all", "gender", "bdate", "pob"))

  ##### Controls ####
  is_well_formed <- function(key) {
    !is.na(key) && nchar(key) == 18 && !grepl("[^[:alnum:]]+", key)
  }
  # vapply() has a fixed return type, so zero-length input gives logical(0)
  # rather than the list() that sapply() returns (which `!` cannot negate).
  valid_curp <- vapply(curp, is_well_formed, logical(1))
  curp <- ifelse(test = valid_curp, curp, NA)

  ##### Lookup table ####
  # State-of-birth codes (names) mapped to state names (values).
  states <- c(
    AS = "Aguascalientes",
    BC = "Baja California",
    BS = "Baja California Sur",
    CC = "Campeche",
    CS = "Chiapas",
    CH = "Chihuahua",
    CL = "Coahuila",
    CM = "Colima",
    DF = "Ciudad de México",
    DG = "Durango",
    GT = "Guanajuato",
    GR = "Guerrero",
    HG = "Hidalgo",
    JC = "Jalisco",
    MC = "Estado de México",
    MN = "Michoacán",
    MS = "Morelos",
    NT = "Nayarit",
    NL = "Nuevo León",
    OC = "Oaxaca",
    PL = "Puebla",
    QT = "Querétaro",
    QR = "Quintana Roo",
    SP = "San Luis Potosí",
    SL = "Sinaloa",
    SR = "Sonora",
    TC = "Tabasco",
    TS = "Tamaulipas",
    TL = "Tlaxcala",
    VZ = "Veracruz",
    YN = "Yucatán",
    ZS = "Zacatecas"
  )

  ##### Core functions ####
  parse_gender <- function(x) {
    code <- substr(x, 11, 11)
    # Only H and M are recognised; anything else is NA instead of being
    # silently labelled "Female".
    ifelse(code == "H", "Male", ifelse(code == "M", "Female", NA_character_))
  }
  parse_pob <- function(x) {
    code <- substr(x, 12, 13)
    unname(states[match(code, names(states))])
  }
  parse_bdate <- function(x) {
    yymmdd <- substr(x, start = 5, stop = 10)
    # Compare the two-digit year numerically (not as text). Non-numeric year
    # characters become NA here and produce an NA date below.
    yy <- suppressWarnings(as.integer(substr(yymmdd, 1, 2)))
    current_yy <- as.integer(format(Sys.Date(), "%y"))
    # Heuristic: a two-digit year not later than the current one is read as
    # 20yy, otherwise 19yy, so people aged 100 or more are misdated.
    century <- ifelse(yy <= current_yy, "20", "19")
    as.Date(paste0(century, yymmdd), format = "%Y%m%d")
  }

  #### Switch ####
  switch(
    element,
    gender = parse_gender(curp),
    pob = parse_pob(curp),
    bdate = parse_bdate(curp),
    all = list(
      "Gender" = parse_gender(curp),
      "Birth Date" = parse_bdate(curp),
      "Place of Birth" = parse_pob(curp)
    )
  )
}
	#' ---
	#' title: "parse_curp"
	#' author: "Juan C. López Tavera"
	#' date: "10/6/2017"
	#' output:
	#' html_document:
	#' keep_md: yes
	#' pdf_document: default
	#' ---
	#'
	#' # CURP Parser
	#'
	#' ## Description
	#' Use `parse_curp` to parse one or more CURP keys (`curp`), choosing which element (`element`) of the CURP to extract (see Arguments).
	#'
	#' ## Usage
	#' `parse_curp(curp, element = "all")`, where `element` is one of `"all"`, `"gender"`, `"bdate"`, `"pob"`
	#'
	#'
	#' ## Arguments
	#'
	#' * `curp` - a syntactically valid CURP key
	#' * `element` - a demographic attribute to extract from the CURP key
	#'
	#' ## Details
	#'
	#' CURP (Clave Única de Registro de Población) is the unique key used by the Mexican government to identify Mexican citizens and residents in Mexico.
	#'
	#' CURP keys are 18-character alphanumeric strings. The characters in the CURP encode:
	#'
	#' * Last name
	#' * First name
	#' * Birth date
	#' * Binary gender
	#' * State of birth
	#'
	#' The last two characters in the CURP key ensure its uniqueness.
	#'
	#' Consider the fictitious CURP: `"AAPR630321HDFLRC09"`
	#'
	#' AA - first last name initial and first vowel in first last name (second vowel if the first last name initial is a vowel)
	#' P - second last name initial letter, X if there's only one last name
	#' R - first name initial letter
	#' 63 - birth year (%y)
	#' 03 - birth month (%m)
	#' 21 - birth day (%d)
	#' H - binary gender. H, for male; M, for female
	#' DF - two-letter state-of-birth abbreviation (NE for people born outside of Mexico)
	#' LRC - first consonant letter of first last name, second last name, and first name (X, if there's no element)
	#' 09 - two digits to ensure uniqueness of CURP key
	#'
	## ----parse_curp----------------------------------------------------------
	# Parse demographic attributes out of CURP keys.
	#
	# NOTE(review): parse_curp is also defined earlier in this file; this later
	# definition is the one that stays bound after the file is sourced.
	# Consider removing one of the two copies.
	#
	# Arguments:
	#   curp    - vector of CURP keys. Any entry that is NA, is not exactly 18
	#             characters long, or contains a non-alphanumeric character is
	#             replaced by NA before parsing, so its parsed fields are NA.
	#   element - which attribute to return: "all" (default), "gender", "bdate"
	#             or "pob". Any other value is an error.
	#
	# Value:
	#   "gender" - "Male" for H, "Female" for M, NA for any other character
	#              (position 11).
	#   "bdate"  - Date built from characters 5-10 (yymmdd); see parse_bdate
	#              below for how the century is chosen.
	#   "pob"    - state name for the code in characters 12-13; NA when the code
	#              is not in the lookup table (e.g. "NE").
	#   "all"    - list with elements "Gender", "Birth Date", "Place of Birth".
	parse_curp <- function(curp, element = "all") {
	  # Fail loudly on an unknown element; switch() alone would return NULL.
	  element <- match.arg(element, c("all", "gender", "bdate", "pob"))

	  ##### Controls ####
	  is_well_formed <- function(key) {
	    !is.na(key) && nchar(key) == 18 && !grepl("[^[:alnum:]]+", key)
	  }
	  # vapply() has a fixed return type, so zero-length input gives logical(0)
	  # rather than the list() that sapply() returns (which `!` cannot negate).
	  valid_curp <- vapply(curp, is_well_formed, logical(1))
	  curp <- ifelse(test = valid_curp, curp, NA)

	  ##### Lookup table ####
	  # State-of-birth codes (names) mapped to state names (values).
	  states <- c(
	    AS = "Aguascalientes",
	    BC = "Baja California",
	    BS = "Baja California Sur",
	    CC = "Campeche",
	    CS = "Chiapas",
	    CH = "Chihuahua",
	    CL = "Coahuila",
	    CM = "Colima",
	    DF = "Ciudad de México",
	    DG = "Durango",
	    GT = "Guanajuato",
	    GR = "Guerrero",
	    HG = "Hidalgo",
	    JC = "Jalisco",
	    MC = "Estado de México",
	    MN = "Michoacán",
	    MS = "Morelos",
	    NT = "Nayarit",
	    NL = "Nuevo León",
	    OC = "Oaxaca",
	    PL = "Puebla",
	    QT = "Querétaro",
	    QR = "Quintana Roo",
	    SP = "San Luis Potosí",
	    SL = "Sinaloa",
	    SR = "Sonora",
	    TC = "Tabasco",
	    TS = "Tamaulipas",
	    TL = "Tlaxcala",
	    VZ = "Veracruz",
	    YN = "Yucatán",
	    ZS = "Zacatecas"
	  )

	  ##### Core functions ####
	  parse_gender <- function(x) {
	    code <- substr(x, 11, 11)
	    # Only H and M are recognised; anything else is NA instead of being
	    # silently labelled "Female".
	    ifelse(code == "H", "Male", ifelse(code == "M", "Female", NA_character_))
	  }
	  parse_pob <- function(x) {
	    code <- substr(x, 12, 13)
	    unname(states[match(code, names(states))])
	  }
	  parse_bdate <- function(x) {
	    yymmdd <- substr(x, start = 5, stop = 10)
	    # Compare the two-digit year numerically (not as text). Non-numeric year
	    # characters become NA here and produce an NA date below.
	    yy <- suppressWarnings(as.integer(substr(yymmdd, 1, 2)))
	    current_yy <- as.integer(format(Sys.Date(), "%y"))
	    # Heuristic: a two-digit year not later than the current one is read as
	    # 20yy, otherwise 19yy, so people aged 100 or more are misdated.
	    century <- ifelse(yy <= current_yy, "20", "19")
	    as.Date(paste0(century, yymmdd), format = "%Y%m%d")
	  }

	  #### Switch ####
	  switch(
	    element,
	    gender = parse_gender(curp),
	    pob = parse_pob(curp),
	    bdate = parse_bdate(curp),
	    all = list(
	      "Gender" = parse_gender(curp),
	      "Birth Date" = parse_bdate(curp),
	      "Place of Birth" = parse_pob(curp)
	    )
	  )
	}