technickle/ValidateOpen311GeoReportBulk.r

## ValidateOpen311GeoReportBulk.r
# this R script evaluates a data file for compatibility with the Open311 GeoReport Bulk specification.
#   see here for the most recent version of the specification:
#     http://wiki.open311.org/GeoReport/bulk
#
# it implements nearly all of the checks identified in this document
#   https://docs.google.com/document/d/1GLRniiT3xvmG-i6PPeZPZDK_FhBDGCpuVh5fCexEiys/preview
# however, it is very bare bones and the results need to be interpreted.
#
# written by Andrew Nicklin (@technickle) with contributions from the Open311 community.
#
# prerequesite libraries: readr, validate
#   these can be installed by entering the following commands at an R prompt.
#     packages.install("readr")
#     packages.install("validate")


library(readr)
library(validate)

print("Checking for critical issues...")

print("loading and parsing data.")
testdata <- readr::read_csv("YOUR FILENAME HERE",
    na = c("","NA"),
    col_types = cols(
    .default = col_character(),
    requested_datetime = col_datetime(),
    updated_datetime = col_datetime(),
    closed_date = col_datetime(),
    lat = col_number(),
    long = col_number()
    )

  )
print("Interpreting the above: an error, particularly with datetime columns, indicates that the data values are probably not in ISO8601 format. If there is an error for closed_date, it may be that you didn't include this optional field in your data. Other errors should also be investigated, and are probably due to file structure or content formatting.")

print("Required columns and null values:")
nulls_critical <- validate::validator(
  !is.na(service_request_id),
  !is.na(requested_datetime),
  !is.na(updated_datetime),
  !is.na(status_description),
  !is.na(status_notes),
  !is.na(service_name),
  !is.na(address),
  !is.na(lat),
  !is.na(long)
)
names(nulls_critical) <- c('service_request_id','requested_datetime','updated_datetime','status_description','status_notes','service_name','address','lat','long')
summary(validate::confront(testdata, nulls_critical))
print("Interpreting the above: a TRUE in the `error` column means the required column is missing. A value greater than 0 in the `fails` column means required values in that field are missing for the displayed number of records. In a perfect dataset, all `items` and `passes` numbers are identical, and all `errors` are FALSE.")

print("Duplicate values in service_request_id:")
sprintf("dulicate service_request_id values found (should be 0): %s", length(testdata$service_request_id[duplicated(testdata$service_request_id)]))

# alternate approach
sprintf("alternate count of duplicate service_request_id values found (should be 0): %s", length(testdata$service_request_id) - length(unique(testdata$service_request_id)))

print("Invalid lat/longs")
invalid_lat_longs <- validate::validator(
  lat > -90.0,
  lat < 90.0,
  long > -180.0,
  long < 180.0,
  lat != 0,
  long != 0
)
names(invalid_lat_longs) <- c("lat < -90","lat > 90","long < -180","long > 180", "lat = 0", "long = 0")
summary(validate::confront(testdata, invalid_lat_longs))
print("Interpreting the above: any values other than 0 in the `fails` column means you may have issues accurately rendering some records on a map.")

# warnings
print("Checking for non-critical issues...")

print("Additional columns and their values:")
nulls_noncritical <- validate::validator(
  !is.na(closed_date),
  !is.na(source),
  !is.na(service_subtype),
  !is.na(description),
  !is.na(agency_responsible)
)
names(nulls_noncritical) <- c("closed_date","source","service_subtype","description","agency_responsible")
summary(validate::confront(testdata, nulls_noncritical))
print("Interpreting the above: a TRUE in the `error` column means a non-required column is been omitted. Values other than zero in the `fails` column represent the number of records which are missing data for that column. A closer look may be required if those values seem very high compared to the number in `items`.")

print("Date consistency checking:")
date_consistency <- validate::validator(
  requested_datetime <= updated_datetime,
  requested_datetime <= closed_date
)
names(date_consistency) <- c("requested_datetime <= updated_datetime", "requested_datetime <= closed_datetime")
summary(validate::confront(testdata, date_consistency))
print("Interpreting the above: a TRUE in the error column means one of the columns is missing. A value greater than 0 in the `fails` column indicates the number of records which have follow-up date values that *precede* request_datetime. Performing an elapsed time calculation will produce a negative or invalid result for each of these records. A value other than 0 in the `nNA` column means the data value in one of the columns (hopefully not `requested_datetime`) is not present. This might be acceptable, particularly for service requests that are not yet resolved.")

print("Reporting number of values for columns which may be used to group the data. Lower numbers are preferred. Exceeding the recommended number doesn't necessarily mean your data is incorrect, but it might be worth verifying.")
sprintf("Number of unique service_name values (less than 200 is preferred): %s", length(unique(testdata$service_name)))
sprintf("Number of unique agency_responsible values (less than 50 is preferred): %s", length(unique(testdata$agency_responsible)))
sprintf("Number of unique source values (less than 20 is preferred): %s", length(unique(testdata$source)))
sprintf("Number of unique status_description values (less than 10 is preferred): %s", length(unique(testdata$status_description)))
print("Test complete!")
	# this R script evaluates a data file for compatibility with the Open311 GeoReport Bulk specification.
	# see here for the most recent version of the specification:
	# http://wiki.open311.org/GeoReport/bulk
	#
	# it implements nearly all of the checks identified in this document
	# https://docs.google.com/document/d/1GLRniiT3xvmG-i6PPeZPZDK_FhBDGCpuVh5fCexEiys/preview
	# however, it is very bare bones and the results need to be interpreted.
	#
	# written by Andrew Nicklin (@technickle) with contributions from the Open311 community.
	#
	# prerequesite libraries: readr, validate
	# these can be installed by entering the following commands at an R prompt.
	# packages.install("readr")
	# packages.install("validate")


	library(readr)
	library(validate)

	print("Checking for critical issues...")

	print("loading and parsing data.")
	testdata <- readr::read_csv("YOUR FILENAME HERE",
	na = c("","NA"),
	col_types = cols(
	.default = col_character(),
	requested_datetime = col_datetime(),
	updated_datetime = col_datetime(),
	closed_date = col_datetime(),
	lat = col_number(),
	long = col_number()
	)

	)
	print("Interpreting the above: an error, particularly with datetime columns, indicates that the data values are probably not in ISO8601 format. If there is an error for closed_date, it may be that you didn't include this optional field in your data. Other errors should also be investigated, and are probably due to file structure or content formatting.")

	print("Required columns and null values:")
	nulls_critical <- validate::validator(
	!is.na(service_request_id),
	!is.na(requested_datetime),
	!is.na(updated_datetime),
	!is.na(status_description),
	!is.na(status_notes),
	!is.na(service_name),
	!is.na(address),
	!is.na(lat),
	!is.na(long)
	)
	names(nulls_critical) <- c('service_request_id','requested_datetime','updated_datetime','status_description','status_notes','service_name','address','lat','long')
	summary(validate::confront(testdata, nulls_critical))
	print("Interpreting the above: a TRUE in the `error` column means the required column is missing. A value greater than 0 in the `fails` column means required values in that field are missing for the displayed number of records. In a perfect dataset, all `items` and `passes` numbers are identical, and all `errors` are FALSE.")

	print("Duplicate values in service_request_id:")
	sprintf("dulicate service_request_id values found (should be 0): %s", length(testdata$service_request_id[duplicated(testdata$service_request_id)]))

	# alternate approach
	sprintf("alternate count of duplicate service_request_id values found (should be 0): %s", length(testdata$service_request_id) - length(unique(testdata$service_request_id)))

	print("Invalid lat/longs")
	invalid_lat_longs <- validate::validator(
	lat > -90.0,
	lat < 90.0,
	long > -180.0,
	long < 180.0,
	lat != 0,
	long != 0
	)
	names(invalid_lat_longs) <- c("lat < -90","lat > 90","long < -180","long > 180", "lat = 0", "long = 0")
	summary(validate::confront(testdata, invalid_lat_longs))
	print("Interpreting the above: any values other than 0 in the `fails` column means you may have issues accurately rendering some records on a map.")

	# warnings
	print("Checking for non-critical issues...")

	print("Additional columns and their values:")
	nulls_noncritical <- validate::validator(
	!is.na(closed_date),
	!is.na(source),
	!is.na(service_subtype),
	!is.na(description),
	!is.na(agency_responsible)
	)
	names(nulls_noncritical) <- c("closed_date","source","service_subtype","description","agency_responsible")
	summary(validate::confront(testdata, nulls_noncritical))
	print("Interpreting the above: a TRUE in the `error` column means a non-required column is been omitted. Values other than zero in the `fails` column represent the number of records which are missing data for that column. A closer look may be required if those values seem very high compared to the number in `items`.")

	print("Date consistency checking:")
	date_consistency <- validate::validator(
	requested_datetime <= updated_datetime,
	requested_datetime <= closed_date
	)
	names(date_consistency) <- c("requested_datetime <= updated_datetime", "requested_datetime <= closed_datetime")
	summary(validate::confront(testdata, date_consistency))
	print("Interpreting the above: a TRUE in the error column means one of the columns is missing. A value greater than 0 in the `fails` column indicates the number of records which have follow-up date values that precede request_datetime. Performing an elapsed time calculation will produce a negative or invalid result for each of these records. A value other than 0 in the `nNA` column means the data value in one of the columns (hopefully not `requested_datetime`) is not present. This might be acceptable, particularly for service requests that are not yet resolved.")

	print("Reporting number of values for columns which may be used to group the data. Lower numbers are preferred. Exceeding the recommended number doesn't necessarily mean your data is incorrect, but it might be worth verifying.")
	sprintf("Number of unique service_name values (less than 200 is preferred): %s", length(unique(testdata$service_name)))
	sprintf("Number of unique agency_responsible values (less than 50 is preferred): %s", length(unique(testdata$agency_responsible)))
	sprintf("Number of unique source values (less than 20 is preferred): %s", length(unique(testdata$source)))
	sprintf("Number of unique status_description values (less than 10 is preferred): %s", length(unique(testdata$status_description)))
	print("Test complete!")