Skip to content

Instantly share code, notes, and snippets.

@lagoan
Created December 1, 2022 17:22
Show Gist options
  • Save lagoan/b57cab1a58d0b82762f05f9d06c0a9d6 to your computer and use it in GitHub Desktop.
Save lagoan/b57cab1a58d0b82762f05f9d06c0a9d6 to your computer and use it in GitHub Desktop.
Script used to generate a report on ERA Items with potential problems in the created date field.
# frozen_string_literal: true
# Run report to find items in which dates were modified by adding the month 01
# and/or the day 01.
#
# For every audited model four CSV files are written under the root directory:
# * <model>_date_errors_current_day_<timestamp>.csv - current created date has
#   month 01 and day 01
# * <model>_date_errors_in_updates_<timestamp>.csv  - an update in the item's
#   history added the day 01 (and possibly the month 01)
# * <model>_just_year_<timestamp>.csv                - created date is YYYY
# * <model>_just_year_month_<timestamp>.csv          - created date is YYYY-MM
# A fifth file, date_warning_<timestamp>.csv, lists history changes whose old
# or new created value is missing or is not in one of the expected formats.
class DateReportGenerator
  def initialize
    @root_directory = './era_audit/'
    # Looking for dates in formats YYYY-MM-DD, YYYY-MM, YYYY
    # NOTE(review): ^ and $ anchor per line, not per string; \A and \z would be
    # stricter. Left untouched so the set of reported items does not change.
    @date_format = /^(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?$/
    # [month problem message or nil, day problem message], filled in by
    # date_problem_by_versions? for the entity it was last called with.
    @errors = []
    @time_current = Time.current.to_formatted_s(:number)
    @csv_warning = CSV.open(report_path("date_warning_#{@time_current}.csv"), 'wb',
                            write_headers: true,
                            headers: ['Type', 'URL', 'Original date', 'Changed date'])
  end

  # Generates every report and always closes the warning file afterwards.
  def run
    run_check
  ensure
    @csv_warning.close
  end

  private

  # Matches the entity's current created date against the expected formats.
  # Returns the MatchData (1 = year, 2 = month, 3 = day) or nil when the date
  # is missing or in another format.
  def created_match(entity)
    entity.created&.match(@date_format)
  end

  # This method tests when a date was originally created with month 01 and day
  # 01. This problem comes when the original date was changed on
  # ingestion/deposit either through the batch ingestion workflow or the user
  # interface
  def date_problem_by_month_and_day?(entity)
    match = created_match(entity)
    !match.nil? && match[2] == '01' && match[3] == '01'
  end

  # We will consider possible problem dates with changes in with the format
  # yyyy-mm -> yyyy-mm-01
  # yyyy -> yyyy-01-01
  # These changes are tracked through updates in the item's history.
  #
  # Returns true at the first offending update and leaves the matching
  # messages in @errors; returns false (with @errors empty) otherwise.
  def date_problem_by_versions?(entity)
    @errors = []
    entity.versions.each do |version|
      next unless version.event == 'update'

      created_value = version.changeset['created']
      next unless created_value

      date_matches = get_date_matches(created_value)
      next if date_missing?(entity, date_matches, created_value)
      # In practice, a change in day from nil to 01 catches both instances of the problem
      next unless date_matches[:original][3].nil? && date_matches[:change][3] == '01'

      check_for_version_errors(created_value, date_matches)
      return true
    end
    false
  end

  # Returns true, and records a row in the warning CSV, when either side of
  # the change is missing or not in a recognised date format.
  def date_missing?(entity, date_matches, created_value)
    return false if date_matches[:original] && date_matches[:change]

    @csv_warning << ([entity.class.name, get_entity_url(entity)] + created_value)
    true
  end

  # True when the current created date is only a year (YYYY).
  def just_year?(entity)
    match = created_match(entity)
    !match.nil? && match[2].nil? && match[3].nil?
  end

  # True when the current created date is a year and a month (YYYY-MM).
  def just_year_month?(entity)
    match = created_match(entity)
    !match.nil? && !match[2].nil? && match[3].nil?
  end

  # created_value is the [old, new] pair taken from a version's changeset.
  # Either side may be nil (date added or removed in that update); in that
  # case the corresponding match is nil so date_missing? can report it
  # instead of the report aborting with NoMethodError.
  def get_date_matches(created_value)
    {
      original: created_value[0]&.match(@date_format),
      change: created_value[1]&.match(@date_format)
    }
  end

  # Fills @errors with [month message or nil, day message] for a change that is
  # already known to have added the day 01. The month message is set only
  # when the original value had no month and the new value has month 01.
  def check_for_version_errors(created_value, date_matches)
    problem_dates = "#{created_value[0]} to #{created_value[1]}"
    month_added = date_matches[:original][2].nil? && date_matches[:change][2] == '01'
    @errors = [
      ("Month 01 was added from #{problem_dates}" if month_added),
      "Day 01 was added from #{problem_dates}"
    ]
  end

  # Maps attribute names to the prefixed name of their RDF predicate when the
  # model declares an annotation for them; other attributes keep their name.
  def get_entity_headers(entity_attributes, klass)
    entity_attributes.map do |key|
      annotation = klass.rdf_annotation_for_attr(key)
      next key unless annotation.present?

      RDF::URI(annotation.first.predicate).pname.to_s
    end
  end

  # Returns the public URL for an Item or Thesis, nil for anything else.
  def get_entity_url(entity)
    if entity.instance_of?(Item)
      Rails.application.routes.url_helpers.item_url(id: entity.id)
    elsif entity.instance_of?(Thesis)
      Rails.application.routes.url_helpers.thesis_url(id: entity.id)
    end
  end

  # Builds the path of a report file inside the root directory.
  def report_path(file_name)
    File.join(@root_directory, file_name)
  end

  # Opens a timestamped CSV report for writing with the given header row.
  def open_report(base_name, headers)
    CSV.open(report_path("#{base_name}_#{@time_current}.csv"), 'wb',
             write_headers: true, headers: headers)
  end

  def run_check
    # At this time we are only checking the created date for the Item model. I
    # am keeping this pattern of listing classes if we want to change the models
    # in the future
    [Item].each do |klass|
      entity_type = klass.name.underscore
      entity_attributes = klass.first.attributes.keys
      simple_headers = ['URL'] + get_entity_headers(entity_attributes, klass)
      version_headers = simple_headers + ['Month problem', 'Day problem']

      # Only successfully opened files end up here, so the ensure below never
      # calls close on nil and never hides the error that stopped the run.
      reports = {}
      begin
        reports[:simple] = open_report("#{entity_type}_date_errors_current_day", simple_headers)
        reports[:version] = open_report("#{entity_type}_date_errors_in_updates", version_headers)
        reports[:just_year] = open_report("#{entity_type}_just_year", simple_headers)
        reports[:just_year_month] = open_report("#{entity_type}_just_year_month", simple_headers)

        klass.find_each do |entity|
          row = [get_entity_url(entity)] + entity.values_at(entity_attributes)
          reports[:simple] << row if date_problem_by_month_and_day?(entity)
          # @errors is populated by date_problem_by_versions? before the row is built
          reports[:version] << (row + @errors) if date_problem_by_versions?(entity)
          reports[:just_year] << row if just_year?(entity)
          reports[:just_year_month] << row if just_year_month?(entity)
        end
      ensure
        reports.each_value(&:close)
      end
    end
  end
end
# Script entry point: build the generator (which opens the warning CSV) and
# produce every report in a single pass.
DateReportGenerator.new.run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment