Created
December 1, 2022 17:22
-
-
Save lagoan/b57cab1a58d0b82762f05f9d06c0a9d6 to your computer and use it in GitHub Desktop.
Script used to generate a report on ERA Items with potential problems in the created date field.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true
# Runs a report to find items in which dates were modified by adding the month
# 01 and/or the day 01.
#
# Every run writes timestamped CSV files into ./era_audit/: one warning file
# for version changes whose dates cannot be parsed, plus four report files for
# each checked model.
class DateReportGenerator
  # Looking for dates in formats YYYY-MM-DD, YYYY-MM, YYYY. Groups 1, 2 and 3
  # hold the year, month and day; month and day are nil when absent. The
  # pattern is anchored with \A and \z so the whole value has to be a date,
  # not just one line of a multi-line value.
  DATE_FORMAT = /\A(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?\z/

  # Prepares the output directory and opens the warning CSV. The other report
  # files are opened later, by #run.
  def initialize
    @root_directory = './era_audit/'
    @errors = []
    # Digits-only timestamp (same layout as ActiveSupport's :number format),
    # built with strftime so it does not depend on Time#to_formatted_s.
    @time_current = Time.current.strftime('%Y%m%d%H%M%S')
    Dir.mkdir(@root_directory) unless Dir.exist?(@root_directory)
    @csv_warning = CSV.open(report_path("date_warning_#{@time_current}.csv"), 'wb',
                            write_headers: true,
                            headers: ['Type', 'URL', 'Original date', 'Changed date'])
  end

  # Generates every report and always closes the warning CSV afterwards, even
  # when a check raises.
  def run
    run_check
  ensure
    @csv_warning.close
  end

  private

  # This method tests when a date currently has month 01 and day 01. This
  # problem comes when the original date was changed on ingestion/deposit
  # either through the batch ingestion workflow or the user interface.
  #
  # @param entity [#created] record whose created date is inspected
  # @return [Boolean] true only for dates of the form YYYY-01-01
  def date_problem_by_month_and_day?(entity)
    match = created_match(entity)
    return false unless match

    match[2] == '01' && match[3] == '01'
  end

  # We will consider possible problem dates with changes in the format
  #   yyyy-mm -> yyyy-mm-01
  #   yyyy    -> yyyy-01-01
  # These changes are tracked through updates in the item's history.
  #
  # Stops at the first offending version and leaves its messages in @errors
  # (index 0: month problem, index 1: day problem). Changes whose dates cannot
  # be parsed are written to the warning CSV and skipped.
  #
  # @param entity [#versions] record whose versions respond to #event and
  #   #changeset (presumably PaperTrail versions; confirm against the models)
  # @return [Boolean] true when an update added day 01 to the created date
  def date_problem_by_versions?(entity)
    # Reset up front so messages from a previous entity can never leak through.
    @errors = []
    entity.versions.each do |version|
      next unless version.event == 'update'

      created_value = version.changeset['created']
      next unless created_value

      date_matches = get_date_matches(created_value)
      next if date_missing?(entity, date_matches, created_value)
      # In practice, a change in day from nil to 01 catches both instances of the problem
      next unless date_matches[:original][3].nil? && date_matches[:change][3] == '01'

      check_for_version_errors(created_value, date_matches)
      return true
    end
    false
  end

  # Writes a warning row when either side of a created-date change does not
  # look like a date.
  #
  # @param entity [Object] record the change belongs to
  # @param date_matches [Hash] result of #get_date_matches
  # @param created_value [Array] the [original, changed] pair from the changeset
  # @return [Boolean] true when the change cannot be analysed
  def date_missing?(entity, date_matches, created_value)
    return false unless date_matches[:original].nil? || date_matches[:change].nil?

    @csv_warning << ([entity.class.name, get_entity_url(entity)] + created_value)
    true
  end

  # @return [Boolean] true when the created date is only a year (YYYY)
  def just_year?(entity)
    match = created_match(entity)
    return false unless match

    match[2].nil? && match[3].nil?
  end

  # @return [Boolean] true when the created date is a year and a month (YYYY-MM)
  def just_year_month?(entity)
    match = created_match(entity)
    return false unless match

    !match[2].nil? && match[3].nil?
  end

  # Matches the entity's created date against DATE_FORMAT. to_s keeps a nil
  # date from raising; it simply does not match.
  #
  # @return [MatchData, nil]
  def created_match(entity)
    entity.created.to_s.match(DATE_FORMAT)
  end

  # Parses both sides of a created-date change.
  #
  # @param created_value [Array] the [original, changed] pair from the changeset
  # @return [Hash] :original and :change, each a MatchData or nil
  def get_date_matches(created_value)
    # to_s: a side of the change may be nil (date previously unset); that has
    # to reach date_missing? as "no match" instead of raising here.
    {
      original: created_value[0].to_s.match(DATE_FORMAT),
      change: created_value[1].to_s.match(DATE_FORMAT)
    }
  end

  # Fills @errors for a version already known to have gained day 01. The month
  # message is only added when the month itself went from absent to 01.
  #
  # @param created_value [Array] the [original, changed] pair from the changeset
  # @param date_matches [Hash] result of #get_date_matches, both sides matched
  def check_for_version_errors(created_value, date_matches)
    problem_dates = "#{created_value[0]} to #{created_value[1]}"
    @errors[1] = "Day 01 was added from #{problem_dates}"
    return unless date_matches[:original][2].nil? && date_matches[:change][2] == '01'

    @errors[0] = "Month 01 was added from #{problem_dates}"
  end

  # Builds the CSV header for each attribute: the prefixed name of its RDF
  # predicate when the class declares an annotation for it, the attribute name
  # otherwise.
  def get_entity_headers(entity_attributes, klass)
    entity_attributes.map do |key|
      annotation = klass.rdf_annotation_for_attr(key)
      next key unless annotation.present?

      RDF::URI(annotation.first.predicate).pname.to_s
    end
  end

  # @return [String, nil] URL of the entity, nil for anything that is neither
  #   an Item nor a Thesis
  def get_entity_url(entity)
    if entity.instance_of?(Item)
      Rails.application.routes.url_helpers.item_url(id: entity.id)
    elsif entity.instance_of?(Thesis)
      Rails.application.routes.url_helpers.thesis_url(id: entity.id)
    end
  end

  # Joins a file name onto the output directory.
  def report_path(file_name)
    File.join(@root_directory, file_name)
  end

  # Opens one report CSV for an entity type, writing the header row.
  def open_report(entity_type, report_name, headers)
    path = report_path("#{entity_type}_#{report_name}_#{@time_current}.csv")
    CSV.open(path, 'wb', write_headers: true, headers: headers)
  end

  # Walks every record of each checked model and appends it to the reports
  # whose condition it meets.
  def run_check
    # At this time we are only checking the created date for the Item model. I
    # am keeping this pattern of listing classes if we want to change the models
    # in the future
    [Item].each do |klass|
      entity_type = klass.name.underscore
      # Read from the class rather than the first record, so an empty table
      # still produces header-only reports instead of failing.
      entity_attributes = klass.attribute_names
      simple_headers = ['URL'] + get_entity_headers(entity_attributes, klass)
      version_headers = simple_headers + ['Month problem', 'Day problem']

      begin
        csv_simple = open_report(entity_type, 'date_errors_current_day', simple_headers)
        csv_version = open_report(entity_type, 'date_errors_in_updates', version_headers)
        csv_just_year = open_report(entity_type, 'just_year', simple_headers)
        csv_just_year_month = open_report(entity_type, 'just_year_month', simple_headers)

        klass.find_each do |entity|
          row = [get_entity_url(entity)] + entity.values_at(entity_attributes)

          csv_simple << row if date_problem_by_month_and_day?(entity)
          csv_version << (row + @errors) if date_problem_by_versions?(entity)
          csv_just_year << row if just_year?(entity)
          csv_just_year_month << row if just_year_month?(entity)
        end
      ensure
        # A handle is still nil when an earlier open failed; skip those so the
        # original error is not masked and the opened files still get closed.
        [csv_simple, csv_version, csv_just_year, csv_just_year_month].compact.each(&:close)
      end
    end
  end
end
# Script entry point: build the generator and write every report.
DateReportGenerator.new.run
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment