Skip to content

Instantly share code, notes, and snippets.

@stompro
Last active June 16, 2018 18:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stompro/d036829ff1fd4e0fe3dcddf8a75b4e77 to your computer and use it in GitHub Desktop.
Save stompro/d036829ff1fd4e0fe3dcddf8a75b4e77 to your computer and use it in GitHub Desktop.
Purge unused authority records - Evergreen ILS
Before you try to clean up your authority records, you should have run the authority linking scripts,
and then set them up to run on a regular basis via cron.
#Link bibs to authorities
/openils/bin/authority_control_fields.pl -a
#Link authorities to other authorities.
/openils/bin/authority_authority_linker.pl -a
-- While double checking authority removal, I found that our system
-- had 120 deleted bibs referenced in metabib.browse_entry_def_map,
-- which caused some deleted authority records to be referenced there
-- also.
-- -- Deleted bibs associated with metabib.browse_entry_def_map -- why?
select distinct bre.id,bre.fingerprint, bre.deleted from metabib.browse_entry_def_map mbedm
join biblio.record_entry bre on bre.id=mbedm.source and bre.deleted
order by bre.id;
-- Update them to fix them - triggers a reingest for those records
update biblio.record_entry
set id=id
where
id in (
select distinct bre.id from metabib.browse_entry_def_map mbedm
join biblio.record_entry bre on bre.id=mbedm.source and bre.deleted
order by bre.id
)
;
-----------------
-- Backup authority records that are not linked, these are the ones
-- that will be purged,
-- change the schema and table to suit your needs
create table larl_temp.authority_record_entry_deleted_backup_6_2_2018
as
-- List of all linked authorities
with used_authorities as (
select distinct linked from (
select source as linked from authority.authority_linking aal
union
select target as linked from authority.authority_linking aal
union
select authority as linked from authority.bib_linking abl
union
select authority as linked from metabib.browse_entry_def_map mbedm where authority is not null
) x
)
-- Unused authority records
select are.* from authority.record_entry are
left outer join used_authorities ua on ua.linked=are.id
where
ua.linked is null -- we only want authorities that are not in the linked list.
and not are.deleted
order by id
--limit 1000
;
-- make a note/calendar entry to remove this table at some point.
-- you could also export it to a file to keep it somewhere else, not as part of the database.
-- -------------------------------------------------
-- Mark unused/unlinked authority records as deleted
-- Disable a few triggers to speed up the process, since we are only deleting the records
-- these triggers don't make any changes and can be disabled.
ALTER TABLE authority.record_entry DISABLE TRIGGER update_headings_tgr;
ALTER TABLE authority.record_entry DISABLE TRIGGER a_marcxml_is_well_formed;
ALTER TABLE authority.record_entry DISABLE TRIGGER b_maintain_901;
ALTER TABLE authority.record_entry DISABLE TRIGGER c_maintain_control_numbers;
ALTER TABLE authority.record_entry DISABLE TRIGGER map_thesaurus_to_control_set;
begin;
--create temp table with all the authorities that are not linked to anything.
create temporary table unused_auth
on commit drop
as
-- List of all linked authorities
with used_authorities as (
select distinct linked from (
select source as linked from authority.authority_linking aal
union
select target as linked from authority.authority_linking aal
union
select authority as linked from authority.bib_linking abl
union
select authority as linked from metabib.browse_entry_def_map mbedm where authority is not null
) x
)
-- Unused authority records
select are.id from authority.record_entry are
left outer join used_authorities ua on ua.linked=are.id
where
ua.linked is null -- we only want authorities that are not in the linked list.
and not are.deleted
order by id
--limit 1000
;
delete from authority.record_entry are
where are.id in
(
select id from unused_auth limit 50000 --limit the number deleted at once
)
;
commit;
-- Re-Enable triggers
ALTER TABLE authority.record_entry ENABLE TRIGGER update_headings_tgr;
ALTER TABLE authority.record_entry ENABLE TRIGGER a_marcxml_is_well_formed;
ALTER TABLE authority.record_entry ENABLE TRIGGER b_maintain_901;
ALTER TABLE authority.record_entry ENABLE TRIGGER c_maintain_control_numbers;
ALTER TABLE authority.record_entry ENABLE TRIGGER map_thesaurus_to_control_set;
--- Done deleting unused authority records
--------------------------
\pset pager off
-- Double check that there are no deleted authority records referenced anywhere.
-- If any results show up here, you need to look into why deleted authority records are being
-- referenced.
begin;
-- Temp table of all deleted authority records.
create temp table deleted_authority_records
on commit drop
as
select id from authority.record_entry where deleted;
-- metabib.browse_entry_def_map
select * from metabib.browse_entry_def_map mbedm
join deleted_authority_records dar on dar.id=mbedm.authority;
-- If any bib records are found here, run them through the authority_control_field.pl -ref -rec <recid>
-- they just need to be refreshed so they no longer point at the deleted bib.
-- vandelay.queued_authority_record
select * from vandelay.queued_authority_record vqar
join deleted_authority_records dar on dar.id=vqar.imported_as;
-- If you find any records here, clear out your authority import qeueues.
-- vandelay.authority_match
select * from vandelay.authority_match vam
join deleted_authority_records dar on dar.id=vam.eg_record;
-- authority.authority_linking Source
select * from authority.authority_linking aal
join deleted_authority_records dar on dar.id=aal.source;
-- authority.authority_linking target
select * from authority.authority_linking aal
join deleted_authority_records dar on dar.id=aal.target;
-- authority.bib_linking
select * from authority.bib_linking abl
join deleted_authority_records dar on dar.id=abl.authority;
-- authority.record_note
select * from authority.record_note arn
join deleted_authority_records dar on dar.id=arn.record
;
-- authority.simple_heading
select * from authority.simple_heading ash
join deleted_authority_records dar on dar.id=ash.record
;
rollback;
-- Look for authorities with duplicate headings, choose the copy that isn't linked to any bibs.
-- the authority.authority_linking will get fixed the next time the authority_authority_linker.pl is run.
-- Also, only select 1 of the records if there are multiple records with no bib linking, to leave at least one record
-- to be a target for authority.authority_linking.
-- Many duplicates would have already been removed by the previous step, but this step will get rid of the rest.
-- Method 1
select heading, count(heading), min(id) deleteme
from
(
select are1.id, are1.heading, are2.id dup_id
from authority.record_entry are1
join authority.record_entry are2 on are2.heading=are1.heading and are1.id != are2.id
where
not are1.deleted
and not are2.deleted
and not exists (select 1 from authority.bib_linking abl where abl.authority=are1.id)
--and not exists (select 1 from authority.authority_linking aal where aal.target=are1.id)
order by are1.heading
) x
group by heading
;
-- Delete those records
begin;
delete from authority.record_entry are
where are.id in
(
select min(id) id
from
(
select are1.id, are1.heading, are2.id dup_id
from authority.record_entry are1
join authority.record_entry are2 on are2.heading=are1.heading and are1.id != are2.id
where
not are1.deleted
and not are2.deleted
and not exists (select 1 from authority.bib_linking abl where abl.authority=are1.id)
--and not exists (select 1 from authority.authority_linking aal where aal.target=are1.id)
order by are1.heading
) x
group by heading
)
;
commit;
-- You should re-run authority_authority_linker.pl at this point.
-- Method 2
-- Another strategy is to just delete one of each of the duplicates, and then re-link the bibs that
-- pointed to the deleted duplicate copy, which should also trigger a reingest and fix metabib.browse_def_map
--
-- Delete one of every duplicate.
begin;
-- Commands to run to re-link the bibs that are affected.
-- After delete is committed, run these commands.
select './authority_control_fields_custom.pl --refresh --record='||bib from
(
select distinct abl.bib from authority.bib_linking abl
where
abl.authority in (
select xx.min id
from
(
select heading, count(id), string_agg(id::text,', ' order by id) authority_ids, min(id)
from authority.record_entry
where
not deleted
group by heading
having count(id) > 1
order by count(id) desc
) xx
)
order by abl.bib
) x
;
delete from authority.record_entry are
where are.id in
(
select x.min id
from
(
select heading, count(id), string_agg(id::text,', ' order by id) authority_ids, min(id)
from authority.record_entry
where
not deleted
group by heading
having count(id) > 1
order by count(id) desc
) x
)
;
commit;
-- Now take the output from the first query and put it in a shell script and run it
-- so all the bibs that had their authorities so cruely taken from them will get reunited
-- with their twin.
-- You should re-run authority_authority_linker.pl at this point.
-- -------------
-- Delete authority records for real
-- How many deleted authorities are there?
select count(id) from authority.record_entry where deleted;
-- Vacumm analyze authority.record_entry to make sure planner knows about current state of table.
VACUUM ANALYZE VERBOSE authority.record_entry;
-- Create several indexes to speed up foreign key checks on delete
-- create index for authority entries on metabib.browse_entry_def_map to speed up deletes.
CREATE INDEX browse_entry_def_map_authority_idx ON metabib.browse_entry_def_map (authority);
-- Create index for authority.authority_linking (source)
CREATE INDEX authority_linking_source_idx on authority.authority_linking (source);
-- Create index for authority.authority_linking (target)
CREATE INDEX authority_linking_target_idx on authority.authority_linking (target);
-- Create index for authority.bib_linking (authority)
CREATE INDEX bib_linking_authority_idx on authority.bib_linking (authority);
begin;
-- Disable delete protect rule in transaction.
ALTER TABLE authority.record_entry DISABLE RULE protect_authority_rec_delete;
-- Perform delete -- explain analyze
delete from authority.record_entry are
where are.id in
(
select id from authority.record_entry where deleted order by id limit 30000
)
;
commit;
-- How many deleted authorities are there?
select count(id) from authority.record_entry where deleted;
-- Enable delete protect rule in transaction.
ALTER TABLE authority.record_entry ENABLE RULE protect_authority_rec_delete;
-- Drop index on authority column of metabib.browse_entry_def_map
DROP INDEX metabib.browse_entry_def_map_authority_idx;
-- Drop index on source column of authority.authority_linking
DROP INDEX authority.authority_linking_source_idx;
-- Drop index on target column of authority.authority_linking
DROP INDEX authority.authority_linking_target_idx;
-- Drop index on authority column of authority.bib_linking
DROP INDEX authority.bib_linking_authority_idx;
-- Vacumm analyze authority.record_entry to make sure planner knows about current state of table.
VACUUM ANALYZE VERBOSE authority.record_entry;
-- To reclaim the removed space, run a vacumm full, but only do that during a maintenance window, since
-- it locks the table and re-writes the whole thing.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment