Skip to content

Instantly share code, notes, and snippets.

@AntonPetrov
Created December 21, 2017 09:51
Show Gist options
  • Save AntonPetrov/9b95000ac13ec7f688ede7ec5770620f to your computer and use it in GitHub Desktop.
Save AntonPetrov/9b95000ac13ec7f688ede7ec5770620f to your computer and use it in GitHub Desktop.
Deduplicate references on pubmed id
CREATE TABLE backup_rnc_reference_map AS SELECT * FROM rnc_reference_map;
-- choose max reference id out of all references with same pmid
CREATE TEMP TABLE rnc_reference_map_max_id AS
select max(id) as max_id, pmid
from rnc_references
where
pmid is not null
and pmid != ''
group by pmid
having count(*) > 1;
-- find row ids in the mapping table that refer to duplicate pmids but don't have the max id
create temp table rnc_reference_ids_to_delete as
select t1.id
from
rnc_reference_map t1, rnc_references t2, rnc_reference_map_max_id t3
where
t1.reference_id = t2.id
and t2.pmid = t3.pmid
and t2.id != t3.max_id;
CREATE INDEX rnc_reference_ids_to_delete$id ON rnc_reference_ids_to_delete (id);
delete from rnc_reference_map where id in (select id from rnc_reference_ids_to_delete);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment