Created
December 21, 2017 09:51
-
-
Save AntonPetrov/9b95000ac13ec7f688ede7ec5770620f to your computer and use it in GitHub Desktop.
Deduplicate references on pubmed id
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE backup_rnc_reference_map AS SELECT * FROM rnc_reference_map; | |
-- choose max reference id out of all references with same pmid | |
CREATE TEMP TABLE rnc_reference_map_max_id AS | |
select max(id) as max_id, pmid | |
from rnc_references | |
where | |
pmid is not null | |
and pmid != '' | |
group by pmid | |
having count(*) > 1; | |
-- find row ids in the mapping table that refer to duplicate pmids but don't have the max id | |
create temp table rnc_reference_ids_to_delete as | |
select t1.id | |
from | |
rnc_reference_map t1, rnc_references t2, rnc_reference_map_max_id t3 | |
where | |
t1.reference_id = t2.id | |
and t2.pmid = t3.pmid | |
and t2.id != t3.max_id; | |
CREATE INDEX rnc_reference_ids_to_delete$id ON rnc_reference_ids_to_delete (id); | |
delete from rnc_reference_map where id in (select id from rnc_reference_ids_to_delete); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment