Skip to content

Instantly share code, notes, and snippets.

@skeptrunedev
Created November 30, 2023 22:01
Show Gist options
  • Save skeptrunedev/47bc69cd06359ec40aca4701932ded5f to your computer and use it in GitHub Desktop.
Save skeptrunedev/47bc69cd06359ec40aca4701932ded5f to your computer and use it in GitHub Desktop.
python script to dedup a database by column
import psycopg2
from dotenv import load_dotenv
import os
import json
import requests
# Load the .env file
load_dotenv()
origin_db_url = os.environ.get("ORIGIN_DB_URL")
conn = psycopg2.connect(origin_db_url)
cur = conn.cursor()
conn2 = psycopg2.connect(origin_db_url)
cur2 = conn2.cursor()
cur.execute(
"""SELECT id
FROM card_metadata
WHERE qdrant_point_id IS NULL
AND metadata NOT IN (
SELECT metadata
FROM card_metadata
GROUP BY metadata
HAVING COUNT(*) = 1
);"""
)
while True:
# Fetch 20 rows
rows = cur.fetchmany(10000)
# If no more rows are available, break the loop
if not rows:
break
ids = [row[0] for row in rows]
# delete all the rows from the card_collisions table which have one of the ids as their value for card_metadata_id
# print the number of rows deleted
cur2.execute("""DELETE FROM card_collisions WHERE card_id IN %s""", (tuple(ids),))
num_rows_deleted = cur2.rowcount
print(num_rows_deleted)
# delete all the rows from the card_metadata table which have one of the ids as their value for id
# print the number of rows deleted
cur2.execute("""DELETE FROM card_metadata WHERE id IN %s""", (tuple(ids),))
num_rows_deleted = cur2.rowcount
print(num_rows_deleted)
# commit the changes
conn2.commit()
# Close the cursor and connection
cur.close()
conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment