Last active
March 6, 2022 16:24
-
-
Save gubatron/6d7e8d8685c3fda4774ee1f61b846f88 to your computer and use it in GitHub Desktop.
remove repeated entries on a sqlite3 table (there's no delete limit 1 in sqlite3, you gotta do a select within the delete statement)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
# python script to cleanup double entries on the marketsnapshots table. | |
#CREATE TABLE marketsnapshots ( | |
# symbol TEXT, | |
# ... | |
# lastupdatetime INTEGER); | |
def symbolList(cur): | |
sql = 'select distinct symbol from marketsnapshots order by symbol asc'; | |
symbols=[] | |
for row in cur.execute(sql): | |
symbols.append(row[0]) | |
return symbols | |
def findDuplicateTimestamps(symbol, cur): | |
sql='select lastupdatetime from marketsnapshots where symbol = ? order by lastupdatetime asc'; | |
hist={} | |
for row in cur.execute(sql, (symbol,)): | |
ts = row[0] | |
if ts in hist.keys(): | |
hist[ts] = 1 + hist[ts] | |
else: | |
hist[ts] = 1 | |
duplicateTSs = [] | |
for k,v in hist.items(): | |
if v > 1: | |
duplicateTSs.append(k) | |
return duplicateTSs | |
def removeDuplicates(symbol, duplicateTimestamps, cur, con): | |
for ts in duplicateTimestamps: | |
sql = 'delete from marketsnapshots where rowid = (select rowid from marketsnapshots where symbol = ? and lastupdatetime = ? limit 1)'; | |
cur.execute(sql, (symbol, ts,)) | |
con.commit() | |
con = sqlite3.connect("scribe.sqlite") | |
cur = con.cursor() | |
symbols = symbolList(cur) | |
for symbol in symbols: | |
duplicatesForSymbol = findDuplicateTimestamps(symbol, cur) | |
print(f'Duplicates for {symbol}:', len(duplicatesForSymbol)) | |
print(f'Removing duplicates for {symbol}...') | |
removeDuplicates(symbol, duplicatesForSymbol, cur, con) | |
print('='*80) | |
print() | |
con.close() | |
# the sqlite file's size will not change after running the script | |
# you must perform 'VACCUUM;' on the db for the deleted records to be purged from disk |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment