Skip to content

Instantly share code, notes, and snippets.

@gubatron
Last active March 6, 2022 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gubatron/6d7e8d8685c3fda4774ee1f61b846f88 to your computer and use it in GitHub Desktop.
Save gubatron/6d7e8d8685c3fda4774ee1f61b846f88 to your computer and use it in GitHub Desktop.
remove repeated entries on a sqlite3 table (there's no delete limit 1 in sqlite3, you gotta do a select within the delete statement)
import sqlite3
# python script to cleanup double entries on the marketsnapshots table.
#CREATE TABLE marketsnapshots (
# symbol TEXT,
# ...
# lastupdatetime INTEGER);
def symbolList(cur):
sql = 'select distinct symbol from marketsnapshots order by symbol asc';
symbols=[]
for row in cur.execute(sql):
symbols.append(row[0])
return symbols
def findDuplicateTimestamps(symbol, cur):
sql='select lastupdatetime from marketsnapshots where symbol = ? order by lastupdatetime asc';
hist={}
for row in cur.execute(sql, (symbol,)):
ts = row[0]
if ts in hist.keys():
hist[ts] = 1 + hist[ts]
else:
hist[ts] = 1
duplicateTSs = []
for k,v in hist.items():
if v > 1:
duplicateTSs.append(k)
return duplicateTSs
def removeDuplicates(symbol, duplicateTimestamps, cur, con):
for ts in duplicateTimestamps:
sql = 'delete from marketsnapshots where rowid = (select rowid from marketsnapshots where symbol = ? and lastupdatetime = ? limit 1)';
cur.execute(sql, (symbol, ts,))
con.commit()
con = sqlite3.connect("scribe.sqlite")
cur = con.cursor()
symbols = symbolList(cur)
for symbol in symbols:
duplicatesForSymbol = findDuplicateTimestamps(symbol, cur)
print(f'Duplicates for {symbol}:', len(duplicatesForSymbol))
print(f'Removing duplicates for {symbol}...')
removeDuplicates(symbol, duplicatesForSymbol, cur, con)
print('='*80)
print()
con.close()
# the sqlite file's size will not change after running the script
# you must perform 'VACCUUM;' on the db for the deleted records to be purged from disk
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment