Skip to content

Instantly share code, notes, and snippets.

@genotrance
Created April 19, 2018 18:12
Show Gist options
  • Save genotrance/ef84e2ee7daab54c0a385e92c0a2757d to your computer and use it in GitHub Desktop.
Save genotrance/ef84e2ee7daab54c0a385e92c0a2757d to your computer and use it in GitHub Desktop.
import asyncdispatch
import asynctools
import docopt
import json
import nre
import os
import ospaths
import sequtils
import sha256/sha256sum
import strutils
import tables
import threadpool
import times
# ###
# Constants
const
  # A comparison score above this value marks two songs as duplicates.
  FINGERPRINT_MATCH_THRESHOLD = 0.92
  # Passed as `maxoffset` to the fingerprint matcher.
  FINGERPRINT_MATCH_OFFSET = 80
  # Mask and-ed onto every raw fingerprint item (clears the low 8 bits).
  FINGERPRINT_RELEVANT_BITS = 0xFFFFFF00'u32
  # Upper bound handed to the thread pool.
  MAX_THREADS = 4
# ###
# Handle CTRL-C
proc chandler() {.noconv.} =
  ## Ctrl-C hook: prints a farewell line and terminates with exit code 1.
  # The hook is invoked outside normal Nim control flow, so the GC is set up
  # for the calling thread before `echo` allocates anything.
  setupForeignThreadGc()
  echo "\nExiting"
  quit(QuitFailure)

# Install the hook for the whole process.
setControlCHook(chandler)
# ##
# Cap the number of worker threads available to `spawn`
setMaxPoolSize(MAX_THREADS)

# ###
# FFprobe CLI
#let FFPROBE = ["-hide_banner", "-of", "json", "-v", "quiet", "-show_format", "-show_entries", "format=filename,duration:format_tags=title,artist,album_artist,composer,album,Acoustid Id,MusicBrainz Release Track Id"]
# Extensions (lower-case, leading dot) that the -M search fingerprints.
let FFFORMAT = [
  ".mp3",
  ".m4a",
  ".ogg",
  ".flac"
]
# ###
# Command line arguments
#
# docopt builds the parser from this text, so its layout is significant: the
# usage pattern and every option line are indented, and an option is separated
# from its description by at least two spaces. The Search / Filters / Actions
# sub-headings are indented as well so that they stay inside the Options
# section instead of terminating it.
const DOC = """
Automatic duplicate file finder

Usage:
  autodup [options] <sourcedir> [<dupdir>]

Options:
  -h --help       Show this help
  Search
    -D            Search for duplicate files
    -E            Search for empty directories
    -M            Search for duplicate music files (requires fpcalc)
  Filters
    -f            Include files only
    -d            Include directories only
    -p <pattern>  Include files / directories containing pattern
    -P <regex>    Include files / directories containing regex
    -s <fsize>    Include size greater than (in bytes)
    -S <fsize>    Include size lesser than (in bytes)
    -t <time>     Include last modified after (in days)
    -T <time>     Include last modified before (in days)
  Actions
    -m            Move search results
    -x            Delete search results
    -q            Quiet - don't display results
"""

# Parsed arguments, keyed by the option / positional names written in DOC.
# Being a threadvar, each thread has its own copy; the assignment below
# initialises it for the thread that runs the module body only.
var ARGS {.threadvar.}: Table[string, Value]
ARGS = docopt(DOC)
proc getintflag(flag: string): int =
  ## Returns the value given for command-line option `flag` as an integer.
  ##
  ## If the text cannot be parsed, an error naming the flag and the offending
  ## input is printed and the program exits with status 1.
  let raw = $ARGS[flag]
  try:
    result = parseInt(raw)
  except:
    echo "Bad integer input for " & flag & ": " & raw
    quit(1)
# Flags
# An argument whose string form is "nil" is treated as "not supplied" and
# falls back to the default on the `else` branch.
var SOURCEDIR =
  if $ARGS["<sourcedir>"] != "nil": $ARGS["<sourcedir>"]
  else: "."

# threadvar: cannot carry an initialiser, so it is assigned right after.
var DUPDIR {.threadvar.}: string
DUPDIR =
  if $ARGS["<dupdir>"] != "nil": $ARGS["<dupdir>"]
  else: "duplicates"

# Name filters (-p substring, -P regular expression); "" means unset.
var PATTERN = if $ARGS["-p"] != "nil": $ARGS["-p"] else: ""
var REGEX = if $ARGS["-P"] != "nil": $ARGS["-P"] else: ""

# Size bounds in bytes (-s / -S).
var FMINSIZE = if $ARGS["-s"] != "nil": getintflag("-s") else: 0
var FMAXSIZE = if $ARGS["-S"] != "nil": getintflag("-S") else: 0

# Modification-time bounds in days (-t / -T).
var TIMEAFTER = if $ARGS["-t"] != "nil": getintflag("-t") else: 0
var TIMEBEFORE = if $ARGS["-T"] != "nil": getintflag("-T") else: 0

# The duplicate searches (-D, -M) only look at files, -E only at directories.
var FILES_ONLY: bool = ARGS["-D"] or ARGS["-M"] or ARGS["-f"]
var DIRS_ONLY: bool = ARGS["-E"] or ARGS["-d"]

# Running totals shown in the status line.
var
  ADD: BiggestInt = 0   # accumulated size of all matches
  MATCH_COUNT = 0
  FILE_COUNT = 0
  DIR_COUNT = 0
# ###
# Tables
type
  FileSize = object
    ## Book-keeping shared by all files that have the same size.
    first: int                     ## FILES index of the first file seen with this size
    hashes: TableRef[string, int]  ## hash -> FILES index; nil until a second file of this size is hashed
# FILES[fileindex] = filename, in the order the files were accepted
var FILES = newSeq[string]()

# SIZES[filesize] = FileSize object
var SIZES = newTable[BiggestInt, FileSize]()

# SONGS[fileindex] = [aidx1, aidx2...] (masked fingerprint items of that file)
var SONGS {.threadvar.}: TableRef[int, seq[uint32]]
SONGS = newTable[int, seq[uint32]]()

# AIDX[aidx] = [fileindex1, fileindex2] (files whose fingerprint contains aidx)
var AIDX {.threadvar.}: TableRef[uint32, seq[int]]
AIDX = newTable[uint32, seq[int]]()
# ###
# Actions
proc moveaction(file, dupdir: string) =
  ## Moves `file` below `dupdir`, keeping its path minus the first component
  ## (`a/b/c.txt` ends up as `<dupdir>/b/c.txt`). Missing destination
  ## directories are created.
  ##
  ## Failures are reported on stdout and never raised, as this proc is run
  ## through `spawn`.
  let dest = dupdir & DirSep & tailDir(file)
  try:
    createDir(parentDir(dest))
    moveFile(file, dest)
  except OSError, IOError:
    # Only claim "already exists" when that is actually the case; any other
    # failure (permissions, missing source, ...) is reported with its cause.
    if fileExists(dest):
      echo "Already exists " & dest
    else:
      echo "Failed to move " & file & ": " & getCurrentExceptionMsg()
proc removeaction(file: string, info: FileInfo) =
  ## Deletes `file`: a regular file is unlinked, a directory is removed
  ## together with its contents. Other kinds (symlinks) are left untouched.
  ##
  ## Failures are reported on stdout and never raised, as this proc is run
  ## through `spawn`.
  case info.kind
  of pcFile:
    if not tryRemoveFile(file):
      echo "Failed to remove " & file
  of pcDir:
    try:
      removeDir(file)
    except OSError:
      # Only file-system failures are expected here; anything else is a bug
      # and should not be hidden behind this message.
      echo "Failed to remove dir " & file
  else:
    discard
proc action(file: string, info: FileInfo, orig = "") =
  ## Records `file` as a search result and applies the selected action.
  ##
  ## With -m the file is moved below DUPDIR, with -x it is removed; both are
  ## handed to the thread pool through `spawn`. ADD and MATCH_COUNT are
  ## updated for every result. Unless -q is given the path is printed,
  ## prefixed with "Moving " / "Removing " when an action is taken and
  ## followed by " == <orig>" when `orig` is non-empty.
  let quiet: bool = ARGS["-q"]
  if ARGS["-m"]:
    spawn moveaction(file, DUPDIR)
    # The prefix is part of the result line, so -q has to silence it as well;
    # otherwise quiet runs emit a stream of dangling "Moving " fragments.
    if not quiet:
      stdout.write("Moving ")
  elif ARGS["-x"]:
    spawn removeaction(file, info)
    if not quiet:
      stdout.write("Removing ")
  ADD += info.size
  MATCH_COUNT += 1
  if not quiet:
    echo file
    if orig != "":
      echo " == " & orig
# ###
# Helpers
proc gethash(file: string): Future[string] {.async.} =
  ## Computes the SHA-256 sum of `file` on a pool thread and completes with
  ## the value `sha256sum` returned.
  let pending = spawn sha256sum(file)
  # The flow var is polled so the dispatcher keeps running other futures
  # instead of blocking on `^`.
  while true:
    if pending.isReady():
      break
    await sleepAsync(5)
  return ^pending
# ###
# Search
proc finddup(idx: int, info: FileInfo) {.async.} =
  ## Checks whether FILES[idx] duplicates an earlier file of the same size.
  ##
  ## The first file of a given size is only remembered. Hashing starts when a
  ## second file of that size turns up: both are hashed and the per-size hash
  ## table is created. A file whose hash is already in the table is passed to
  ## `action` together with the file that registered the hash first.
  if not SIZES.hasKey(info.size):
    # First file of this size
    SIZES[info.size] = FileSize(first: idx, hashes: nil)
    return

  # Size seen before
  let hash = await gethash(FILES[idx])

  if SIZES[info.size].hashes.isNil:
    # Hashes not initialized: the first file of this size was never hashed
    let first = SIZES[info.size].first
    let fhash = await gethash(FILES[first])
    # Re-check after the await: another finddup future for the same size may
    # have created the table in the meantime, and replacing it would drop the
    # hashes it already holds.
    if SIZES[info.size].hashes.isNil:
      SIZES[info.size].hashes = newTable[string, int]()
      SIZES[info.size].hashes[fhash] = first

  let seen = SIZES[info.size].hashes
  if seen.hasKey(hash):
    # Current hash seen before
    action(FILES[idx], info, FILES[seen[hash]])
  else:
    # Unique hash
    seen[hash] = idx
# Build the C fingerprint comparison code together with this module.
{.compile: "pg_acoustid/acoustid_compare.c".}
#~ proc match_fingerprints(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint): cfloat {.importc, cdecl, gcsafe.}
#~ proc match_fingerprints2(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
# Compares fingerprint `a` (asize items) with `b` (bsize items) and returns a
# score; callers treat a value above FINGERPRINT_MATCH_THRESHOLD as a match.
proc match_fingerprints3(a: ptr uint32, asize: cint,
                         b: ptr uint32, bsize: cint,
                         maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
proc fpcalc(file: string): Future[JsonNode] {.async, inline.} =
  ## Runs the external `fpcalc` tool on `file` (`-json -raw`) and completes
  ## with its parsed JSON output.
  ##
  ## Completes with nil, after printing "Bad fingerprint: <file>", when the
  ## output is not valid JSON.
  # ExeExt is "exe" on Windows and "" elsewhere, so the tool is found under
  # its platform name rather than only as "fpcalc.exe".
  let exe = addFileExt("fpcalc", ExeExt)
  let args = @["-json", "-raw", file]
  let data = await execProcess(exe, args=args, options={poUsePath})
  var jdata: JsonNode
  try:
    jdata = parseJson(data.output)
  except ValueError:
    # JsonParsingError derives from ValueError
    echo "Bad fingerprint: $#" % file
    return nil
  return jdata
proc acoustid_compare(idx, id: int): Future[float] {.async.} =
  ## Scores the fingerprints stored in SONGS[idx] and SONGS[id] against each
  ## other with `match_fingerprints3`, run on a pool thread.
  let
    lenA = SONGS[idx].len()
    lenB = SONGS[id].len()
  # The matcher receives copies in shared memory rather than pointers into
  # the seqs themselves.
  let bufA = createSharedU(uint32, lenA)
  let bufB = createSharedU(uint32, lenB)
  copyMem(bufA, addr SONGS[idx][0], lenA * sizeof(uint32))
  copyMem(bufB, addr SONGS[id][0], lenB * sizeof(uint32))
  let pending = spawn match_fingerprints3(bufA, cint(lenA), bufB, cint(lenB), FINGERPRINT_MATCH_OFFSET)
  # Poll so that other futures keep running while the comparison is busy
  while not pending.isReady():
    await sleepAsync(5)
  freeShared(bufA)
  freeShared(bufB)
  return ^pending
proc findmusicdup(idx: int, info: FileInfo) {.async.} =
  ## Checks whether FILES[idx] sounds like a song that was indexed earlier.
  ##
  ## The file is fingerprinted with `fpcalc`; every item is masked with
  ## FINGERPRINT_RELEVANT_BITS and stored in SONGS[idx]. Each earlier song
  ## sharing at least one masked item is compared once. The first comparison
  ## scoring above FINGERPRINT_MATCH_THRESHOLD reports the file through
  ## `action` and drops it from SONGS. If nothing matches, the file's items
  ## are added to AIDX so later files can be compared against it.
  let jdata = await fpcalc(FILES[idx])
  if jdata.isNil:
    return

  # Valid JSON is not necessarily a fingerprint result; without this check a
  # missing or non-array "fingerprint" member fails inside `items`.
  let raw = jdata{"fingerprint"}
  if raw.isNil or raw.kind != JArray:
    echo "Bad fingerprint: $#" % FILES[idx]
    return

  var song: seq[uint32] = @[]
  for item in raw.items:
    song.add(uint32(item.getNum()) and FINGERPRINT_RELEVANT_BITS)
  SONGS[idx] = song

  # Both passes below walk the same distinct items, so compute them once
  let unique = song.deduplicate()

  var compared: seq[int] = @[]
  for aidx in unique:
    if AIDX.hasKey(aidx):
      for id in AIDX[aidx]:
        # Don't compare same two files multiple times
        if not compared.contains(id):
          let match = await acoustid_compare(idx, id)
          if match > FINGERPRINT_MATCH_THRESHOLD:
            action(FILES[idx], info, FILES[id])
            SONGS.del(idx)
            return
          else:
            compared.add(id)

  # Not a duplicate, add to AIDX index for easy comparison
  for aidx in unique:
    if AIDX.hasKey(aidx):
      if not AIDX[aidx].contains(idx):
        AIDX[aidx].add(idx)
    else:
      AIDX[aidx] = @[idx]
proc findempty(dir: string, info: FileInfo) =
  ## Reports `dir` through `action` when it contains no entries at all.
  # walkDir lists every entry. A `*` glob does not match dot-files on POSIX,
  # so a directory holding only hidden entries would count as empty and
  # could be deleted by -x.
  var empty = true
  for entry in walkDir(dir):
    empty = false
    break
  if empty:
    action(dir, info)
# ###
# Scan
# Walks the entries of `dir`, applies the command-line filters to each one and
# hands the survivors to the selected search (-D, -M, -E) or, when no search
# is selected, directly to action(). Sub-directories are descended into
# before their own filters are evaluated. FILE_COUNT and DIR_COUNT count
# every file / directory visited, whether or not it passes the filters.
proc recurse(dir: string) =
# Reference time and the -t / -T day counts as intervals, computed once per call
let now = getTime()
let after = initInterval(days=TIMEAFTER)
let before = initInterval(days=TIMEBEFORE)
# NOTE(review): presumably a `*` glob skips dot-files on POSIX, so hidden
# entries would never be visited - confirm on the target platform
for file in walkPattern(dir & DirSep & "*"):
var info: FileInfo
try:
info = getFileInfo(file)
except:
# Entries whose info cannot be read are skipped silently
continue
if info.kind == pcFile:
FILE_COUNT += 1
# Skip files
if DIRS_ONLY:
continue
elif info.kind == pcDir:
DIR_COUNT += 1
recurse(file)
# Skip directories
if FILES_ONLY:
continue
# Skip if doesn't match pattern
# Both name filters are matched against the base name plus extension only
let (_, name, ext) = splitFile(file)
if PATTERN != "":
if not (name & ext).contains(PATTERN):
continue
if REGEX != "":
# re(REGEX) is evaluated again for every entry that reaches this point
if not (name & ext).contains(re(REGEX)):
continue
# Skip if smaller than
if $ARGS["-s"] != "nil":
if info.size < FMINSIZE:
continue
# Skip if larger than
if $ARGS["-S"] != "nil":
if info.size > FMAXSIZE:
continue
# Skip if older than
if $ARGS["-t"] != "nil":
if info.lastWriteTime < now - after:
continue
# Skip if newer than
if $ARGS["-T"] != "nil":
if info.lastWriteTime > now - before:
continue
if info.kind == pcFile:
# Don't process file multiple times
# (linear scan: FILES is a seq)
if not FILES.contains(file):
FILES.add(file)
if ARGS["-D"]:
# FILES.len()-1 is the index of the path appended just above
asyncCheck finddup(FILES.len()-1, info)
elif ARGS["-M"]:
# Only extensions listed in FFFORMAT are fingerprinted
if file.splitFile().ext.toLowerAscii() in FFFORMAT:
asyncCheck findmusicdup(FILES.len()-1, info)
else:
# No search selected: every file passing the filters is a result
action(file, info)
elif info.kind == pcDir:
if ARGS["-E"]:
findempty(file, info)
else:
action(file, info)
# NOTE(review): the nesting of the statements below cannot be told from this
# copy of the file - confirm whether they run once per entry (inside the
# loop) or once per directory (after it).
# sync() waits for the spawned tasks; runForever() is presumably relied on to
# raise once the dispatcher has no pending work, which ends the drain - TODO
# confirm against the asyncdispatch version in use.
sync()
try:
runForever()
except:
discard
if not ARGS["-q"]:
# The status line ends in \r rather than a newline
stdout.write("$# matches: $# MB / $# dirs, $# files\r" % [$MATCH_COUNT, formatFloat(float(ADD)/1024/1024, ffDecimal, 2), $DIR_COUNT, $FILE_COUNT])
stdout.flushFile()
# ###
# Main
# Start the scan at the directory given on the command line
recurse(SOURCEDIR)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment