Last active
November 30, 2021 09:14
-
-
Save avioli/802b35716c5503d5a306daf38bc9ec53 to your computer and use it in GitHub Desktop.
Find duplicate photos in my massive photos library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash
set -e

# Directory to scan. Defaults to the photo share used so far; a different
# directory can be passed as the first argument.
photo_dir="${1:-/volume1/photo/}"
cd "$photo_dir"

echo "scanning the photo dir - computing hashes"
# Scan the photo dir.
#
# My collection is on a Synology NAS so it has an @eaDir that holds
# some of their computed files - thumbnails and such.
#
# I've used Rumpus FTP before, which was generating the Rumpus_Thumbs dir.
#
# find hands the file names straight to md5sum, so names with backslashes or
# leading/trailing blanks are not altered by a shell `read`. tee (without -a)
# truncates hashes.md5 first, so re-running the script does not append to the
# previous run's results and report every photo as a duplicate of itself.
find . -type f \
  -not -path "*/@eaDir/*" -and -not -path "*/Rumpus_Thumbs/*" \
  -and \( -iname "*.jpg" -or -iname "*.jpeg" \) \
  -exec md5sum {} + | tee hashes.md5
echo "sorting hashes"
# Sort by hash so that identical files end up on adjacent lines.
# The output name has to be hashes_sorted.md5: that is the file the
# duplicate-picking awk step reads.
sort hashes.md5 > hashes_sorted.md5
echo "picking the duplicates"
# Pick the duplicates.
#
# The input is sorted, so lines with the same hash (column 1) are adjacent.
# Every line whose hash equals the previous line's hash belongs to a group of
# identical files. The group is printed as it is discovered (first member
# once, then each further member), so a group at the very end of the file is
# reported too and the output keeps the sorted order.
awk_script='
NR > 1 && $1 == hash {
    # Same hash as the previous line: emit the first line of the group
    # once, then the current line.
    if (!emitted) {
        print first
        emitted = 1
    }
    print
    next
}
{
    # A new hash starts here; remember the line in case a duplicate follows.
    hash = $1
    first = $0
    emitted = 0
}
'
# Run the above script on our sorted hashes
awk "$awk_script" hashes_sorted.md5 > hashes_duplicates.md5
# Page used to review the duplicates: a CSS reset plus a few layout rules,
# an empty #app container, and a React app written in JSX. React, ReactDOM
# and Babel are loaded from public CDNs, and the JSX lives in a
# <script type="text/babel"> element. The heredoc delimiter is quoted
# ('TPL') so the shell does not expand anything inside the template.
html_template=$(cat <<'TPL' | |
<!DOCTYPE html><html><head><meta charset="utf-8"><title>Find duplicate photos</title> | |
<style> | |
html, body, div, span, applet, object, iframe, | |
h1, h2, h3, h4, h5, h6, p, blockquote, pre, | |
a, abbr, acronym, address, big, cite, code, | |
del, dfn, em, img, ins, kbd, q, s, samp, | |
small, strike, strong, sub, sup, tt, var, | |
b, u, i, center, | |
dl, dt, dd, ol, ul, li, | |
fieldset, form, label, legend, | |
table, caption, tbody, tfoot, thead, tr, th, td, | |
article, aside, canvas, details, embed, | |
figure, figcaption, footer, header, hgroup, | |
menu, nav, output, ruby, section, summary, | |
time, mark, audio, video { | |
margin: 0; | |
padding: 0; | |
border: 0; | |
font-size: 100%; | |
font: inherit; | |
vertical-align: baseline; | |
} | |
/* HTML5 display-role reset for older browsers */ | |
article, aside, details, figcaption, figure, | |
footer, header, hgroup, menu, nav, section { | |
display: block; | |
} | |
body { | |
line-height: 1; | |
} | |
ol, ul { | |
list-style: none; | |
} | |
blockquote, q { | |
quotes: none; | |
} | |
blockquote:before, blockquote:after, | |
q:before, q:after { | |
content: ""; | |
content: none; | |
} | |
table { | |
border-collapse: collapse; | |
border-spacing: 0; | |
} | |
@font-face { | |
font-family: system; | |
font-style: normal; | |
font-weight: 300; | |
src: local(".SFNSText-Light"), local(".HelveticaNeueDeskInterface-Light"), local(".LucidaGrandeUI"), local("Ubuntu Light"), local("Segoe UI Light"), local("Roboto-Light"), local("DroidSans"), local("Tahoma"); | |
} | |
body { | |
font-family: "system"; | |
padding: 1em; | |
} | |
h2 { font-size: 1.3em; font-weight: bold; } | |
.app--content { padding-top: 2em; } | |
.finder { display: flex; flex-direction: column; min-height: 400px; } | |
.finder--columns { display: flex; flex: 1 0 100%; } | |
.finder--stack { background-color: #efefef; border-right: 1px solid #ddd; } | |
.dir-item a { display: block; padding: .2em .5em; text-decoration: none; } | |
.dir-item--selected { color: white; background-color: #3267dd; } | |
.dir-item--selected a { color: inherit; } | |
</style> | |
</head><body> | |
<div id="app"></div> | |
<script src="https://unpkg.com/react@16/umd/react.development.js"></script> | |
<script src="https://unpkg.com/react-dom@16/umd/react-dom.development.js"></script> | |
<script src="https://cdnjs.cloudflare.com/ajax/libs/babel-core/5.8.38/browser.min.js"></script> | |
<script type="text/babel"> | |
// Prefix put in front of every file path to form an image URL.
const imagesHost = "http://10.0.0.1:3333/";
// The state hook is the only React API the components below call directly.
const useState = React.useState;
function ImageSelection(props) { | |
const { files } = props; | |
return ( | |
<div> | |
<ul> | |
{files.map((file, idx) => { | |
return ( | |
<li key={idx}> | |
<img src={imagesHost + file} width="640" /> | |
</li> | |
); | |
})} | |
</ul> | |
</div> | |
); | |
} | |
function ImagesPicker(props) { | |
const [selecting, setSelecting] = useState(false); | |
const { files } = props; | |
if (selecting) { | |
return <ImageSelection files={props.files} /> | |
} else { | |
return <div onClick={() => setSelecting(true)}> | |
<img src={imagesHost + files[0]} width="150" /> | |
</div> | |
} | |
} | |
function Images(props) { | |
const { map, hashes } = props.data; | |
const [offset, setOffset] = useState(0); | |
const [count, setCount] = useState(10); | |
const images = hashes.slice(offset, offset + count); | |
const prevPage = () => { | |
const newOffset = offset - count; | |
if (newOffset < 0) return; | |
setOffset(Math.max(newOffset, 0)); | |
}; | |
const nextPage = () => { | |
const newOffset = offset + count; | |
if (newOffset > map.size - 1) return; | |
setOffset(Math.min(newOffset, map.size - 1)); | |
}; | |
return ( | |
<div className="images"> | |
<ul> | |
{images.map((hash) => { | |
const files = map.get(hash); | |
return ( | |
<li key={hash}> | |
<ImagesPicker hash={hash} files={files} /> | |
</li> | |
); | |
})} | |
</ul> | |
<div> | |
<button type="button" onClick={prevPage}>Prev</button> | |
<button type="button" onClick={nextPage}>Next</button> | |
</div> | |
</div> | |
); | |
} | |
function Dir(props) { | |
const { dir, onDir, selected = {}, onFile } = props; | |
const dirs = Array.from(dir.__dirs || []); | |
const files = Array.from(dir.__files || []); | |
const dirItem = (name, idx, onClick) => { | |
const key = "#" + name; | |
const isSelected = name === selected.name; | |
return ( | |
<li key={key} className={["dir-item", isSelected && "dir-item--selected"].filter(Boolean).join(" ")}> | |
<a href={key} onClick={onClick}>{isSelected ? ">" : ""}{name}</a> | |
</li> | |
); | |
}; | |
return ( | |
<div className="dir"> | |
<ul> | |
{dirs.map((name, idx) => dirItem(name, idx, (event) => onDir(event, dir[name], name)))} | |
{files.map(([name, hash], idx) => dirItem(name, idx, (event) => onFile(event, files[idx])))} | |
</ul> | |
</div> | |
); | |
} | |
function Finder(props) { | |
const { dirs, map } = props.data; | |
const [dirStack, setStack] = useState([{ name: "/", dir: dirs, path: "/", width: 150 }]); | |
const setCwd = (dir, name, idx) => { | |
const newStack = dirStack.slice(0, idx + 1); | |
const cwd = dirStack[idx] || {}; | |
if (dir !== cwd.dir) { | |
const { path, width } = cwd; | |
const newPath = path + (idx == 0 ? "" : "/") + name; | |
newStack[idx + 1] = { name, dir, path: newPath, width }; | |
} | |
setStack(newStack); | |
}; | |
const showPreview = (file, idx) => { | |
const { path } = dirStack[idx] || {}; | |
const [name, hash] = file; | |
const filePath = path + "/" + name; | |
const duplicates = map.get(hash); | |
const i = duplicates.findIndex((f) => f.endsWith(filePath)); | |
const newStack = dirStack.slice(0, idx + 1); | |
newStack[idx + 1] = { name, hash, filePath, path, duplicates, index: i, width: 150 }; | |
setStack(newStack); | |
}; | |
return ( | |
<div className="finder"> | |
<div className="finder--columns"> | |
{dirStack.map((stack, idx) => { | |
const { dir, filePath, width } = stack; | |
const nextStack = dirStack[idx + 1] || {}; | |
const { dir: subDir } = nextStack; | |
if (filePath) { | |
return ( | |
<div className="finder--stack" key={idx} style={{minWidth: width}}> | |
<ImagesPicker hash={stack.hash} files={stack.duplicates} /> | |
</div> | |
); | |
} else { | |
return ( | |
<div className="finder--stack" key={idx} style={{minWidth: width}} onClick={() => setCwd(dir, "", idx)}> | |
<Dir | |
dir={dir} | |
onDir={(event, subDir, name) => { | |
event.stopPropagation(); | |
event.preventDefault(); | |
setCwd(subDir, name, idx); | |
}} | |
onFile={(event, file) => { | |
event.stopPropagation(); | |
event.preventDefault(); | |
showPreview(file, idx); | |
}} | |
selected={nextStack} | |
/> | |
</div> | |
); | |
} | |
})} | |
</div> | |
<div>{dirStack[dirStack.length - 1].path}</div> | |
</div> | |
); | |
} | |
function App(props) { | |
const { hashes, filesCount } = props.data; | |
const [mode, setMode] = useState("dirs"); | |
return ( | |
<div className="app"> | |
<div className="app--stats"> | |
<h2>Stats</h2> | |
<ul> | |
<li>Found duplicates: {hashes.length}</li> | |
<li>Total files: {filesCount}</li> | |
</ul> | |
</div> | |
<div className="app--content"> | |
<div className="app--tabs"> | |
<button type="button" onClick={() => setMode("dirs")}>Dirs</button> | |
<button type="button" onClick={() => setMode("images")}>Images</button> | |
</div> | |
<div className="app--tab-content"> | |
{mode == "dirs" ? <Finder data={data} /> : <Images data={data} />} | |
</div> | |
</div> | |
</div> | |
); | |
} | |
ReactDOM.render( | |
<App data={data} />, | |
document.getElementById("app") | |
); | |
</script> | |
</body> | |
</html> | |
TPL | |
) | |
# First half of the data <script>. It is an immediately-invoked function whose
# argument list is left open ("})([") so that the [file, hash] pairs and the
# closing "]);</script>" can be appended after it. The function builds
# window.data = { map, dirs, hashes, filesCount }:
#   map        - Map from hash to the array of files with that hash
#   dirs       - nested plain objects mirroring the directory tree; each node
#                keeps sub-directory names in __dirs (a Set) and its files in
#                __files (an array of [filename, hash])
#   hashes     - the keys of map, as an array
#   filesCount - number of entries received
data_script_top=$(cat <<'TPL'
<script type="text/javascript">
(function(entries) {
  const map = new Map();
  const dirs = {};
  const dirsPool = {};
  for (const [file, hash] of entries) {
    const list = map.get(hash);
    if (list) {
      list.push(file);
    } else {
      map.set(hash, [file]);
    }
    // create the dir struct
    const parts = file.split("/");
    parts.shift(); // remove "./"
    const filename = parts.pop();
    let parent = dirs;
    for (let i = 0; i < parts.length; i++) {
      const path = parts.slice(0, i + 1).join("/");
      const dir = parts[i];
      parent[dir] = dirsPool[path] = dirsPool[path] || {};
      const _dirs = parent.__dirs = parent.__dirs || new Set();
      // The negation has to wrap the whole instanceof test, otherwise the check can never fire.
      if (!(_dirs instanceof Set)) throw new Error("FIXME: there is a __dirs dir!!!");
      _dirs.add(dir);
      parent = parent[dir];
    }
    // After the loop, parent is the directory holding the file. For a file
    // directly in the scanned root that is the root node itself.
    const _files = parent.__files = parent.__files || [];
    if (!Array.isArray(_files)) throw new Error("FIXME: there is a __files dir!!!");
    _files.push([filename, hash]);
  }
  const hashes = Array.from(map.keys());
  window.data = {
    map,
    dirs,
    hashes,
    filesCount: entries.length,
  };
})([
TPL
)
# Writes the complete review page to stdout: the HTML template, the opening
# half of the data script, one ["file", "hash"], line per entry of
# hashes_duplicates.md5, and the closing of the data script.
#
# Reads:  $html_template, $data_script_top, ./hashes_duplicates.md5
function generate_html() {
  echo "$html_template"
  echo "$data_script_top"
  # Each input line is the hash in column 1 with the path starting at
  # character 35. The path is emitted inside a double-quoted JS string, so
  # backslashes and double quotes in file names get a backslash in front of
  # them; unescaped they would break the generated script.
  awk '{
    file = substr($0, 35)
    gsub(/[\\"]/, "\\\\&", file)
    print("[\"" file "\", \"" $1 "\"],")
  }' hashes_duplicates.md5
  echo "]);</script>"
}
echo "generating html"
generate_html > hashes_check.html

# Serve the current directory over HTTP in the background so the report and
# the photos it links to can be opened in a browser.
python3 -m http.server 3333 &
PID=$!
# Make sure the server is stopped however the script ends. The script runs
# under `set -e`, so a failing `read` below (for example when stdin is
# closed) would otherwise exit before `kill` and leave the server running.
trap 'kill "$PID" 2>/dev/null || true' EXIT
echo " server pid: $PID"
echo " open http://10.0.0.1:3333/hashes_check.html"
read -r -p "Press Enter to kill the server"
kill "$PID"
# The server has been stopped; the safety net is no longer needed.
trap - EXIT
echo "done"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So far the script computes the md5 sum of each JPEG file in `/volume1/photo`, then sorts the hashes - sorting groups identical files together, since each line is in the format "md5hash ./path/to/file". Then I use an AWK script to walk the sorted file line by line; if the value in column $1 is the same as on the previous line, it remembers the line. At the end it outputs only the duplicates it found.
Following is an HTML template with a very basic React app, which is then written into `hashes_check.html` together with the found duplicates in a specific format - `[["file", "hash"], ...]` - so it can create a `Map` and then a plain object of directories.
At the end it uses Python 3 to start a simple HTTP server on port `3333` so the `hashes_check.html` file can be opened.
My test file ended up at 25MB, so it takes a while to open/transfer it. Maybe I should just generate a JSON file and load it from within the app with `fetch`.
Next up is adding checkboxes and some way of persisting the selected files, then generating a shell script that will remove/move the files so they can be deleted.