Last active
March 18, 2023 19:09
-
-
Save jhubble/88c6118212f5dfd06d5f80d1babea978 to your computer and use it in GitHub Desktop.
find duplicate songs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compare two mediainfo json files generated with mediainfo . --Output=JSON
// run with node findDupMusic.js source.json possibleDups.json | |
// Will output files in possibleDups that are in source | |
// add trailing -delete option to delete these duplicate files | |
// A trailing -force option after delete will delete even if the source file is not present
// All options are super dumb and dependent on place | |
const fs = require("fs"); | |
const getOptions = (args) => { | |
const options = {}; | |
let opt = ''; | |
for (let i=2;i<args.length;i++) { | |
// -optes are boolean | |
if (args[i].startsWith('-')) { | |
opt = args[i]; | |
if (opt === '-match') { | |
options[opt] = []; | |
} | |
else { | |
options[opt] = true; | |
} | |
} | |
// assume a value without a opt is the value for the previous opt item | |
else if (opt) { | |
if (opt === '-match') { | |
options[opt].push(args[i]); | |
} | |
else { | |
options[opt] = args[i]; | |
} | |
} | |
} | |
return options; | |
} | |
// Numeric log severities, ordered from most critical (FATAL=0) to least
// (ALL=6). log() prints a message only when its level is <= the configured
// verbosity, so larger -verbose values show more output.
// Frozen so the severity table cannot be mutated at runtime.
const LOG_LEVEL = Object.freeze({
  FATAL: 0,
  ERROR: 1,
  WARN: 2,
  INFO: 3,
  DEBUG: 4,
  TRACE: 5,
  ALL: 6
});
// Leveled logger. Messages at or below the module-level `verbosity` go to
// stdout; WARN and more severe additionally go to stderr so they survive
// stdout redirection.
const log = (level, ...message) => {
  // suppress anything above the configured verbosity
  if (level > verbosity) {
    return;
  }
  console.log(...message);
  if (level <= LOG_LEVEL.WARN) {
    console.error(...message);
  }
}
// Print CLI usage and the full option reference to stdout.
// Called for -h and when the required -source/-dups arguments are missing.
const showHelp = () => {
  console.log(`USAGE:
Before running, use mediainfo command to get json files of the source and target directories:
mediainfo sourceDir --Output=JSON > source.json
mediainfo duplicateDir --Output=JSON > dups.json
Then run to clean up duplicate songs
node findDupMusic.js -source source.json -dups target.json -delete
OPTIONS:
-source file.json mediainfo file of source
-dups file.json mediainfo file of duplicates
-delete delete files (without, just warns of deletes)
-verbose 2 set verbosity. 0 is least verbose, 6 is most; default is 3
-clean clean up files that are "bad", but not matches ("poor quality" tags) (need -force and -delete to delete)
-force force delete, even if source file not present
-takeout treat dups as re-encoded google music takeouts with high bitrate, low quality. also those with 160 bit rate mp3 where source is aac
-takeout nuke delete any dup items with matching key regardless of other checks (dangerous)
-encode compare by encode date [can be risky]
-size compare by size [can be risky]
-various treat "various artists" in dup as any artist in source
-littleworse remove dups even if they have bit rate up to 5000 worse than source
-noparens remove anything in parenthesis when generating album/title/artist key
-small if duration is <30 seconds in dup, and greater in source, delete dup
-dashtitles assume a dash in title name delimites an artist and filter it out
-track match by artist, album and track number (instead of track title)
-h show this help
-match VAL MORE Delete any files where key matches value. Must be at least 8 characters
(key is ARTIST-ALBUM-SONG string in all uppercase, with articles
and non--alphanumeric stripped and cut to 18 characters each,
with dashes separating in each item.
Can specify multiple (need -delete and -force to delete)
-notitle don't match by title (default is to always match)
`);
}
const options = getOptions(process.argv);
// Handle -h first so help works even when -source/-dups are absent
// (previously the missing-args error fired before -h was ever checked).
if (options['-h']) {
  showHelp();
  process.exit();
}
// BOTH a source and a duplicates file are required.
// (was `&&`, which let a single missing file pass the check and crash later
// inside indexFiles(undefined))
if (!options['-source'] || !options['-dups']) {
  console.error("Minimum usage: node findDupMusic.js -source fileWithSource.json -dups fileWithDups.json");
  showHelp();
  process.exit();
}
const fileWithSource = options['-source'];
const fileWithDups = options['-dups'];
// Paths already deleted (or dry-run "deleted") this run, so no file is
// ever unlinked twice.
const deletedFiles = {};
// Default verbosity is INFO (3); `-verbose N` overrides it. A bare
// `-verbose` flag (boolean true) keeps the default.
// (was checking `options.verbosity`, a key that never exists)
const verbosity = (options.hasOwnProperty('-verbose') && options['-verbose'] !== true) ? options['-verbose']-0 : 3;
console.log("VERBOSITY:",verbosity);
let deleteCount = 0;
let attemptCount = 0;
// mediainfo . --Output=JSON >allMediaFiles.json
// Pull the "General" track (the container-level tag block) out of one
// mediainfo file entry, stamp it with the file's path, and return it.
const getMeta = (file) => {
  const generalTrack = file.media.track.find((track) => track["@type"] === "General");
  generalTrack.filename = file.media["@ref"];
  return generalTrack;
};
// One-line, tab-separated summary of a track's interesting tags, used in
// UNIQUE/NOSOURCE/sampler log lines.
// Fixed field-name typos: `Encoded_Application` and `Grouping` (were
// `Encoded_application` / `Groupin` — names the rest of the file and the
// mediainfo JSON never produce, so both always printed empty/undefined).
const getTitle = (item) => {
  return `\t${item.Performer}\t${item.Album}\t${item.Title}\t${item.Genre}\t${item.OverallBitRate}\t${item.Duration}\t${item.Format}\t${item.Encoded_Application ||''}\t${item.Encoded_Date ||''}\t${item.Grouping}\t`;
}
// Verbose side-by-side comparison of source (S:) vs duplicate (D:) tags,
// appended to DUP log lines: bit rate, format, duration (with delta),
// encoder app/library/date, file size, album, and both file paths.
const getDetails = (src, dup) => {
  return `details<\t BR: S:${src.OverallBitRate} D:${dup.OverallBitRate}\t FMT: S:${src.Format} D:${dup.Format}\t DUR: S:${src.Duration}, D:${dup.Duration} (${(src.Duration-dup.Duration).toFixed(2)})\t ENC: S:${src.Encoded_Application || ''};${src.Encoded_library||''};${src.Encoded_Date || ''}, D: ${dup.Encoded_Application || ''};${dup.Encoded_library||''};${dup.Encoded_Date ||''}\t SIZE: S:${src.FileSize}, D:${dup.FileSize}\t Album: S:${src.Album},D:${dup.Album}\t File: S:${src.filename},D:${dup.filename} >`;
}
// Normalize one tag value (artist / album / title) for key building:
// drop articles ("the", "a", "and"), normalize common spelling variants,
// uppercase, strip everything non-alphanumeric and cut to 18 characters.
// Non-string or falsy input is returned untouched.
const cleanTxt = (txt) => {
  // don't try to process empty or null (or 0 or false...)
  if (!txt || (typeof txt !== 'string')) {
    return txt;
  }
  let newTxt = txt;
  // optionally drop parenthesized qualifiers like "(live)" / "(remaster)"
  if (options['-noparens']) {
    newTxt = newTxt.replace(/\([^\)]*\)/g,'');
  }
  // handle some mangling of special characters
  // (UTF-8 "ř" mis-read as latin-1 shows up as this two-char sequence)
  newTxt = newTxt.replace(/Å™/g,'r');
  // Doctor to dr
  newTxt = newTxt.replace(/\bdoctor\b/ig,'dr');
  // No. to # — dot now escaped; the original /\bNo./ matched ANY character
  // after "No" and mangled words like "Nothing" -> "#hing"
  newTxt = newTxt.replace(/\bNo\./ig, '#');
  newTxt = newTxt.replace(/\bthe\b/ig, '');
  newTxt = newTxt.replace(/\.mp3$/,'');
  newTxt = newTxt.replace(/\band\b/ig, '');
  newTxt = newTxt.replace(/\ba\b/ig, '');
  newTxt = newTxt.replace(/\bcd\b/ig, 'disc');
  newTxt = newTxt.toUpperCase();
  newTxt = newTxt.replace(/\bSIXTEEN\b/g,'16');
  // strip ALL bracketed qualifiers (g flag added; only the first [..] was
  // removed before)
  newTxt = newTxt.replace(/\[[^\]]+\]/g,'');
  newTxt = newTxt.replace(/[\W_]/g, '');
  newTxt = newTxt.substring(0, 18);
  // If we do strip everything, just return the original
  if (!newTxt) {
    return txt;
  }
  return newTxt;
}
// Build the canonical ARTIST-ALBUM-TITLE lookup key from cleaned tag text.
// A trailing "DISC<n>" on the album is stripped so multi-disc sets collapse
// into one album key. Guards against cleanTxt returning a non-string
// (cleanTxt passes null/0/non-strings straight through), which previously
// made `.replace` throw for a null Album tag.
const getKey = (Performer = '', Album = '', Title = '') => {
  const cleanedAlbum = cleanTxt(Album);
  const albumPart = (typeof cleanedAlbum === 'string') ? cleanedAlbum.replace(/DISC\d+$/,'') : '';
  const key = (cleanTxt(Performer) + "-" + albumPart + "-" + cleanTxt(Title));
  return key;
}
// Build the lookup key for one track, back-filling missing Performer /
// Album / Title tags from the file path (mutates `meta` in place).
// Expected path layout: .../<album dir>/<artist> - <title>.<ext>
// Returns the key string, or null if extraction fails.
const keyFromFileName = (meta) => {
  try {
    const filename = meta.filename;
    // Only attempt path parsing for real media files (those mediainfo
    // reported a bit rate for).
    if (meta.OverallBitRate) {
      const parts = filename.split('/');
      const file = parts.pop().replace(/\.[^.]*$/,''); // basename, extension stripped
      const album = parts.pop();                        // containing directory
      let [artist,...trackBits] = file.split('-');
      let track = trackBits.join('-');                  // re-join titles that contain dashes
      artist = artist.trim();
      track = track.trim();
      // no dash at all: the whole basename is the title, not the artist
      if (!track) {
        track = artist;
      }
      meta.Performer = meta.Performer || artist;
      meta.Album = meta.Album || album;
      meta.Title = meta.Title || track;
      // clean up artist/title titles: prefer the path-derived title when the
      // tagged title contains a dash but the filename's title part does not
      if (options['-dashtitles']) {
        if (/-/.test(meta.Title) && !/-/.test(track)) {
          meta.Title = track;
        }
      }
    }
    log(LOG_LEVEL.DEBUG,`Artist: ${meta.Performer}, Album: ${meta.Album}, Track: ${meta.Title}, File: ${meta.filename}`);
    return getKey(meta.Performer, meta.Album, meta.Title);
  }
  catch (e) {
    // use meta.filename here — the original referenced the try-scoped
    // `filename`, which is out of scope in the catch and itself threw a
    // ReferenceError whenever extraction failed
    log(LOG_LEVEL.DEBUG,"Unable to extract tracks from filename",meta.filename);
  }
  return null;
}
// Read one mediainfo JSON dump and build lookup indexes over its tracks:
//   byTitle       ARTIST-ALBUM-TITLE key (see getKey/cleanTxt)
//   noAlbum       artist + title only
//   noArtist      album + title only
//   noTrack       artist + album + track number (Track_Position)
//   bySize        "<OverallBitRate>-<Duration>"
//   byEncodedDate Encoded_Date tag
// Each index maps a key to an ARRAY of "General" track metadata objects.
// A bad entry only skips that one file (logged at DEBUG); it cannot abort
// the whole indexing pass.
const indexFiles = (filename) => {
  log(LOG_LEVEL.INFO,"FILE>>", filename);
  const index = {byTitle:{}, bySize:{}, byEncodedDate:{}, noAlbum:{}, noArtist:{}, noTrack:{}};
  const mediacontents = JSON.parse(fs.readFileSync(filename, "utf8"));
  mediacontents.forEach((file) => {
    try {
      const meta = getMeta(file);
      let {
        filename, Track = "", Title = "", Album = "", Performer = "", Duration, OverallBitRate
      } = meta;
      const sizeKey = `${OverallBitRate}-${Duration}`;
      if (Track !== Title) {
        log(LOG_LEVEL.DEBUG,"different track title:", Track, Title);
      }
      // Will try to fill in any missing parts of key
      // (keyFromFileName also back-fills meta.Performer/Album/Title from the path)
      let key = keyFromFileName(meta);
      log(LOG_LEVEL.TRACE,key, OverallBitRate, filename);
      // '--' is the key produced when artist, album and title are all empty
      if (key !== '--') {
        if (!index.byTitle[key]) {
          index.byTitle[key] = [];
        }
        index.byTitle[key].push(meta);
        // without album
        const noAlbumKey = getKey(meta.Performer,'',meta.Title);
        if (!index.noAlbum[noAlbumKey]) {
          index.noAlbum[noAlbumKey] = [];
        }
        index.noAlbum[noAlbumKey].push(meta);
        // without artist
        const noArtistKey = getKey('',meta.Album,meta.Title);
        if (!index.noArtist[noArtistKey]) {
          index.noArtist[noArtistKey] = [];
        }
        index.noArtist[noArtistKey].push(meta);
        // with track number instead of track
        const trackKey = getKey(meta.Performer, meta.Album, meta.Track_Position);
        if (!index.noTrack[trackKey]) {
          index.noTrack[trackKey] = [];
        }
        index.noTrack[trackKey].push(meta);
      }
      else {
        // completely untagged file: only indexed by size / encode date below
        log(LOG_LEVEL.DEBUG,"Still no tags",filename,getTitle(meta));
        if (meta.FileSize <10000) {
          log(LOG_LEVEL.DEBUG,"SMALL_FILE:",filename,meta.FileSize);
        }
      }
      if (OverallBitRate && Duration) {
        if (!index.bySize[sizeKey]) {
          index.bySize[sizeKey] = [];
        }
        index.bySize[sizeKey].push(meta);
      }
      if (meta.Encoded_Date) {
        if (!index.byEncodedDate[meta.Encoded_Date]) {
          index.byEncodedDate[meta.Encoded_Date] = [];
        }
        index.byEncodedDate[meta.Encoded_Date].push(meta);
      }
    } catch (e) {
      log(LOG_LEVEL.DEBUG,e, file);
    }
  });
  log(LOG_LEVEL.INFO,"INDEX LENGTH:", Object.keys(index.byTitle).length, Object.keys(index.bySize).length, Object.keys(index.byEncodedDate).length);
  return index;
};
// Build both indexes up front; every comparison pass below reads from these.
// (indexFiles takes only the filename — the second argument is ignored;
// verbosity is the module-level global consumed by log().)
const sourceIndex = indexFiles(fileWithSource, verbosity);
const dupIndex = indexFiles(fileWithDups, verbosity);
log(LOG_LEVEL.INFO,Object.keys(sourceIndex.byTitle).length, Object.keys(dupIndex.byTitle).length);
// delete a target (dup) file that was judged identical to the source.
// Sanity checks: never delete a file twice, and never delete a file that
// matched against itself. Only deletes when the source file actually exists
// on disk, unless -force (or -clean) is set.
// Without -delete this is a dry run: nothing is unlinked, but the file is
// still recorded in deletedFiles and counted in attemptCount.
// NOTE(review): with -clean set, the source-must-exist guard is bypassed for
// ALL deletions, not only the clean-up ones — confirm this is intended.
const deleteItem = (dupMeta, sourceMeta, songKey, reason='') => {
  const sizeDiff = sourceMeta.FileSize - dupMeta.FileSize;
  if(deletedFiles[dupMeta.filename]) {
    log(LOG_LEVEL.INFO,`${dupMeta.filename} already deleted, not deleting`);
    return;
  }
  if (sourceMeta.filename === dupMeta.filename) {
    log(LOG_LEVEL.WARN,"SAME: source and destination are the same, not deleting", sourceMeta.filename, dupMeta.filename);
    return;
  } else {
    log(LOG_LEVEL.INFO,`DUP: [${reason}]<${songKey}> ${sizeDiff} : ${getDetails(sourceMeta, dupMeta)}`);
  }
  // Perform (or dry-run) the actual unlink and record the attempt.
  const doDelete = (filename) => {
    attemptCount++;
    if (options['-delete']) {
      fs.unlinkSync(filename);
      deleteCount++;
    }
    else {
      log(LOG_LEVEL.WARN,"--- (not deleting since flag not set)");
    }
    // mark even on dry run so repeated matches are reported only once
    deletedFiles[filename] = 1;
  }
  try {
    if (fs.existsSync(sourceMeta.filename)) {
      log(LOG_LEVEL.WARN,`Deleting:\t${dupMeta.filename}\tSource:\t${sourceMeta.filename}`);
      doDelete(dupMeta.filename);
    } else if (options['-force']) {
      log(LOG_LEVEL.WARN,"forcing deletion even though source does not exist");
      doDelete(dupMeta.filename);
    } else if (options['-clean']) {
      log(LOG_LEVEL.WARN,"cleaning unwanted mp3s even though source does not exist");
      doDelete(dupMeta.filename);
    } else {
      log(LOG_LEVEL.WARN,"source does not exist, not deleting");
    }
  } catch(e) {
    log(LOG_LEVEL.ERROR,"error deleting", e);
  }
}
// Iterate through the duplicate index by key (artist-album-title by default).
// For each dup file, compare against every source file under the same key;
// the first rule that fires wins (later sources are skipped once a dup is found):
// 1. If bit rate and duration are identical (and duration > 0, or both
//    formats are the same "Audio" format):
//    a. identical file size                      -> dup
//    b. source up to 15000 bytes bigger          -> dup
//    c. sizes within 3% of the source size       -> dup
//    d. -takeout and source bigger at all        -> dup
//    e. otherwise                                -> not a dup
// 2. Else if durations are within 6s and within 3% of the source duration,
//    or the dup is up to 10s shorter:
//    a. source bit rate >= dup                   -> dup
//    b. -takeout and dup is a 160k MP3           -> dup (google music re-encode)
//    c. dup is an MP3 at most 10k better than an MPEG-4 (AAC) source -> dup
//    d. -littleworse and dup at most 5k better   -> dup
//    e. otherwise                                -> not a dup
// 3. Else -small: dup <= 30s while source is longer            -> dup
// 4. Else -small with -takeout: dup shorter, dup bit rate no more than
//    30k better than source                                    -> dup
// 5. Else -takeout nuke: any key match                         -> dup (dangerous)
const findDupByTitle = (subIndex = 'byTitle', keyCheck=null, keySubIndex) => {
  // A filter is done on the "keySubIndex", then the filtered items are used on subindex.
  // Thus we may filter on all artists equal to "various artists", but do actual work on index without artist name
  log(LOG_LEVEL.INFO,"\n...Finding dups by titile: subindex:",subIndex, "key filter:",keyCheck );
  let filteredDupList =[];
  if (keyCheck && keySubIndex) {
    const keyRegex = new RegExp('\\b('+keyCheck+')\\b');
    log(LOG_LEVEL.INFO,"Filtering on ",keyCheck);
    Object.keys(dupIndex[keySubIndex]).forEach(k => {
      if (keyRegex.test(k)) {
        // strip the matched token so the remainder lines up with subIndex keys
        k = k.replace(keyRegex,'');
        if (dupIndex[subIndex][k]) {
          filteredDupList.push(k);
        }
        else {
          log(LOG_LEVEL.TRACE,`${k} not found in ${subIndex}`);
        }
      }
    });
  }
  else {
    filteredDupList = Object.keys(dupIndex[subIndex]);
  }
  filteredDupList.forEach(songKey => {
    if (sourceIndex[subIndex][songKey]) {
      dupIndex[subIndex][songKey].forEach(dupMeta => {
        let dups = 0;
        const sourceSongs = sourceIndex[subIndex][songKey];
        let reason ='';
        sourceSongs.forEach(sourceMeta => {
          // don't compare to self
          if (sourceMeta.filename === dupMeta.filename) {
            return;
          }
          const sizeDiff = sourceMeta.FileSize - dupMeta.FileSize;
          const durationDiff = sourceMeta.Duration - dupMeta.Duration;
          if (!dups) {
            // 1. Same bitrate and duration with similar file size
            if (
              sourceMeta.OverallBitRate === dupMeta.OverallBitRate &&
              sourceMeta.Duration == dupMeta.Duration &&
              (sourceMeta.Duration > 0 || ((dupMeta.Format === sourceMeta.Format) && dupMeta.Format && dupMeta.Format.indexOf('Audio') !== -1 ))
            ) {
              if (sourceMeta.FileSize === dupMeta.FileSize) {
                log(LOG_LEVEL.DEBUG,">>>EQUAL SIZE FILES");
                deleteItem(dupMeta, sourceMeta, songKey, 'BITDUR_EQUAL'+subIndex);
                dups++;
              } else if ((sizeDiff < 15000) && (sizeDiff > 0)) {
                log(LOG_LEVEL.DEBUG,">>>less than 15000 more");
                deleteItem(dupMeta, sourceMeta, songKey, 'BITDUR_SRC_BIGGER'+subIndex);
                dups++;
              // NOTE(review): the threshold is 3% but this log line says 2%
              } else if ((Math.abs(sizeDiff/sourceMeta.FileSize) < .03)) {
                log(LOG_LEVEL.DEBUG,">>> size within 2%");
                deleteItem(dupMeta, sourceMeta, songKey, 'BITDUR_SIZE_3_PERCENT'+subIndex);
                dups++;
              } else if ((options['-takeout']) && (sizeDiff >0)) {
                log(LOG_LEVEL.DEBUG,`DELETING EVEN WITH SMALLER SIZE (TAKEOUT OVERRIDE) ${getDetails(sourceMeta, dupMeta)}`);
                deleteItem(dupMeta, sourceMeta, songKey, 'BITDUR_SRC_MUCH_BIGGER__TAKEOUT'+subIndex);
                dups++;
              } else {
                log(LOG_LEVEL.DEBUG,"SIZE TOO DIFFERENT", sourceMeta.FileSize - dupMeta.FileSize, sizeDiff, "SIZEDIFF:", `<${songKey}>`, dupMeta.filename, dupMeta.FileSize, dupMeta.File_Modified_Date, "SOURCE->", sourceMeta.filename, sourceMeta.FileSize, sourceMeta.File_Modified_Date);
                reason += `[ SIZEDIFF: ${sizeDiff}, (${getDetails(sourceMeta, dupMeta)}) ]`;
              }
            }
            // 2. Duration and dup bit rate is close or worse
            // NOTE(review): && binds tighter than ||, so this reads
            // (within 6s AND within 3%) OR (0 < durationDiff < 10) — confirm intended
            else if ((Math.abs(durationDiff) < 6) && ((durationDiff / sourceMeta.Duration) < .03)
              ||(durationDiff <10 && durationDiff > 0)) {
              log(LOG_LEVEL.DEBUG,"CLOSETIME:", durationDiff, "SIZEDIFF:", sizeDiff, `BIT RATE: SRC:`, sourceMeta.OverallBitRate, 'Duplicate:', dupMeta.OverallBitRate, `<${songKey}>`, dupMeta.filename, dupMeta.Duration, dupMeta.File_Modified_Date, "Source->", sourceMeta.filename, sourceMeta.Duration, sourceMeta.File_Modified_Date, "?", dupMeta.OverallBitRate < sourceMeta.OverallBitRate, sourceMeta.OverallBitRate - dupMeta.OverallBitRate);
              let bitDiff = sourceMeta.OverallBitRate - dupMeta.OverallBitRate;
              // 2.a. dup is worse bit rate
              if ((sourceMeta.OverallBitRate - dupMeta.OverallBitRate) >= 0) {
                log(LOG_LEVEL.DEBUG,"ok to delete (dup bit rate equal or worse than source)");
                deleteItem(dupMeta, sourceMeta, songKey, 'CLOSEDUR_SRCBRBIGGER'+subIndex);
                dups++;
              }
              // 2.b. dup is a likely aac recoded to much higher google music
              else if ((options['-takeout'] && dupMeta.OverallBitRate == 160000) &&
                (dupMeta.Format === 'MPEG Audio' )) {
                log(LOG_LEVEL.DEBUG,"dup has better bit rate, but looks like google music re-encoding junk", getDetails(sourceMeta, dupMeta));
                deleteItem(dupMeta, sourceMeta, songKey, 'CLOSEDUR_LAMERECODE'+subIndex);
              }
              // 2.c. dup is an mp3 only slightly higher bitrate than aac
              else if (bitDiff > -10000 &&
                dupMeta.Format === 'MPEG Audio' && sourceMeta.Format === 'MPEG-4') {
                log(LOG_LEVEL.DEBUG,"dup has lame encoded mp3 of slightly better bit rate than AAC source, keeping source (possible google music junk)", bitDiff, getDetails(sourceMeta,dupMeta));
                deleteItem(dupMeta, sourceMeta, songKey, 'CLOSEDUR_AAC_TO_MP3'+subIndex);
              }
              else if (options['-littleworse'] && (bitDiff > -5000 && bitDiff <=0 )) {
                log(LOG_LEVEL.DEBUG,"dup has has only slightly better bitrate than source", bitDiff, getDetails(sourceMeta,dupMeta));
                deleteItem(dupMeta, sourceMeta, songKey, 'CLOSEDUR_DUP_BETTER'+subIndex);
              }
              else {
                log(LOG_LEVEL.DEBUG,"BETTER BIT RATE THAN SOURCE, not deleting", dupMeta.Encoded_Library,dupMeta.OverallBitRate);
                reason += `[DP BETTER BR: ${bitDiff} (${getDetails(sourceMeta,dupMeta)}) ]`;
              }
            } else if (options['-small'] && dupMeta.Duration <=30 && sourceMeta.Duration > 30) {
              // NOTE(review): subIndex is passed as an extra 5th argument here and in
              // the next two calls (deleteItem ignores it) instead of being appended
              // to the reason string as in the branches above — confirm intent
              log(LOG_LEVEL.DEBUG,"small duplicate song, deleting",dupMeta.duration, sourceMeta.duration);
              deleteItem(dupMeta, sourceMeta, songKey, "SMALL_FILE",subIndex);
            } else if (options['-small'] && options['-takeout'] &&
              (durationDiff > 0 && (sourceMeta.OverallBitRate - dupMeta.OverallBitRate) >-30000)) {
              log(LOG_LEVEL.DEBUG,"takeout duplicate key that is smaller and with bit rate no more than 30k more than source",dupMeta.duration, sourceMeta.duration);
              deleteItem(dupMeta, sourceMeta, songKey, "SMALLER_TAKEOUT",subIndex);
            } else if (options['-takeout'] && options['-takeout'] === 'nuke') {
              log(LOG_LEVEL.DEBUG,"takeout duplicate key found. Nuke option set, so deleting",dupMeta.duration, sourceMeta.duration);
              deleteItem(dupMeta, sourceMeta, songKey, "NUKE_TAKEOUT",subIndex);
            } else {
              log(LOG_LEVEL.DEBUG,sourceMeta.FileSize - dupMeta.FileSize, "BITDIFF:", `<${songKey}>`, dupMeta.filename, dupMeta.OverallBitRate, dupMeta.Duration, dupMeta.File_Modified_Date, "SOURCE->", sourceMeta.filename, sourceMeta.OverallBitRate, sourceMeta.Duration, sourceMeta.File_Modified_Date);
              reason += `[BITDIFF: (${getDetails(sourceMeta,dupMeta)}) ]`;
            }
          }
        });
        if (!dups) {
          log(LOG_LEVEL.INFO,"^^^UNIQUE: <", songKey, ">", getTitle(dupMeta),dupMeta.filename, reason);
        }
      });
    } else {
      const m = dupIndex[subIndex][songKey][0];
      log(LOG_LEVEL.INFO,"^^NOSOURCE:<", songKey, ">", getTitle(m),dupIndex[subIndex][songKey].length, m.filename, m.OverallBitRate, "COMMENTS:\t",m.Comment);
    }
  });
}
// Clean out songs based on certain quality markers regardless of duplicate
// status: mp3.com / riffage.com sampler comments, "poor quality"/"muddy"
// comment or grouping tags, and suspiciously tiny (<10KB) .mp3 files.
// Runs only with -clean; deleteItem still applies its usual guards.
// (was an implicit global assignment — now a proper const declaration)
const cleanBadSongs = () => {
  Object.keys(dupIndex.byTitle).forEach(songKey => {
    dupIndex.byTitle[songKey].forEach(m => {
      log(LOG_LEVEL.DEBUG,"--->",songKey, m.Comment);
      if (m.Comment && m.Comment.includes('mp3.com')) {
        log(LOG_LEVEL.DEBUG,"MP3 SAMPLER:<", songKey, ">", getTitle(m),dupIndex.byTitle[songKey].length, m.filename, m.OverallBitRate, "COMMENTS:",m.Comment);
        deleteItem(m, {}, songKey, 'CLEAN_MP3_COM');
      }
      if (m.Comment && m.Comment.includes('riffage.com')) {
        log(LOG_LEVEL.DEBUG,"RIFFAGE SAMPLER:<", songKey, ">", getTitle(m),dupIndex.byTitle[songKey].length, m.filename, m.OverallBitRate, "COMMENTS:",m.Comment);
        deleteItem(m, {}, songKey, 'CLEAN_RIFFAGE');
      }
      if (m.Comment && (m.Comment.includes('poor quality') || m.Comment.includes('muddy'))) {
        log(LOG_LEVEL.DEBUG,"POOR QUALITY:<", songKey, ">", getTitle(m),dupIndex.byTitle[songKey].length, m.filename, m.OverallBitRate, "COMMENTS:",m.Comment);
        deleteItem(m, {}, songKey, 'CLEAN_QUALITY_COMMENT');
      }
      if (m.Grouping && (m.Grouping.includes('Poor quality') || m.Grouping.includes('poor quality'))) {
        log(LOG_LEVEL.DEBUG,"POOR QUALITY GROUPING:<", songKey, ">", getTitle(m),dupIndex.byTitle[songKey].length, m.filename, m.OverallBitRate, "GROUPING:",m.Grouping);
        deleteItem(m, {}, songKey, 'CLEAN_QUALITY_GROUPING');
      }
      // dot escaped (was /.mp3$/, matching any character before "mp3"), and
      // the reason string is now passed to deleteItem instead of the log call
      if (m.FileSize < 10000 && /\.mp3$/.test(m.filename)) {
        log(LOG_LEVEL.DEBUG,"SMALL_MP3:",m.filename,m.FileSize);
        deleteItem(m, {}, songKey, 'CLEAN_SMALL_MP3');
      }
    });
  });
}
// Blind size comparison.
// Ignores tags entirely and matches on the "<bitrate>-<duration>" index key;
// a dup is deleted when the file sizes are identical, or when both files
// carry the exact same Encoded_Date tag.
const sizeCompare = () => {
  log(LOG_LEVEL.INFO,"----- Comparing size -----");
  Object.keys(dupIndex.bySize).forEach(songKey => {
    if (sourceIndex.bySize[songKey]) {
      dupIndex.bySize[songKey].forEach(dupMeta => {
        let dups = 0;
        const sourceSongs = sourceIndex.bySize[songKey];
        sourceSongs.forEach(sourceMeta => {
          const sizeDiff = sourceMeta.FileSize - dupMeta.FileSize;
          if (!dups) {
            if (sourceMeta.FileSize === dupMeta.FileSize) {
              log(LOG_LEVEL.DEBUG,">>>EQUAL FILES");
              deleteItem(dupMeta, sourceMeta, songKey, 'SIZE_EQUAL');
              dups++;
            // mediainfo's tag is Encoded_Date; the original compared the
            // nonexistent Encode_Date, so this branch could never fire
            } else if (sourceMeta.Encoded_Date === dupMeta.Encoded_Date && sourceMeta.Encoded_Date) {
              log(LOG_LEVEL.DEBUG,">>>Same encode date sizeDiff:",sizeDiff);
              deleteItem(dupMeta, sourceMeta, songKey, 'SIZE_SAME_ENCODE_DATE');
              dups++;
            } else {
              log(LOG_LEVEL.DEBUG,"SIZE TOO DIFFERENT", sourceMeta.FileSize - dupMeta.FileSize, sizeDiff, "SIZEDIFF:", `<${songKey}>`, dupMeta.filename, dupMeta.FileSize, dupMeta.File_Modified_Date, "SOURCE->", sourceMeta.filename, sourceMeta.FileSize, sourceMeta.File_Modified_Date);
            }
          }
        });
        if (!dups) {
          log(LOG_LEVEL.INFO,"unique size: <", songKey, ">",getTitle(dupMeta), dupMeta.filename);
        }
      });
    } else {
      const m = dupIndex.bySize[songKey][0];
      log(LOG_LEVEL.INFO,"NOSIZESOURCE size:<", songKey, ">", dupIndex.bySize[songKey].length, m.filename, m.OverallBitRate);
    }
  });
}
// Match source and dup purely on identical Encoded_Date tags, then confirm
// with file size: equal, or source up to 12500 bytes bigger, or source up
// to 3000 bytes smaller, counts as a duplicate.
const encodeDateCompare = () => {
  log(LOG_LEVEL.INFO,"----- Comparing encode date -----");
  Object.keys(dupIndex.byEncodedDate).forEach(songKey => {
    if (sourceIndex.byEncodedDate[songKey]) {
      dupIndex.byEncodedDate[songKey].forEach(dupMeta => {
        let dups = 0;
        const sourceSongs = sourceIndex.byEncodedDate[songKey];
        sourceSongs.forEach(sourceMeta => {
          const sizeDiff = sourceMeta.FileSize - dupMeta.FileSize;
          if (!dups) {
            if (sourceMeta.FileSize === dupMeta.FileSize) {
              log(LOG_LEVEL.DEBUG,">>>EQUAL FILES");
              deleteItem(dupMeta, sourceMeta, songKey, 'ENCODE_DATE_EQUAL');
              dups++;
            } else if ((sizeDiff < 12500) && (sizeDiff > 0)) {
              // reason string is now passed to deleteItem (it was being
              // appended to the log call instead, leaving the DUP line blank)
              log(LOG_LEVEL.DEBUG,">>>less than 12500 more");
              deleteItem(dupMeta, sourceMeta, songKey, 'ENCODE_DATE_SRC_BIGGER');
              dups++;
            } else if ((sizeDiff > -3000) && (sizeDiff < 0)) {
              log(LOG_LEVEL.DEBUG,">>>-3000 to 0");
              deleteItem(dupMeta, sourceMeta, songKey, 'ENCODE_DATE_DUP_BIGGER');
              dups++;
            } else {
              log(LOG_LEVEL.DEBUG,"SIZE TOO DIFFERENT", sourceMeta.FileSize - dupMeta.FileSize, sizeDiff, "SIZEDIFF:", `<${songKey}>`, dupMeta.filename, dupMeta.FileSize, dupMeta.File_Modified_Date, "SOURCE->", sourceMeta.filename, sourceMeta.FileSize, sourceMeta.File_Modified_Date);
            }
          }
        });
        if (!dups) {
          // was getTitle(dupSong) — an undefined name that threw a
          // ReferenceError whenever a dup had no size-compatible source
          log(LOG_LEVEL.INFO,"unique encode date: <", songKey, ">",getTitle(dupMeta), dupMeta.filename);
        }
      });
    } else {
      const m = dupIndex.byEncodedDate[songKey][0];
      log(LOG_LEVEL.INFO,"NOENCODESOURCE size:<", songKey, ">", dupIndex.byEncodedDate[songKey].length, m.filename, m.OverallBitRate);
    }
  });
}
// Delete every duplicate whose ARTIST-ALBUM-TITLE key contains one of the
// user-supplied -match substrings. Keys shorter than 8 characters are
// rejected so a short match cannot accidentally wipe huge swaths of files.
const filterByKey = (filterKeys) => {
  log(LOG_LEVEL.INFO,"Deleting items that match key");
  // was an implicit global (`goodKeys = ...`)
  const goodKeys = filterKeys.filter(filterKey => {
    if (filterKey.length >= 8) {
      return true;
    }
    log(LOG_LEVEL.ERROR,`filter key: ${filterKey} is too small. Not using`);
    return false;
  });
  if (!goodKeys.length) {
    log(LOG_LEVEL.WARN,"No keys, not filtering");
    return;
  }
  // was `.filter()` used purely for side effects with no return value
  Object.keys(dupIndex.byTitle).forEach(k => {
    goodKeys.forEach(filterKey => {
      if (k.indexOf(filterKey) !== -1) {
        dupIndex.byTitle[k].forEach(m => {
          deleteItem(m, {}, k, 'SPECIFIED_KEY:'+k);
        });
      }
    });
  });
}
// Different duplicate finding methods, gated by command-line flags.
// remove explicit keys supplied with -match
if (options['-match'] && options['-match'].length) {
  filterByKey(options['-match']);
}
// match by album, title, artist (the default pass; -notitle skips it)
if (!options['-notitle']) {
  findDupByTitle();
}
else {
  log(LOG_LEVEL.INFO,"-notitle flag, not doing title search");
}
// match by artist + title only (album ignored)
// NOTE(review): -album is not listed in showHelp's option table
if (options['-album']) {
  log(LOG_LEVEL.INFO,"-album: Looking for duplicates without album");
  findDupByTitle('noAlbum');
}
// match by artist, album and track number instead of track title
// (log message previously mislabeled this branch as "-album:")
if (options['-track']) {
  log(LOG_LEVEL.INFO,"-track: match by album artist and track number");
  findDupByTitle('noTrack');
}
// find dups where the dup's "Various Artists" tag matches any source artist
if (options['-various']) {
  log(LOG_LEVEL.INFO,'-various: finding duplicates where "Various Artists" are looked for any artist');
  findDupByTitle('noArtist','VARIOUSARTISTS|VARIOUS', 'byTitle');
}
// match by same size and bitrate
if (options['-size']) {
  log(LOG_LEVEL.INFO,'-size: compare songs with same size');
  sizeCompare();
}
// match by same encode date
if (options['-encode']) {
  log(LOG_LEVEL.INFO,"-encode: compare songs with same encode date");
  encodeDateCompare();
}
// drop known-bad files (sampler comments, poor-quality tags, tiny mp3s)
if (options['-clean']) {
  log(LOG_LEVEL.INFO,"Cleaning out songs");
  cleanBadSongs();
}
// final summary is FATAL-level so it prints at any verbosity
log(LOG_LEVEL.FATAL,`${deleteCount} files deleted (${attemptCount} attempted)`);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment