Skip to content

Instantly share code, notes, and snippets.

@piyh
Created August 19, 2020 02:32
Show Gist options
  • Save piyh/ad2c45091228f71ee18a1ed707145639 to your computer and use it in GitHub Desktop.
Save piyh/ad2c45091228f71ee18a1ed707145639 to your computer and use it in GitHub Desktop.
# Sometimes the "best" format that youtube-dl selects changes.
# When you're archiving entire channels, this causes a second copy of the same video to be downloaded.
# When that happens, this script clears out the duplicates, provided the YouTube ID is included in the filename.
import os
import pandas as pd
from pathlib import Path
from datetime import datetime
from pprint import pprint as pp
def returnDupes(li) -> set:
    """Return the set of elements that occur more than once in *li*.

    *li* may be any iterable of hashable items; it is consumed in a single
    pass. Each repeated element appears exactly once in the result, no
    matter how many times it was repeated. An empty or duplicate-free
    input yields an empty set.
    """
    seen = set()
    dupe = set()
    for ele in li:
        if ele in seen:
            # Second (or later) sighting: record it as a duplicate.
            dupe.add(ele)
        else:
            seen.add(ele)
    return dupe
VIDEO_EXTS = ('.mkv', '.webm', '.mp4')
YT_ID_LEN = 11  # length of a YouTube video id
# Characters allowed in a YouTube id (URL-safe base64 alphabet).
YT_ID_CHARS = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
)
_DIGITS = frozenset('0123456789')


def _ytIDFromName(file):
    """Return the YouTube id at the end of *file*'s name, or None.

    Only files whose final extension is in VIDEO_EXTS are considered. The id
    is taken as the last YT_ID_LEN characters before the extension, after
    dropping one optional youtube-dl per-format suffix such as ``.f137``.

    ``Path.suffixes`` is deliberately not used: it splits on *every* dot in
    the name, so a title containing a dot ("Mr. Foo - <id>.mp4") would shift
    the slice and produce an empty or garbage id. Different videos would
    then share that bogus id and be deleted as "duplicates".

    None is returned when no plausible id is present, so such files are
    never candidates for deletion. A title that merely ends in 11 id-legal
    characters is still accepted; the script relies on the id being the
    last thing in the filename.
    """
    if file.suffix not in VIDEO_EXTS:
        return None
    stem = file.stem
    head, dot, tail = stem.rpartition('.')
    fmt = tail[1:]
    if dot and tail.startswith('f') and fmt and _DIGITS.issuperset(fmt):
        # "<title>-<id>.f<format>.<ext>": strip the format marker.
        stem = head
    candidate = stem[-YT_ID_LEN:]
    if len(candidate) != YT_ID_LEN or not YT_ID_CHARS.issuperset(candidate):
        return None
    return candidate


wd = Path('D:/kingcobrajfs/')
os.chdir(wd)
videos = []  # one record per video file that carries a YouTube id
ids = []     # the same ids, in listing order, for duplicate detection
for file in wd.iterdir():
    ytID = _ytIDFromName(file)
    if ytID is None:
        continue
    # Single stat() call so size and mtime come from the same snapshot.
    info = file.stat()
    ids.append(ytID)
    videos.append({
        'ytID': ytID,
        'path': file,
        'size': info.st_size,
        'mtime': info.st_mtime,
    })
# Ids that appear on more than one file in the directory listing.
dupeSet = returnDupes(ids)
# Maps each duplicated id to a one-element list holding the newest file seen
# so far for that id.
dupeDict = {}
for video in videos:
    ytID = video['ytID']
    if ytID not in dupeSet:
        continue

    entry = {'path': video['path'], 'mtime': video['mtime']}
    bucket = dupeDict.get(ytID)
    if not bucket:
        # First file seen for this id: nothing to compare against yet.
        dupeDict[ytID] = [entry]
        continue

    # Pit the newcomer against the current keeper; the sort is stable, so on
    # an mtime tie the earlier-listed file ends up first and is the one removed.
    bucket.append(entry)
    bucket.sort(key=lambda item: item.get('mtime'))
    deleting = bucket[0]
    saving = bucket[1]
    print(f"deleting {deleting} as it's older than {saving}")
    deleting['path'].unlink()
    bucket.pop(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment