Skip to content

Instantly share code, notes, and snippets.

@allenaven
Created June 25, 2017 10:51
Show Gist options
  • Save allenaven/dc1faf71f3d3deb719c38a237e13eea5 to your computer and use it in GitHub Desktop.
Save allenaven/dc1faf71f3d3deb719c38a237e13eea5 to your computer and use it in GitHub Desktop.
Moves ebook files from a complicated directory structure into a single folder, then deletes the leftover metadata files and directory paths
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 12 19:09:58 2017
@author: aa
"""
import os
import shutil
def recurse_filenames(directory):
    """Yield the path of every file found under ``directory``, recursively.

    ``directory`` is one directory name as a string; it can be a full path
    or relative to the working directory.  Sub-directories are descended
    into, and entries that are neither files nor directories (for example
    broken symlinks) are skipped.

    A directory that cannot be scanned is reported on stdout and skipped.
    Nothing is yielded for it, so consumers only ever receive path strings.
    """
    try:
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_file():
                    yield entry.path
                elif entry.is_dir():
                    yield from recurse_filenames(entry.path)
    except OSError as err:
        # Best effort: report the unreadable directory and carry on with
        # the rest of the tree instead of aborting the whole traversal.
        print('Error!', err.args)
def get_filenames(directory):
    """Return the paths of all ebook files under ``directory``.

    Uses ``recurse_filenames(directory)`` to list every file in the
    directory and its sub-directories, then keeps only the paths whose
    name ends in ``.epub`` or ``.pdf``.  The comparison ignores case and
    looks at the end of the path only, so a name such as ``notes.pdf.bak``
    or a file that merely lives inside a folder called ``scans.pdf`` is
    not mistaken for a book.
    """
    book_suffixes = ('.epub', '.pdf')
    return [
        path for path in recurse_filenames(directory)
        # Keep only string paths; anything else cannot name a book file.
        if isinstance(path, str) and path.lower().endswith(book_suffixes)
    ]
def del_empty_subdirs(dirbase):
    """Remove every branch below ``dirbase`` that holds no epub or pdf file.

    ``dirbase`` is the base directory name.  The tree is walked bottom-up
    so that each directory is judged on its whole subtree: a directory is
    kept when it, or any directory beneath it, contains a file ending in
    ``.epub`` or ``.pdf`` (case-insensitive).  Every other sub-directory is
    deleted together with its remaining contents.  ``dirbase`` itself is
    never removed.
    """
    book_suffixes = ('.epub', '.pdf')
    top = os.fspath(dirbase)
    # Directories already known to contain a book somewhere in their subtree.
    with_books = set()
    # topdown=False visits children before their parent, so by the time a
    # parent is examined the verdict for each of its children is known.
    for current, subdirs, filenames in os.walk(top, topdown=False):
        holds_book = (
            any(name.lower().endswith(book_suffixes) for name in filenames)
            or any(os.path.join(current, sub) in with_books
                   for sub in subdirs)
        )
        if holds_book:
            with_books.add(current)
            if current != top:
                print('Cannot remove:', current, '\n', filenames, '\n')
        elif current != top:
            print('Removing:', current, '\n', filenames, '\n')
            shutil.rmtree(current)
def move_book_files(dirbase, keepers):
    """Move all "book" files in ``keepers`` into the root ``dirbase``.

    ``keepers`` is an iterable of file paths.  Each file that is not
    already located directly in ``dirbase`` is moved there, with spaces in
    its name replaced by underscores.  Files already in ``dirbase`` are
    left untouched.

    When the target name is already taken (for example two folders each
    hold a ``book.pdf``), a numeric suffix is added before the extension
    (``book_1.pdf``, ``book_2.pdf``, ...) so that no file is overwritten.
    """
    base = os.path.abspath(dirbase)
    for source in keepers:
        # Compare normalised paths so a trailing slash or a relative
        # ``dirbase`` does not make base-level files look misplaced.
        if os.path.abspath(os.path.dirname(source)) == base:
            continue
        new_name = os.path.basename(source).replace(' ', '_')
        stem, ext = os.path.splitext(new_name)
        target = os.path.join(dirbase, new_name)
        attempt = 1
        # lexists also sees broken symlinks, which would still be clobbered.
        while os.path.lexists(target):
            target = os.path.join(dirbase, f'{stem}_{attempt}{ext}')
            attempt += 1
        shutil.move(source, target)
## Body
# Script entry point: these statements run top to bottom as soon as the
# module is executed, and each step relies on the one before it.
# Hard-coded root of the ebook library to tidy.
wd = '/home/aa/Books_local'
# Make the library root the working directory so relative paths resolve there.
os.chdir(wd)
# Collect every .epub/.pdf path found anywhere under the root ...
keepers = get_filenames(wd)
# ... relocate those files into the root directory itself ...
move_book_files(wd, keepers)
# ... then prune the sub-directories that no longer hold any book file.
del_empty_subdirs(wd)
### ### ###
# Check for dupes with hashes
import hashlib
def chunker(file, chunk_size=4096):
    """Yield the contents of ``file`` as successive ``bytes`` chunks.

    ``file`` is the path of the file to read; it is opened in binary mode.
    ``chunk_size`` is the maximum number of bytes per chunk (4096 by
    default).  Reading continues until end-of-file, so the generator never
    emits an empty chunk and does not depend on a file size sampled up
    front.  An empty file yields nothing.
    """
    with open(file, 'rb') as stream:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                # read() returns b'' only at end-of-file.
                break
            yield chunk
# Hash every file under the library root and report the files whose
# content (MD5 digest) is shared with at least one other file.
from collections import Counter

# Keep only string paths; anything else cannot be opened for hashing.
files = [path for path in recurse_filenames(wd) if isinstance(path, str)]
hashes = []
for f in files:
    h = hashlib.md5()
    for chunk in chunker(f):
        h.update(chunk)
    hashes.append(h.hexdigest())
# One counting pass instead of calling hashes.count() for every digest.
hash_counts = Counter(hashes)
hashset = set(hash_counts)
# (digest, occurrences) pairs for every digest seen more than once.
hash_valuecounts = [(digest, count) for digest, count in hash_counts.items()
                    if count != 1]
hashzip = list(zip(files, hashes))
for path, digest in hashzip:
    if hash_counts[digest] > 1:
        print('Duplicate file:', path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment