Created
June 25, 2017 10:51
-
-
Save allenaven/dc1faf71f3d3deb719c38a237e13eea5 to your computer and use it in GitHub Desktop.
Moves ebook files from complicated directory structure to all one folder, deletes excess metadata files and dir paths
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 12 19:09:58 2017
@author: aa
"""
import hashlib
import os
import shutil
from collections import Counter
def recurse_filenames(directory):
    """Argument is one directory name as string input. Directory name can
    be a full path or relative to the working directory.
    Recursively finds all filenames (including path) from the specified
    directory and all of its subfolders. Returns a generator of path strings.

    Directories that cannot be listed are reported on stdout and skipped;
    nothing other than file paths is ever yielded. Symlinked directories are
    not descended into, and entries that are neither regular files nor
    directories (broken links, sockets, ...) are ignored.
    """
    try:
        # Materialise the listing so the scandir handle is closed before we
        # recurse, and so only the listing itself sits inside the `try`.
        with os.scandir(directory) as it:
            entries = list(it)
    except OSError as e:
        print('Error!', e.args)
        return

    for entry in entries:
        if entry.is_file():
            yield entry.path
        elif entry.is_dir(follow_symlinks=False):
            yield from recurse_filenames(entry.path)
def get_filenames(directory):
    """Uses `recurse_filenames(directory)` to list all files in dir & subdirs,
    returns list of filepaths filtered to include only .pdf and .epub files.

    The match is on the file extension (case-insensitive), not on a substring
    of the whole path, so names such as `notes.pdf.bak` or files that merely
    live under a directory called `scans.pdf/` are not picked up.
    """
    book_exts = (".epub", ".pdf")
    # Non-string items in the listing are skipped rather than crashing the
    # extension check.
    keepers = [file for file in recurse_filenames(directory)
               if isinstance(file, str) and file.lower().endswith(book_exts)]
    return keepers
def del_empty_subdirs(dirbase):
    """Input the base directory name. Remove all the branches that don't
    contain any epub or pdf files.

    The tree under `dirbase` is walked bottom-up. A subdirectory is kept when
    it holds a book file directly (extension match, case-insensitive) or when
    one of its subdirectories was kept; every other subdirectory is deleted
    with `shutil.rmtree`. `dirbase` itself is never removed. Returns None.
    """
    book_exts = (".epub", ".pdf")
    base = os.path.abspath(dirbase)
    kept = set()  # directories that must survive because they lead to books

    # Bottom-up so children are judged (and possibly deleted) before parents.
    for dirpath, dirnames, filenames in os.walk(base, topdown=False):
        if dirpath == base:
            continue
        has_books = any(name.lower().endswith(book_exts) for name in filenames)
        has_kept_child = any(os.path.join(dirpath, sub) in kept
                             for sub in dirnames)
        if has_books or has_kept_child:
            kept.add(dirpath)
            print('Cannot remove:', dirpath, '\n', filenames, '\n')
        else:
            print('Removing:', dirpath, '\n', filenames, '\n')
            shutil.rmtree(dirpath)
def move_book_files(dirbase, keepers):
    """Moves all "book" files to the root.

    `dirbase` is the destination directory and `keepers` an iterable of file
    paths. Files already located directly in `dirbase` are left untouched.
    Every other file is moved into `dirbase` with spaces in its name replaced
    by underscores. If the target name is already taken, a numeric suffix
    (`_1`, `_2`, ...) is inserted before the extension so that an existing
    file is never overwritten. Returns None.
    """
    # Compare normalised paths so a trailing slash or a relative `dirbase`
    # does not make base-directory files look like they need moving.
    base = os.path.abspath(dirbase)
    for file in keepers:
        if os.path.abspath(os.path.dirname(file)) == base:
            continue
        newfname = os.path.basename(file).replace(' ', '_')
        newfpath = os.path.join(dirbase, newfname)
        stem, ext = os.path.splitext(newfname)
        suffix = 1
        # lexists also counts broken symlinks as an occupied name.
        while os.path.lexists(newfpath):
            newfpath = os.path.join(dirbase,
                                    '{}_{}{}'.format(stem, suffix, ext))
            suffix += 1
        shutil.move(file, newfpath)
## Body
# NOTE: these statements run at import time and modify the filesystem.
# Root of the ebook library to flatten.
wd = '/home/aa/Books_local'
# Make the library root the working directory before any cleanup runs.
os.chdir(wd)
# Collect every .pdf/.epub below the root, pull them up into the root,
# then prune the subdirectories that no longer hold any books.
keepers = get_filenames(wd)
move_book_files(wd, keepers)
del_empty_subdirs(wd)
### ### ### | |
# Check for dupes with hashes | |
import hashlib | |
def chunker(file, chunk_size=4096):
    """Yield the contents of `file` as successive byte chunks.

    `file` is a path opened in binary mode; `chunk_size` is the maximum
    number of bytes per chunk (default 4096). Reading stops at end of file,
    so no empty chunk is produced and an empty file yields nothing. The file
    is closed when the generator finishes or is discarded.
    """
    with open(file, 'rb') as openfile:
        while True:
            chunk = openfile.read(chunk_size)
            # An empty read is the end-of-file signal; this avoids relying on
            # a size measured before reading started.
            if not chunk:
                break
            yield chunk
# Hash every file below the library root and report the ones whose contents
# are byte-for-byte identical to another file.
# Non-string items in the listing are dropped so that open() only ever
# receives real paths.
files = [path for path in recurse_filenames(wd) if isinstance(path, str)]
hashes = []
for f in files:
    # MD5 is used purely as a content fingerprint here, not for security.
    h = hashlib.md5()
    for chunk in chunker(f):
        h.update(chunk)
    hashes.append(h.hexdigest())

# Count each digest once, then list (in listing order) every file whose
# digest occurs more than once.
hash_counts = Counter(hashes)
for path, digest in zip(files, hashes):
    if hash_counts[digest] > 1:
        print('Duplicate file:', path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment