allenaven/flatten_ebooks_dir.py

## flatten_ebooks_dir.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 12 19:09:58 2017

@author: aa
"""
import os
import shutil

def recurse_filenames(directory):
    """Yield the path of every file under ``directory``, recursively.

    ``directory`` is one directory name as a string; it can be a full path
    or a path relative to the working directory.

    Files are yielded as path strings (``os.DirEntry.path``) and
    sub-directories are descended into. Entries that are neither a file nor
    a directory (for example broken symlinks) are skipped.

    A directory that cannot be scanned is reported on stdout and skipped.
    Only paths are ever yielded, so callers can treat every item as a
    filename.
    """
    try:
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_file():
                    yield entry.path
                elif entry.is_dir():
                    yield from recurse_filenames(entry.path)
    except OSError as err:
        # Report and move on. The exception's ``args`` tuple must not be
        # yielded: it would be mixed into the stream of filenames and break
        # any caller that opens or moves each item.
        print('Error!', err.args)

def get_filenames(directory):
    """Return the paths of all e-book files under ``directory``.

    Uses ``recurse_filenames(directory)`` to list every file in the
    directory and its sub-directories, then keeps only the paths whose file
    extension is ``.epub`` or ``.pdf`` (compared case-insensitively).

    Returns a list of file paths.
    """
    book_extensions = ('.epub', '.pdf')
    # Compare the real extension. A substring test on the whole path also
    # accepts names such as ``notes.pdf.bak`` and every file stored inside a
    # folder called ``my.pdfs``.
    # Items that are not strings cannot be file paths and are ignored.
    return [path for path in recurse_filenames(directory)
            if isinstance(path, str)
            and os.path.splitext(path)[1].lower() in book_extensions]

def del_empty_subdirs(dirbase):
    """Remove every sub-directory branch of ``dirbase`` that holds no books.

    ``dirbase`` is the base directory name. Each directory below it is
    judged together with everything beneath it: when no file in that branch
    has ``.epub`` or ``.pdf`` in its name (case-insensitive), the whole
    branch is deleted with ``shutil.rmtree``. ``dirbase`` itself is never
    removed and symlinked directories are left alone.

    Prints one line per examined directory saying whether it was removed.
    """
    def _has_books(branch):
        # True when any file anywhere under ``branch`` looks like a book.
        for _dirpath, _dirnames, filenames in os.walk(branch):
            for name in filenames:
                lowered = name.lower()
                if '.epub' in lowered or '.pdf' in lowered:
                    return True
        return False

    for dirpath, dirnames, _filenames in os.walk(dirbase):
        # Iterate over a copy: removed branches are pruned from ``dirnames``
        # in place so that os.walk does not try to descend into them.
        for name in list(dirnames):
            branch = os.path.join(dirpath, name)
            if os.path.islink(branch):
                # shutil.rmtree refuses to operate on a symlink.
                continue
            if _has_books(branch):
                print('Cannot remove:', branch, '\n')
            else:
                print('Removing:', branch, '\n')
                shutil.rmtree(branch)
                dirnames.remove(name)

def move_book_files(dirbase, keepers):
    """Move all "book" files into the root directory ``dirbase``.

    ``keepers`` is an iterable of file paths. A file that already sits
    directly in ``dirbase`` is left untouched. Every other file is moved
    into ``dirbase`` with the spaces in its name replaced by underscores.
    When a file with the target name already exists, a numeric suffix
    (``_1``, ``_2``, ...) is inserted before the extension so that nothing
    is overwritten.
    """
    base = os.path.normpath(dirbase)
    for file in keepers:
        # Normalise both sides so that a trailing slash on ``dirbase`` does
        # not make files in the base directory look like they are elsewhere.
        if os.path.normpath(os.path.dirname(file)) == base:
            continue
        newfname = os.path.basename(file).replace(' ', '_')
        newfpath = os.path.join(dirbase, newfname)
        stem, ext = os.path.splitext(newfname)
        counter = 1
        # Same-named books from different folders must not clobber each
        # other, so look for the first free name.
        while os.path.exists(newfpath):
            newfpath = os.path.join(
                dirbase, '{}_{}{}'.format(stem, counter, ext))
            counter += 1
        shutil.move(file, newfpath)

## Body
# Script entry: these statements run as soon as the module is executed.
# Order matters: collect the book paths, move them into the base directory,
# then clean up the sub-directories that are left behind.
wd = '/home/aa/Books_local'  # hard-coded base directory of the collection
os.chdir(wd)  # make the base directory the working directory
keepers = get_filenames(wd)  # .epub / .pdf paths found under wd
move_book_files(wd, keepers)
del_empty_subdirs(wd)

### ### ###
# Check for dupes with hashes
import hashlib
def chunker(file, chunk_size=4096):
    """Yield the contents of ``file`` as successive ``bytes`` chunks.

    ``file`` is the path of the file to read; it is opened in binary mode.
    ``chunk_size`` is the maximum number of bytes per chunk (default 4096).

    Reading continues until end-of-file, so every yielded chunk is
    non-empty and an empty file yields nothing.
    """
    with open(file, 'rb') as openfile:
        # iter(callable, sentinel) calls read() until it returns b'' (EOF).
        # Counting bytes against os.path.getsize instead produced a trailing
        # empty chunk and depended on the size not changing during the read.
        yield from iter(lambda: openfile.read(chunk_size), b'')

#files = os.listdir()
# Hash every file below ``wd`` and report the files that share a digest.
# Only string items are kept: anything else cannot be opened as a path.
files = [path for path in recurse_filenames(wd) if isinstance(path, str)]
hashes = []
for path in files:
    md5 = hashlib.md5()
    for chunk in chunker(path):
        md5.update(chunk)
    hashes.append(md5.hexdigest())

# Count each digest in a single pass; calling ``hashes.count`` once per
# digest is quadratic in the number of files.
hash_counts = {}
for digest in hashes:
    hash_counts[digest] = hash_counts.get(digest, 0) + 1

# (digest, occurrences) for every digest that occurs more than once.
hash_valuecounts = [(digest, count) for digest, count in hash_counts.items()
                    if count != 1]

duplicate_hashes = {digest for digest, _count in hash_valuecounts}
for path, digest in zip(files, hashes):
    if digest in duplicate_hashes:
        print('Duplicate file:', path)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Created on Fri May 12 19:09:58 2017

	@author: aa
	"""
	import os
	import shutil

	def recurse_filenames(directory):
		"""Yield the path of every file under ``directory``, recursively.

		``directory`` is one directory name as a string; it can be a full path
		or a path relative to the working directory.

		Files are yielded as path strings (``os.DirEntry.path``) and
		sub-directories are descended into. Entries that are neither a file nor
		a directory (for example broken symlinks) are skipped.

		A directory that cannot be scanned is reported on stdout and skipped.
		Only paths are ever yielded, so callers can treat every item as a
		filename.
		"""
		try:
			with os.scandir(directory) as entries:
				for entry in entries:
					if entry.is_file():
						yield entry.path
					elif entry.is_dir():
						yield from recurse_filenames(entry.path)
		except OSError as err:
			# Report and move on. The exception's ``args`` tuple must not be
			# yielded: it would be mixed into the stream of filenames and break
			# any caller that opens or moves each item.
			print('Error!', err.args)

	def get_filenames(directory):
		"""Return the paths of all e-book files under ``directory``.

		Uses ``recurse_filenames(directory)`` to list every file in the
		directory and its sub-directories, then keeps only the paths whose file
		extension is ``.epub`` or ``.pdf`` (compared case-insensitively).

		Returns a list of file paths.
		"""
		book_extensions = ('.epub', '.pdf')
		# Compare the real extension. A substring test on the whole path also
		# accepts names such as ``notes.pdf.bak`` and every file stored inside a
		# folder called ``my.pdfs``.
		# Items that are not strings cannot be file paths and are ignored.
		return [
			path for path in recurse_filenames(directory)
			if isinstance(path, str)
			and os.path.splitext(path)[1].lower() in book_extensions
		]

	def del_empty_subdirs(dirbase):
		"""Remove every sub-directory branch of ``dirbase`` that holds no books.

		``dirbase`` is the base directory name. Each directory below it is
		judged together with everything beneath it: when no file in that branch
		has ``.epub`` or ``.pdf`` in its name (case-insensitive), the whole
		branch is deleted with ``shutil.rmtree``. ``dirbase`` itself is never
		removed and symlinked directories are left alone.

		Prints one line per examined directory saying whether it was removed.
		"""
		def _has_books(branch):
			# True when any file anywhere under ``branch`` looks like a book.
			for _dirpath, _dirnames, filenames in os.walk(branch):
				for name in filenames:
					lowered = name.lower()
					if '.epub' in lowered or '.pdf' in lowered:
						return True
			return False

		for dirpath, dirnames, _filenames in os.walk(dirbase):
			# Iterate over a copy: removed branches are pruned from ``dirnames``
			# in place so that os.walk does not try to descend into them.
			for name in list(dirnames):
				branch = os.path.join(dirpath, name)
				if os.path.islink(branch):
					# shutil.rmtree refuses to operate on a symlink.
					continue
				if _has_books(branch):
					print('Cannot remove:', branch, '\n')
				else:
					print('Removing:', branch, '\n')
					shutil.rmtree(branch)
					dirnames.remove(name)

	def move_book_files(dirbase, keepers):
		"""Move all "book" files into the root directory ``dirbase``.

		``keepers`` is an iterable of file paths. A file that already sits
		directly in ``dirbase`` is left untouched. Every other file is moved
		into ``dirbase`` with the spaces in its name replaced by underscores.
		When a file with the target name already exists, a numeric suffix
		(``_1``, ``_2``, ...) is inserted before the extension so that nothing
		is overwritten.
		"""
		base = os.path.normpath(dirbase)
		for file in keepers:
			# Normalise both sides so that a trailing slash on ``dirbase`` does
			# not make files in the base directory look like they are elsewhere.
			if os.path.normpath(os.path.dirname(file)) == base:
				continue
			newfname = os.path.basename(file).replace(' ', '_')
			newfpath = os.path.join(dirbase, newfname)
			stem, ext = os.path.splitext(newfname)
			counter = 1
			# Same-named books from different folders must not clobber each
			# other, so look for the first free name.
			while os.path.exists(newfpath):
				newfpath = os.path.join(
					dirbase, '{}_{}{}'.format(stem, counter, ext))
				counter += 1
			shutil.move(file, newfpath)

	## Body
	# Script entry: these statements run as soon as the module is executed.
	# Order matters: collect the book paths, move them into the base directory,
	# then clean up the sub-directories that are left behind.
	wd = '/home/aa/Books_local'  # hard-coded base directory of the collection
	os.chdir(wd)  # make the base directory the working directory
	keepers = get_filenames(wd)  # .epub / .pdf paths found under wd
	move_book_files(wd, keepers)
	del_empty_subdirs(wd)

	### ### ###
	# Check for dupes with hashes
	import hashlib
	def chunker(file, chunk_size=4096):
		"""Yield the contents of ``file`` as successive ``bytes`` chunks.

		``file`` is the path of the file to read; it is opened in binary mode.
		``chunk_size`` is the maximum number of bytes per chunk (default 4096).

		Reading continues until end-of-file, so every yielded chunk is
		non-empty and an empty file yields nothing.
		"""
		with open(file, 'rb') as openfile:
			# iter(callable, sentinel) calls read() until it returns b'' (EOF).
			# Counting bytes against os.path.getsize instead produced a trailing
			# empty chunk and depended on the size not changing during the read.
			yield from iter(lambda: openfile.read(chunk_size), b'')

	#files = os.listdir()
	# Hash every file below ``wd`` and report the files that share a digest.
	# Only string items are kept: anything else cannot be opened as a path.
	files = [path for path in recurse_filenames(wd) if isinstance(path, str)]
	hashes = []
	for path in files:
		md5 = hashlib.md5()
		for chunk in chunker(path):
			md5.update(chunk)
		hashes.append(md5.hexdigest())

	# Count each digest in a single pass; calling ``hashes.count`` once per
	# digest is quadratic in the number of files.
	hash_counts = {}
	for digest in hashes:
		hash_counts[digest] = hash_counts.get(digest, 0) + 1

	# (digest, occurrences) for every digest that occurs more than once.
	hash_valuecounts = [
		(digest, count) for digest, count in hash_counts.items()
		if count != 1
	]

	duplicate_hashes = {digest for digest, _count in hash_valuecounts}
	for path, digest in zip(files, hashes):
		if digest in duplicate_hashes:
			print('Duplicate file:', path)