Skip to content

Instantly share code, notes, and snippets.

@minhoryang
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save minhoryang/ec65bbd8b50d74417d7b to your computer and use it in GitHub Desktop.
Save minhoryang/ec65bbd8b50d74417d7b to your computer and use it in GitHub Desktop.
(Snippet) Sort files of unknown encoding into directories named after their detected encoding (with multiprocessing.Pool).
#!env python3
"""Divide the unknown encoding files to its encoding-named directory."""
from functools import lru_cache
import multiprocessing
import os
import shutil
import sys

from chardet.universaldetector import UniversalDetector
def detect_encoding(path):
    """Detect the encoding of the file at *path* with the 'chardet' module.

    The file is read in binary mode and fed to a ``UniversalDetector`` line
    by line.

    Args:
        path: path of the file to inspect.

    Returns:
        The detected encoding name, or the string ``'None'`` when the
        detector reports no encoding (so the result is always usable as a
        directory name).
    """
    dtxor = UniversalDetector()
    # The context manager guarantees the handle is closed even if feeding
    # raises; the original left it to the garbage collector.
    with open(path, 'rb') as stream:
        for line in stream:
            dtxor.feed(line)
            # Once the detector is confident, further input is ignored by
            # it, so there is no point in reading the rest of the file.
            if dtxor.done:
                break
    dtxor.close()
    return dtxor.result['encoding'] or 'None'
@lru_cache(maxsize=None)
def init_destination(base, encoding):
    """Return the directory ``base/encoding``, creating it when missing.

    With lru_cache, mkdir is attempted only once per (base, encoding) pair
    in each process. An unbounded cache is fine because the number of
    distinct encoding names is small.

    Args:
        base: existing directory that will contain the encoding directory.
        encoding: encoding name, used verbatim as the sub-directory name.

    Returns:
        The path of the (now existing) target directory.

    Raises:
        OSError: if the directory cannot be created, or if the path already
            exists but is not a directory.
    """
    target = os.path.join(base, encoding)
    try:
        os.mkdir(target)
    except FileExistsError:
        # Another process or an earlier run may have created it already;
        # that is fine as long as it really is a directory. Anything else
        # must not be hidden, otherwise callers would move a file *onto*
        # this path instead of *into* it.
        if not os.path.isdir(target):
            raise
    return target
def logic(packed):
    """(multiprocessing) Move one entry to its encoding-named directory.

    Defined at module level so that pool workers can import it under any
    process start method.

    Args:
        packed: a ``(base, filename)`` tuple, packed into a single argument
            because the pool hands each worker call exactly one item.

    Entries that are not regular files are left untouched. A failure while
    reading or moving a file is reported on stderr and the file is skipped,
    so one bad file does not stop the whole run.
    """
    base, filename = packed
    origin = os.path.join(base, filename)
    if not os.path.isfile(origin):
        return
    try:
        shutil.move(origin,
                    init_destination(base, detect_encoding(origin)))
    except OSError as err:
        print('skipped {}: {}'.format(origin, err), file=sys.stderr)


def main(argv=None):
    """Sort every file of the given directories by detected encoding.

    Args:
        argv: list of directory paths; defaults to ``sys.argv[1:]``.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            """Fallback when tqdm is not installed: no progress bar."""
            return iterable

    bases = sys.argv[1:] if argv is None else argv
    queued = [(base, filename)
              for base in bases
              for filename in tqdm(os.listdir(base), leave=True)]
    if not queued:
        return

    # Leave one core free, but never ask for zero workers: Pool(0) raises
    # ValueError on a single-core machine.
    workers = max(1, multiprocessing.cpu_count() - 1)
    with multiprocessing.Pool(workers) as pool:
        # Consuming the iterator waits for every task, makes the progress
        # bar track completed work, and re-raises unexpected worker errors
        # instead of dropping them.
        for _ in tqdm(pool.imap_unordered(logic, queued),
                      total=len(queued), leave=True):
            pass


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment