Last active
August 29, 2015 14:05
-
-
Save minhoryang/ec65bbd8b50d74417d7b to your computer and use it in GitHub Desktop.
(Snippet) Sort files of unknown encoding into directories named after their detected encoding (with multiprocessing.Pool).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!env python3 | |
"""Divide the unknown encoding files to its encoding-named directory.""" | |
from functools import lru_cache | |
import os | |
from chardet.universaldetector import UniversalDetector | |
def detect_encoding(path):
    """Detect the character encoding of the file at *path* using chardet.

    The file is read in binary mode and fed line by line to a
    ``UniversalDetector``. Feeding stops as soon as the detector reports it
    is done, so large files are not read in full unnecessarily.

    Args:
        path: Path of the file to inspect.

    Returns:
        The detected encoding name, or the string ``'None'`` when the
        detector could not determine one.
    """
    dtxor = UniversalDetector()
    # The context manager guarantees the handle is closed even if feeding
    # raises; a bare ``open()`` in the loop header would leak it.
    with open(path, 'rb') as stream:
        for line in stream:
            dtxor.feed(line)
            if dtxor.done:
                break
    dtxor.close()
    return dtxor.result['encoding'] or 'None'
@lru_cache(maxsize=None)
def init_destination(base, encoding):
    """Return the directory ``base/encoding``, creating it if necessary.

    The result is memoized with ``lru_cache`` so that, within one process,
    ``mkdir`` is attempted only once per ``(base, encoding)`` pair. An
    unbounded cache is acceptable because the set of encoding names is small.

    Args:
        base: Directory under which the encoding-named directory lives.
        encoding: Name of the encoding, used as the directory name.

    Returns:
        The joined path ``os.path.join(base, encoding)``.

    Raises:
        OSError: If the directory cannot be created for any reason other
            than already existing (e.g. missing *base*, permission denied).
    """
    target = os.path.join(base, encoding)
    try:
        os.mkdir(target)
    except FileExistsError:
        # Another call (possibly in a different worker process) created it
        # first; that is the only failure that is safe to ignore.
        pass
    return target
def logic(packed):
    """(multiprocessing) Move one file to its encoding-named directory.

    Defined at module level (not under the ``__main__`` guard) so worker
    processes started with the "spawn" method can import it.

    Args:
        packed: A ``(base, filename)`` tuple naming the entry to process.

    Returns:
        ``None`` when the entry was moved or is not a regular file, otherwise
        a ``'path: error'`` string describing why the move failed.
    """
    import shutil  # function-scope: not part of the file-level imports

    base, filename = packed
    origin = os.path.join(base, filename)
    if not os.path.isfile(origin):
        return None
    try:
        shutil.move(origin,
                    init_destination(base, detect_encoding(origin)))
    except OSError as err:
        # Report instead of raising so one bad file does not abort the run.
        return '{}: {}'.format(origin, err)
    return None


if __name__ == "__main__":
    import multiprocessing
    import sys
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            """Fallback used without tqdm: return the iterable unchanged."""
            return iterable

    # Collect every (directory, entry) pair from the directories given on
    # the command line.
    queued = []
    for _base in sys.argv[1:]:
        queued.extend(
            (_base, _filename)
            for _filename in tqdm(os.listdir(_base), leave=True))

    # Leave one core free, but never ask for zero workers (single-core host).
    workers = max(1, multiprocessing.cpu_count() - 1)
    failures = []
    with multiprocessing.Pool(workers) as pool:
        # Consume the result iterator so the progress bar tracks completed
        # moves and worker failures are actually observed.
        results = pool.imap_unordered(logic, queued)
        for _failure in tqdm(results, total=len(queued), leave=True):
            if _failure:
                failures.append(_failure)
        pool.close()
        pool.join()

    for _failure in failures:
        print(_failure, file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment