Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
(Snippet) Sort files of unknown encoding into directories named after their detected encoding (with multiprocessing.Pool).
#!/usr/bin/env python3
"""Sort files of unknown encoding into directories named after their detected encoding."""
from functools import lru_cache
import os
from chardet.universaldetector import UniversalDetector
def detect_encoding(path):
    """Detect the character encoding of the file at *path* with chardet.

    The file is opened in binary mode and fed to a ``UniversalDetector``
    line by line; reading stops early once the detector reports that it
    has seen enough data.

    Args:
        path: Path of the file to inspect.

    Returns:
        The detected encoding name, or the string ``'None'`` when the
        detector could not determine one, so the result is always a
        string.
    """
    dtxor = UniversalDetector()
    # The context manager closes the handle even if feeding raises.
    with open(path, 'rb') as stream:
        for line in stream:
            dtxor.feed(line)
            if dtxor.done:
                break
    # close() finalises the prediction; `result` is only complete afterwards.
    dtxor.close()
    return dtxor.result['encoding'] or 'None'
@lru_cache(maxsize=None)
def init_destination(base, encoding):
    """Return the directory ``base/encoding``, creating it when missing.

    The result is memoised with ``lru_cache`` so that, within one process,
    the directory is created at most once per ``(base, encoding)`` pair.
    An unbounded cache is acceptable because the number of distinct
    encodings is small.

    Args:
        base: Directory under which the encoding-named folder lives.
        encoding: Encoding name used as the folder name.

    Returns:
        The joined path ``os.path.join(base, encoding)``.
    """
    target = os.path.join(base, encoding)
    try:
        os.mkdir(target)
    except FileExistsError:
        # Already present: either it pre-existed or another worker process
        # created it first. Other OS errors (e.g. permissions) propagate.
        pass
    return target
def logic(packed):
    """(multiprocessing) Move one file into its encoding-named directory.

    Defined at module level (not inside the ``__main__`` guard) so that
    worker processes can resolve it by name when the multiprocessing
    start method re-imports this module instead of forking.

    Args:
        packed: A ``(base, filename)`` tuple; ``filename`` is an entry of
            the directory ``base``.

    Entries that are not regular files (e.g. sub-directories, including
    the encoding-named ones created by earlier moves) are skipped.
    """
    import shutil  # local import: only the workers need it

    base, filename = packed
    origin = os.path.join(base, filename)
    if os.path.isfile(origin):
        shutil.move(origin, init_destination(base, detect_encoding(origin)))


if __name__ == "__main__":
    import multiprocessing
    import sys

    try:
        from tqdm import tqdm
    except ImportError:
        # tqdm is optional: fall back to a pass-through that returns the
        # wrapped iterable unchanged and ignores keyword arguments.
        tqdm = lambda *i, **kwargs: i[0]  # pylint: disable=invalid-name

    # Collect every (directory, entry) pair from the directories given on
    # the command line before dispatching work to the pool.
    queued = []
    for _base in sys.argv[1:]:
        for _filename in tqdm(os.listdir(_base), leave=True):
            queued.append((_base, _filename))

    # Leave one core free, but never request zero workers (Pool raises
    # ValueError for processes < 1, which happens on single-core hosts).
    _workers = max(1, multiprocessing.cpu_count() - 1)
    with multiprocessing.Pool(_workers) as pool:
        # imap_unordered is lazy and Pool.__exit__ calls terminate(), so the
        # iterator must be drained inside the with-block or pending moves
        # are dropped. Wrapping the results makes the bar track completion.
        for _ in tqdm(pool.imap_unordered(logic, queued),
                      total=len(queued), leave=True):
            pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment