Skip to content

Instantly share code, notes, and snippets.

@lttzzlll
Last active April 27, 2018 09:35
Show Gist options
  • Save lttzzlll/d1c9d76de8a0c99f44b627a55fc1f5bf to your computer and use it in GitHub Desktop.
Save lttzzlll/d1c9d76de8a0c99f44b627a55fc1f5bf to your computer and use it in GitHub Desktop.
import hashlib
from concurrent import futures
import os
import time
from itertools import chain
from functools import wraps
def timeit(func):
    """Decorate ``func`` so that every call prints how long it took.

    The wrapped function's return value is passed through unchanged. If
    ``func`` raises, the exception propagates and nothing is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is the clock intended for measuring short
        # durations; time.time() follows the system clock, which can be
        # adjusted while the call is running.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print('function {} cost {}'.format(func.__name__, elapsed))
        return res
    return wrapper
def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and consumed in 4096-byte chunks, so
    it is never loaded into memory in one piece.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Use an explicit loop: in Python 3 ``map`` returns a lazy iterator,
        # so ``map(hash_md5.update, ...)`` without consuming the result
        # never calls ``update`` and yields the digest of empty input.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def compare_many(cmp_list):
    """Hash pairs of files in a thread pool and print whether each pair matches.

    Args:
        cmp_list: iterable of ``(path_a, path_b)`` pairs. The pairs are
            flattened, every path is hashed with ``md5``, and the digests
            are compared two at a time in the original order.

    Returns:
        None. One line per pair is printed to stdout.
    """
    paths = list(chain.from_iterable(cmp_list))
    if not paths:
        # ThreadPoolExecutor raises ValueError for max_workers=0, and there
        # is nothing to compare anyway.
        return

    # One worker per file, as in the original experiment.
    with futures.ThreadPoolExecutor(max_workers=len(paths)) as executor:
        res = list(executor.map(md5, paths))

    # Digests come back in submission order: (a0, b0, a1, b1, ...).
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))
@timeit
def test():
    """Benchmark ``compare_many`` on 1000 repetitions of one fixed file pair."""
    a = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    b = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    # The same pair is repeated to create a workload of 2000 hash jobs.
    c = [(a, b) for _ in range(1000)]
    compare_many(c)
# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    test()
# Recorded output of one run:
# function test cost 1.2680234909057617
@lttzzlll
Copy link
Author

如果在写程序的时候由于某种原因产生了大量的 Python 进程,可以通过下面的命令强制结束所有映像名为 Python.exe 的进程(注意:如果当前 Python Shell 本身也是以 Python.exe 运行的,它同样会被结束)。

import os
# Windows only: /f forces termination, /im selects processes by image name.
# NOTE(review): the filter matches on image name alone, so an interpreter
# that itself runs as Python.exe would presumably be terminated too -- confirm
# before running this from the process you want to keep.
kill_command = "taskkill /f /im  Python.exe"
os.system(kill_command)

@lttzzlll
Copy link
Author

多进程模式。

import hashlib
from concurrent import futures
import os
import time
from itertools import chain
from functools import wraps


def timeit(func):
    """Decorate ``func`` so that every call prints its elapsed time.

    The decorated function returns whatever ``func`` returns. When ``func``
    raises, the exception propagates and no timing line is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Measure with perf_counter(): it is meant for timing intervals,
        # whereas time.time() tracks the adjustable system clock.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print('function {} cost {}'.format(func.__name__, elapsed))
        return res
    return wrapper


def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is read in binary mode, 4096 bytes at a time, so large files
    are hashed without being held in memory all at once.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # An explicit loop is required. Python 3's ``map`` is lazy, so a
        # bare ``map(hash_md5.update, ...)`` never feeds any data to the
        # hash and the result is always the digest of empty input.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def compare_many(cmp_list):
    """Hash pairs of files in a process pool and print whether each pair matches.

    Args:
        cmp_list: iterable of ``(path_a, path_b)`` pairs. The pairs are
            flattened, every path is hashed with ``md5``, and the digests
            are compared two at a time in the original order.

    Returns:
        None. One line per pair is printed to stdout.
    """
    paths = list(chain.from_iterable(cmp_list))
    if not paths:
        # Nothing to hash; avoid starting a pool for no work.
        return

    # cpu_count() may return None; fall back to a single worker then.
    workers = os.cpu_count() or 1
    # Aim for roughly four chunks per worker, but never let chunksize drop
    # to 0: Executor.map raises ValueError for chunksize < 1, which the
    # plain division did whenever there were fewer items than workers * 4.
    chunksize = max(1, len(paths) // (workers * 4))

    with futures.ProcessPoolExecutor(max_workers=workers) as executor:
        res = list(executor.map(md5, paths, chunksize=chunksize))

    # Digests come back in submission order: (a0, b0, a1, b1, ...).
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))


@timeit
def test():
    """Benchmark ``compare_many`` on 1000 repetitions of one fixed file pair."""
    first = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    second = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    # Repeat the same pair to build a workload of 2000 hash jobs.
    workload = [(first, second)] * 1000
    compare_many(workload)


# Run the benchmark only when executed as a script. The guard also keeps
# worker processes that import this module from re-running the benchmark.
if __name__ == "__main__":
    test()
# Recorded output of one run:
# function test cost 6.323782682418823

@lttzzlll
Copy link
Author

这居然不是最快的。chunksize=1才是最快的。

import hashlib
from concurrent import futures
import os
import time
from itertools import chain
from functools import wraps


def timeit(func):
    """Decorate ``func`` so that each call reports its duration on stdout.

    The return value of ``func`` is forwarded unchanged. If ``func`` raises,
    the exception propagates and no timing line is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is designed for interval measurement; time.time()
        # reads the system clock, which may be adjusted mid-call.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print('function {} cost {}'.format(func.__name__, elapsed))
        return res
    return wrapper


def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and hashed in 4096-byte chunks.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Iterate explicitly: ``map`` is lazy in Python 3, so an unconsumed
        # ``map(hash_md5.update, ...)`` never updates the hash and every
        # file appears to have the digest of empty input.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def compare_many(cmp_list):
    """Hash pairs of files in a process pool and print whether each pair matches.

    Each path is submitted to the pool as its own task (``chunksize=1``).

    Args:
        cmp_list: iterable of ``(path_a, path_b)`` pairs. The pairs are
            flattened, every path is hashed with ``md5``, and the digests
            are compared two at a time in the original order.

    Returns:
        None. One line per pair is printed to stdout.
    """
    paths = list(chain.from_iterable(cmp_list))
    if not paths:
        # Nothing to hash; avoid starting a pool for no work.
        return

    # The earlier chunksize computation was unused here and read the private
    # ``executor._max_workers`` attribute, so it has been dropped.
    with futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        res = list(executor.map(md5, paths, chunksize=1))

    # Digests come back in submission order: (a0, b0, a1, b1, ...).
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))


@timeit
def test():
    """Benchmark ``compare_many`` on 1000 repetitions of one fixed file pair."""
    left = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    right = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    # Repeat the same pair to build a workload of 2000 hash jobs.
    pairs = [(left, right)] * 1000
    compare_many(pairs)


# Run the benchmark only when executed as a script. The guard also keeps
# worker processes that import this module from re-running the benchmark.
if __name__ == "__main__":
    test()
# Recorded output of one run:
# function test cost 2.953723430633545

@lttzzlll
Copy link
Author

如果将数量变成 10000,线程依然是最快的。进程会很慢。创建1w个线程,虽然一开始很多,但这些任务是一次性的,所以不要紧。

@lttzzlll
Copy link
Author

我的测试是不加 chunksize,反而执行的更快。哪里错了?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment