Last active
April 27, 2018 09:35
-
-
Save lttzzlll/d1c9d76de8a0c99f44b627a55fc1f5bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
from concurrent import futures | |
import os | |
import time | |
from itertools import chain | |
from functools import wraps | |
def timeit(func):
    """Decorator: print the wall-clock duration of each call to *func*.

    The wrapped function's return value is passed through unchanged, and
    ``functools.wraps`` preserves its name/docstring for introspection.
    """
    @wraps(func)
    def inner(*args, **kwargs):
        began = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - began
        print('function {} cost {}'.format(func.__name__, elapsed))
        return result
    return inner
def md5(fname):
    """Return the hex MD5 digest of the file at path *fname*.

    Reads the file in 4 KiB chunks so arbitrarily large files fit in memory.

    BUG FIX: the original used ``map(hash_md5.update, iter(...))``; in
    Python 3 ``map`` is lazy and the result was never consumed, so no data
    was ever hashed and every file produced the empty-input digest.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def compare_many(cmp_list):
    """Hash every file in a list of (path_a, path_b) pairs concurrently
    (thread pool) and print, per pair, whether the MD5 digests match.

    ROBUSTNESS FIXES: the original used ``max_workers=len(cmp_list)``,
    spawning one thread per file (2000 threads for this benchmark) and
    raising ``ValueError`` (``max_workers`` must be > 0) on an empty list.
    Workers are now capped and an empty input is a no-op.
    """
    flat = list(chain(*cmp_list))
    if not flat:
        return
    # I/O-bound hashing: a modest multiple of the CPU count is plenty.
    workers = min(len(flat), 8 * (os.cpu_count() or 1))
    with futures.ThreadPoolExecutor(max_workers=workers) as executor:
        res = list(executor.map(md5, flat))
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))
@timeit
def test():
    """Benchmark: compare the same pair of network files 1000 times."""
    a = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    b = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    pairs = [(a, b)] * 1000
    compare_many(pairs)


if __name__ == '__main__':
    test()
# function test cost 1.2680234909057617
多进程模式。
import hashlib
from concurrent import futures
import os
import time
from itertools import chain
from functools import wraps
def timeit(func):
    """Decorator that reports how long every invocation of *func* takes."""
    @wraps(func)
    def timed(*a, **kw):
        t0 = time.time()
        value = func(*a, **kw)
        print('function {} cost {}'.format(func.__name__, time.time() - t0))
        return value
    return timed
def md5(fname):
    """Return the hex MD5 digest of the file at path *fname*.

    Streams the file in 4 KiB chunks to bound memory use.

    BUG FIX: ``map(hash_md5.update, iter(...))`` was lazy and never
    consumed in Python 3, so the hash was always of zero bytes.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def compare_many(cmp_list):
    """Hash every file in a list of (path_a, path_b) pairs with a process
    pool and print, per pair, whether the MD5 digests match.

    BUG FIXES: the original ``divmod`` could produce ``chunksize == 0``
    whenever there were fewer items than 4x the worker count, and
    ``Executor.map`` raises ``ValueError`` for chunksize < 1. It also read
    the private ``executor._max_workers`` and left ``extra`` unused.
    """
    flat = list(chain(*cmp_list))
    if not flat:
        return
    workers = os.cpu_count() or 1
    with futures.ProcessPoolExecutor(max_workers=workers) as executor:
        # Aim for roughly four chunks per worker, never below 1.
        chunksize = max(1, len(flat) // (workers * 4))
        res = list(executor.map(md5, flat, chunksize=chunksize))
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))
@timeit
def test():
    """Benchmark: compare the same pair of network files 1000 times."""
    a = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    b = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    pairs = [(a, b)] * 1000
    compare_many(pairs)


if __name__ == '__main__':
    test()
# function test cost 6.323782682418823
这居然不是最快的。chunksize=1才是最快的。
import hashlib
from concurrent import futures
import os
import time
from itertools import chain
from functools import wraps
def timeit(func):
    """Decorator: measure and print the execution time of each call."""
    @wraps(func)
    def _wrapped(*args, **kwargs):
        started = time.time()
        out = func(*args, **kwargs)
        finished = time.time()
        print('function {} cost {}'.format(func.__name__, finished - started))
        return out
    return _wrapped
def md5(fname):
    """Return the hex MD5 digest of the file at path *fname*.

    Reads in fixed 4 KiB blocks so memory use stays constant.

    BUG FIX: the original built a lazy ``map`` object and discarded it,
    so ``update`` was never called and every path yielded the digest of
    the empty string.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def compare_many(cmp_list):
    """Hash every file in a list of (path_a, path_b) pairs with a process
    pool and print, per pair, whether the MD5 digests match.

    CLEANUP: removed the dead ``divmod(...)`` line — its result was never
    used (chunksize is hard-coded to 1 below) and it touched the private
    ``executor._max_workers`` attribute. Empty input is now a no-op.
    """
    flat = list(chain(*cmp_list))
    if not flat:
        return
    with futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        # chunksize=1 was measured fastest for this workload.
        res = list(executor.map(md5, flat, chunksize=1))
    for i in range(0, len(res), 2):
        print('A={}, B={}, A==B ? {}'.format(
            res[i], res[i + 1], res[i] == res[i + 1]))
@timeit
def test():
    """Benchmark: compare the same pair of network files 1000 times."""
    a = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\tmp\fr-FR.ULM.allflavor.arpa'
    b = r'\\ccpsofsep\am_s1\users\v-taotli\FRA\UnifiedModelBaselinePreparation\20171218_allflavor\fr-FR.ULM.allflavor.arpa.rm'
    pairs = [(a, b)] * 1000
    compare_many(pairs)


if __name__ == '__main__':
    test()
# function test cost 2.953723430633545
如果将数量变成 10000,线程依然是最快的。进程会很慢。创建1w个线程,虽然一开始很多,但这些任务是一次性的,所以不要紧。
我的测试是不加 chunksize,反而执行的更快。哪里错了?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
如果在写程序的时候由于某种原因产生了大量的Python进程,可以通过下面的命令关闭除本Python Shell进程之外的其他所有进程。