Created
July 18, 2019 13:07
-
-
Save linw1995/34b1777d81a47f519606502cb88fef66 to your computer and use it in GitHub Desktop.
用多线程加速爬虫做 lxml 解析 | 源码
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import time | |
from concurrent.futures import ProcessPoolExecutor | |
from pathlib import Path | |
from lxml import html | |
# Shared benchmark state: the HTML payload returned by fetch_text() (assigned
# by the script entry point) and the perf_counter() reading taken when the
# most recent simulated request finished.
_html_text = None
_latest_fetched = 0


async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps for 10 ms in place of real network I/O, then stores the current
    ``time.perf_counter()`` reading in the module-level ``_latest_fetched``
    so the finish time of the last request can be reported afterwards.
    """
    global _latest_fetched

    # Simulated fetch.
    await asyncio.sleep(0.01)
    # Remember when the most recent request ended.
    _latest_fetched = time.perf_counter()
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Raises ``IndexError`` when the XPath query yields no result.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task(loop, executor):
    """Fetch one page, then extract its title inside *executor*.

    Args:
        loop: event loop used to schedule the parsing call.
        executor: executor passed to ``loop.run_in_executor``.

    Returns:
        Whatever ``get_title`` returns for the fetched text.
    """
    page = await fetch_text()
    # Hand the parsing work over to the executor (a process pool in this
    # script) so it does not run on the event loop thread.
    parse_job = loop.run_in_executor(executor, get_title, page)
    return await parse_job
async def main():
    """Run 2000 fetch-and-parse tasks concurrently, parsing in a process pool.

    The gathered titles are discarded; the coroutine returns ``None``.
    """
    # Inside a coroutine the running loop is the one we want.
    loop = asyncio.get_running_loop()
    # The context manager shuts the pool down once every task has finished,
    # so worker processes are not left behind until interpreter exit.
    with ProcessPoolExecutor() as executor:
        tasks = [
            asyncio.ensure_future(create_task(loop, executor))
            for _ in range(2000)
        ]
        await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page once; every simulated fetch returns this text.
    _html_text = Path("example.html").read_text()

    # Time the complete run of the event loop.
    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Report total wall time and when the last simulated fetch finished,
    # both measured from t_start.
    print("total:", t_end - t_start)
    print("latest fetched:", _latest_fetched - t_start)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import time | |
from pathlib import Path | |
from lxml import html | |
# Shared benchmark state: the HTML payload returned by fetch_text() (assigned
# by the script entry point) and the perf_counter() reading taken when the
# most recent simulated request finished.
_html_text = None
_latest_fetched = 0


async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps for 10 ms in place of real network I/O, then stores the current
    ``time.perf_counter()`` reading in the module-level ``_latest_fetched``
    so the finish time of the last request can be reported afterwards.
    """
    global _latest_fetched

    # Simulated fetch.
    await asyncio.sleep(0.01)
    # Remember when the most recent request ended.
    _latest_fetched = time.perf_counter()
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Raises ``IndexError`` when the XPath query yields no result.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task(loop):
    """Fetch one page, then extract its title in the loop's default executor.

    Args:
        loop: event loop used to schedule the parsing call.

    Returns:
        Whatever ``get_title`` returns for the fetched text.
    """
    page = await fetch_text()
    # Passing None as the executor hands the parsing work to the loop's
    # default executor (the thread pool).
    parse_job = loop.run_in_executor(None, get_title, page)
    return await parse_job
async def main():
    """Run 2000 fetch-and-parse tasks concurrently on the default executor.

    The gathered titles are discarded; the coroutine returns ``None``.
    """
    # Inside a coroutine the running loop is the one we want.
    loop = asyncio.get_running_loop()
    # loop.set_default_executor can be used to configure the default
    # thread pool.
    tasks = [asyncio.ensure_future(create_task(loop)) for _ in range(2000)]
    await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page once; every simulated fetch returns this text.
    _html_text = Path("example.html").read_text()

    # Time the complete run of the event loop.
    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Report total wall time and when the last simulated fetch finished,
    # both measured from t_start.
    print("total:", t_end - t_start)
    print("latest fetched:", _latest_fetched - t_start)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import threading | |
import time | |
from pathlib import Path | |
from lxml import html | |
# Shared benchmark state: the HTML payload returned by fetch_text() (assigned
# by the script entry point), the perf_counter() reading taken when the most
# recent simulated request finished, and thread-local storage used by
# get_title() to keep one parser per thread.
_html_text = None
_latest_fetched = 0
_local = threading.local()


async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps for 10 ms in place of real network I/O, then stores the current
    ``time.perf_counter()`` reading in the module-level ``_latest_fetched``
    so the finish time of the last request can be reported afterwards.
    """
    global _latest_fetched

    # Simulated fetch.
    await asyncio.sleep(0.01)
    # Remember when the most recent request ended.
    _latest_fetched = time.perf_counter()
    return _html_text
def get_title(text):
    """Parse *text* with a per-thread parser and return the first title text.

    Raises ``IndexError`` when the XPath query yields no result.
    """
    # Lazily create one HTMLParser per thread and cache it in thread-local
    # storage. Author's note: with a parser of its own, a thread releases
    # the GIL while parsing.
    try:
        parser = _local.parser
    except AttributeError:
        parser = _local.parser = html.HTMLParser()
    tree = html.fromstring(text, parser=parser)
    return tree.xpath("//title/text()")[0]
async def create_task(loop):
    """Fetch one page, then extract its title in the loop's default executor.

    Args:
        loop: event loop used to schedule the parsing call.

    Returns:
        Whatever ``get_title`` returns for the fetched text.
    """
    page = await fetch_text()
    # Passing None as the executor hands the parsing work to the loop's
    # default executor (the thread pool).
    parse_job = loop.run_in_executor(None, get_title, page)
    return await parse_job
async def main():
    """Run 2000 fetch-and-parse tasks concurrently on the default executor.

    The gathered titles are discarded; the coroutine returns ``None``.
    """
    # Inside a coroutine the running loop is the one we want.
    loop = asyncio.get_running_loop()
    # loop.set_default_executor can be used to configure the default
    # thread pool.
    tasks = [asyncio.ensure_future(create_task(loop)) for _ in range(2000)]
    await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page once; every simulated fetch returns this text.
    _html_text = Path("example.html").read_text()

    # Time the complete run of the event loop.
    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Report total wall time and when the last simulated fetch finished,
    # both measured from t_start.
    print("total:", t_end - t_start)
    print("latest fetched:", _latest_fetched - t_start)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import time | |
from pathlib import Path | |
from lxml import html | |
# Shared benchmark state: the HTML payload returned by fetch_text() (assigned
# by the script entry point) and the perf_counter() reading taken when the
# most recent simulated request finished.
_html_text = None
_latest_fetched = 0


async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps for 10 ms in place of real network I/O, then stores the current
    ``time.perf_counter()`` reading in the module-level ``_latest_fetched``
    so the finish time of the last request can be reported afterwards.
    """
    global _latest_fetched

    # Simulated fetch.
    await asyncio.sleep(0.01)
    # Remember when the most recent request ended.
    _latest_fetched = time.perf_counter()
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Raises ``IndexError`` when the XPath query yields no result.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task():
    """Fetch one page and extract its title directly in the coroutine.

    The parsing call is made inline, with no executor involved.
    """
    return get_title(await fetch_text())
async def main():
    """Schedule 2000 fetch-and-parse tasks and wait for all of them.

    The gathered titles are discarded; the coroutine returns ``None``.
    """
    pending = [asyncio.ensure_future(create_task()) for _ in range(2000)]
    await asyncio.gather(*pending)
if __name__ == "__main__":
    # Load the sample page once; every simulated fetch returns this text.
    _html_text = Path("example.html").read_text()

    # Time the complete run of the event loop.
    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Report total wall time and when the last simulated fetch finished,
    # both measured from t_start.
    print("total:", t_end - t_start)
    print("latest fetched:", _latest_fetched - t_start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment