Skip to content

Instantly share code, notes, and snippets.

@linw1995
Created July 18, 2019 13:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linw1995/34b1777d81a47f519606502cb88fef66 to your computer and use it in GitHub Desktop.
Save linw1995/34b1777d81a47f519606502cb88fef66 to your computer and use it in GitHub Desktop.
用多线程加速爬虫做 lxml 解析 | 源码
import asyncio
import time
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from lxml import html
# HTML text returned by every fetch_text() call; the __main__ block fills it
# in from example.html before the event loop starts.
_html_text = None
# time.perf_counter() reading stored by the most recent fetch_text() call
# once its simulated request has finished.
_latest_fetched = 0
async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps 10 ms on the event loop in place of real network I/O, then
    stores the completion time in the module-level ``_latest_fetched``.
    """
    global _latest_fetched

    # Simulated crawl.
    await asyncio.sleep(0.01)

    # Record when the most recent request finished.
    finished_at = time.perf_counter()
    _latest_fetched = finished_at
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Indexing the XPath result with ``[0]`` means a document without any
    title text raises ``IndexError``.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task(loop, executor):
    """Fetch one page, parse it in *executor*, and return its title.

    *loop* is the event loop whose ``run_in_executor`` schedules the parse;
    *executor* is the pool that runs ``get_title`` (a process pool in this
    script's ``main``).
    """
    page = await fetch_text()
    # Hand the parsing job over to the process pool.
    return await loop.run_in_executor(executor, get_title, page)
async def main():
    """Run 2000 fetch-and-parse tasks, parsing in a process pool.

    All tasks are scheduled up front and awaited together. The pool is
    managed by a ``with`` block so its worker processes are shut down when
    the tasks finish or when one of them raises.
    """
    # Inside a coroutine the running loop is the one we want.
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as executor:
        tasks = [
            asyncio.ensure_future(create_task(loop, executor))
            for _ in range(2000)
        ]
        await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page before the clock starts so that file I/O is not
    # part of the measurement.
    # NOTE(review): read_text() is called without an explicit encoding.
    _html_text = Path("example.html").read_text()

    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Total wall time, then the offset at which the last simulated request
    # completed.
    elapsed_total = t_end - t_start
    elapsed_fetch = _latest_fetched - t_start
    print("total:", elapsed_total)
    print("latest fetched:", elapsed_fetch)
import asyncio
import time
from pathlib import Path
from lxml import html
# HTML text returned by every fetch_text() call; the __main__ block fills it
# in from example.html before the event loop starts.
_html_text = None
# time.perf_counter() reading stored by the most recent fetch_text() call
# once its simulated request has finished.
_latest_fetched = 0
async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps 10 ms on the event loop in place of real network I/O, then
    stores the completion time in the module-level ``_latest_fetched``.
    """
    global _latest_fetched

    # Simulated crawl.
    await asyncio.sleep(0.01)

    # Record when the most recent request finished.
    finished_at = time.perf_counter()
    _latest_fetched = finished_at
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Indexing the XPath result with ``[0]`` means a document without any
    title text raises ``IndexError``.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task(loop):
    """Fetch one page, parse it off the event loop, and return its title.

    Passing ``None`` as the executor makes *loop* use its default executor.
    """
    page = await fetch_text()
    # Hand the parsing job over to the thread pool.
    return await loop.run_in_executor(None, get_title, page)
async def main():
    """Run 2000 fetch-and-parse tasks, parsing in the default executor.

    All tasks are scheduled up front and awaited together; their results
    are discarded because only the timing matters to this script.
    """
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine.
    loop = asyncio.get_running_loop()
    # loop.set_default_executor can be used to configure the default
    # thread pool.
    tasks = [
        asyncio.ensure_future(create_task(loop))
        for _ in range(2000)
    ]
    await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page before the clock starts so that file I/O is not
    # part of the measurement.
    # NOTE(review): read_text() is called without an explicit encoding.
    _html_text = Path("example.html").read_text()

    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Total wall time, then the offset at which the last simulated request
    # completed.
    elapsed_total = t_end - t_start
    elapsed_fetch = _latest_fetched - t_start
    print("total:", elapsed_total)
    print("latest fetched:", elapsed_fetch)
import asyncio
import threading
import time
from pathlib import Path
from lxml import html
# HTML text returned by every fetch_text() call; the __main__ block fills it
# in from example.html before the event loop starts.
_html_text = None
# time.perf_counter() reading stored by the most recent fetch_text() call
# once its simulated request has finished.
_latest_fetched = 0
# Thread-local storage; get_title() keeps one HTMLParser per thread on it.
_local = threading.local()
async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps 10 ms on the event loop in place of real network I/O, then
    stores the completion time in the module-level ``_latest_fetched``.
    """
    global _latest_fetched

    # Simulated crawl.
    await asyncio.sleep(0.01)

    # Record when the most recent request finished.
    finished_at = time.perf_counter()
    _latest_fetched = finished_at
    return _html_text
def get_title(text):
    """Parse *text* with this thread's own parser and return its title.

    A dedicated ``html.HTMLParser`` is created the first time each thread
    calls this function and is cached on the thread-local ``_local``.
    Indexing the XPath result with ``[0]`` means a document without any
    title text raises ``IndexError``.
    """
    # Each thread creates a parser of its own; per the original author's
    # note, this lets the thread release the GIL while parsing.
    try:
        parser = _local.parser
    except AttributeError:
        parser = _local.parser = html.HTMLParser()

    tree = html.fromstring(text, parser=parser)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task(loop):
    """Fetch one page, parse it off the event loop, and return its title.

    Passing ``None`` as the executor makes *loop* use its default executor.
    """
    page = await fetch_text()
    # Hand the parsing job over to the thread pool.
    return await loop.run_in_executor(None, get_title, page)
async def main():
    """Run 2000 fetch-and-parse tasks, parsing in the default executor.

    All tasks are scheduled up front and awaited together; their results
    are discarded because only the timing matters to this script.
    """
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine.
    loop = asyncio.get_running_loop()
    # loop.set_default_executor can be used to configure the default
    # thread pool.
    tasks = [
        asyncio.ensure_future(create_task(loop))
        for _ in range(2000)
    ]
    await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page before the clock starts so that file I/O is not
    # part of the measurement.
    # NOTE(review): read_text() is called without an explicit encoding.
    _html_text = Path("example.html").read_text()

    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Total wall time, then the offset at which the last simulated request
    # completed.
    elapsed_total = t_end - t_start
    elapsed_fetch = _latest_fetched - t_start
    print("total:", elapsed_total)
    print("latest fetched:", elapsed_fetch)
import asyncio
import time
from pathlib import Path
from lxml import html
# HTML text returned by every fetch_text() call; the __main__ block fills it
# in from example.html before the event loop starts.
_html_text = None
# time.perf_counter() reading stored by the most recent fetch_text() call
# once its simulated request has finished.
_latest_fetched = 0
async def fetch_text():
    """Simulate one crawl request and return the shared HTML text.

    Sleeps 10 ms on the event loop in place of real network I/O, then
    stores the completion time in the module-level ``_latest_fetched``.
    """
    global _latest_fetched

    # Simulated crawl.
    await asyncio.sleep(0.01)

    # Record when the most recent request finished.
    finished_at = time.perf_counter()
    _latest_fetched = finished_at
    return _html_text
def get_title(text):
    """Parse *text* as HTML and return the first ``<title>`` text node.

    Indexing the XPath result with ``[0]`` means a document without any
    title text raises ``IndexError``.
    """
    tree = html.fromstring(text)
    title_nodes = tree.xpath("//title/text()")
    return title_nodes[0]
async def create_task():
    """Fetch one page and return its title, parsing inline.

    ``get_title`` is called directly rather than through an executor, so
    the parse runs synchronously inside this coroutine.
    """
    page = await fetch_text()
    return get_title(page)
async def main():
    """Schedule 2000 fetch-and-parse tasks and wait for all of them.

    The gathered results are discarded; only the timing matters to this
    script.
    """
    tasks = [asyncio.ensure_future(create_task()) for _ in range(2000)]
    await asyncio.gather(*tasks)
if __name__ == "__main__":
    # Load the sample page before the clock starts so that file I/O is not
    # part of the measurement.
    # NOTE(review): read_text() is called without an explicit encoding.
    _html_text = Path("example.html").read_text()

    t_start = time.perf_counter()
    asyncio.run(main())
    t_end = time.perf_counter()

    # Total wall time, then the offset at which the last simulated request
    # completed.
    elapsed_total = t_end - t_start
    elapsed_fetch = _latest_fetched - t_start
    print("total:", elapsed_total)
    print("latest fetched:", elapsed_fetch)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment