Skip to content

Instantly share code, notes, and snippets.

@WyattJia
Created December 8, 2017 11:53
Show Gist options
  • Save WyattJia/39f6ce8b811c253fd288f1ad0d7cbdad to your computer and use it in GitHub Desktop.
Save WyattJia/39f6ce8b811c253fd288f1ad0d7cbdad to your computer and use it in GitHub Desktop.
run in executor example
import re
from asyncio import (
    FIRST_COMPLETED,
    ensure_future,
    get_event_loop,
    new_event_loop,
    sleep,
    wait,
)
from collections import deque
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
# Browser-like request headers sent with every page fetch.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/55.0.2883.95 Safari/537.36',
}

# Matches runs of ASCII characters; job() joins the matches to drop everything else.
ascii_re = re.compile('[\u0000-\u007F]+')
# Matches runs of whitespace. Written as a raw string because '\s' in a plain
# literal is an invalid escape sequence.
space_re = re.compile(r'\s+')

# First URL handed to the crawler.
START_URL = 'http://www.qq.com/'

# Number of pages written so far; job() also uses it to name the next output file.
counter = 0
# De-duplication keys of the URLs that have already been queued.
done = {START_URL}
# Crawl frontier. Because maxlen is set, appending to a full deque silently
# discards the entry at the opposite (oldest) end.
q = deque([START_URL], maxlen=10000)

# Create the loop explicitly: calling get_event_loop() when no loop is running
# is deprecated and, per the asyncio documentation, raises RuntimeError on
# Python 3.14 and later.
loop = new_event_loop()
# Worker threads that run the blocking `requests` calls off the event loop.
thread_pool = ThreadPoolExecutor(30)
async def get_page(url):
    """Download ``url`` on the shared thread pool and return the response text.

    The blocking ``requests.get`` call (sent with ``HEADERS`` and
    ``timeout=5``) runs in ``thread_pool`` so the event loop is not blocked
    while waiting on the network.

    Returns:
        The response body as text, or ``""`` if the request failed. A
        ``requests.RequestException`` is swallowed silently; any other
        exception is printed before ``""`` is returned.
    """
    def fetch():
        try:
            response = requests.get(url, headers=HEADERS, timeout=5)
            return response.text
        except requests.RequestException:
            # Network-level failures are expected while crawling; stay quiet.
            return ""
        except Exception as err:
            print(err)
            return ""

    body = await loop.run_in_executor(thread_pool, fetch)
    return body
async def job():
    """Crawl one queued URL: fetch it, save its ASCII text, and queue its links.

    Takes the next URL from ``q`` and downloads it with ``get_page``. Returns
    early if the queue is empty or the download produced no text. Otherwise
    the page's ASCII characters, with all whitespace removed, are written to
    ``dat/<counter>.txt`` as ``<url>::<text>`` (text capped at 60000
    characters), ``counter`` is incremented, and up to 40 not-yet-seen
    ``http://`` / ``https://`` links from the page are appended to ``q``.

    The ``dat`` directory must already exist; ``open`` does not create it.

    Any exception raised while processing the page is printed rather than
    propagated, so a bad page never raises into the caller.
    """
    global counter
    # main() can schedule several jobs before any of them runs, so the queue
    # may already be drained here; return quietly instead of letting
    # popleft() raise IndexError.
    if not q:
        return
    try:
        url = q.popleft()
        text = await get_page(url)
        if text == '':
            return
        res = ''.join(ascii_re.findall(text))
        res = space_re.sub('', res)
        # `res` is pure ASCII, but the URL comes from an href and may not be.
        # Escape such characters instead of raising UnicodeEncodeError after
        # the file has already been truncated.
        with open(f'dat/{counter}.txt', 'w', encoding='ascii',
                  errors='backslashreplace') as f:
            f.write(f'{url}::{res[:60000]}')
        counter += 1
        soup = BeautifulSoup(text, 'html.parser')
        queued = 0
        for link in soup.find_all('a'):
            if queued >= 40:
                break
            href = link.get('href')
            if href is None or not href.startswith(('http://', 'https://')):
                continue
            # Links are de-duplicated on their first 30 characters only
            # (presumably to limit how many pages are taken per site -- TODO confirm).
            key = href[:30]
            if key in done:
                continue
            queued += 1
            done.add(key)
            q.append(href)
        print(f'OK {counter}th')
    except Exception as e:
        print(e)
async def main(max_pages=2_000_000, max_concurrency=30):
    """Drive the crawl until the page limit is hit or no work remains.

    Runs one ``job()`` to completion first so the queue is populated from the
    seed page, then keeps up to ``max_concurrency`` jobs in flight.

    Args:
        max_pages: Stop starting new jobs once ``counter`` reaches this value.
            Jobs already in flight are allowed to finish, so the final count
            can exceed it slightly.
        max_concurrency: Maximum number of ``job()`` tasks running at once.
    """
    # Crawl the seed URL first so there are links to fan out over.
    await job()

    pending = set()
    while counter < max_pages:
        # Start at most one job per queued URL. Each job pops its URL before
        # its first await, so len(q) is accurate again by the time control
        # returns here and no job is started without a URL to take.
        free_slots = max_concurrency - len(pending)
        for _ in range(min(free_slots, len(q))):
            pending.add(ensure_future(job()))

        # An empty queue only ends the crawl when nothing is in flight:
        # running jobs may still append new links.
        if not pending:
            break

        # Sleep until at least one job finishes instead of polling on a timer.
        _, pending = await wait(pending, return_when=FIRST_COMPLETED)

    # Let in-flight jobs finish so no task is abandoned when the loop stops.
    if pending:
        await wait(pending)
# Run the crawl to completion, then release the worker threads and the event
# loop even if the crawl is interrupted or raises.
try:
    loop.run_until_complete(main())
finally:
    # Shut the pool down first: a worker finishing after the loop is closed
    # could not hand its result back to it.
    thread_pool.shutdown()
    loop.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment