# lttzzlll/concurrent.py

## concurrent.py
import concurrent.futures
import urllib.request
import requests
import os

# Request headers passed to every requests.get call made by load_url; the
# User-Agent string identifies the client as Chrome 22 on Windows.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.1 (KHTML, like Gecko) "
        "Chrome/22.0.1207.1 Safari/537.1"
    ),
}


# Pages that the script downloads concurrently.
URLS = [
    'http://www.jianshu.com/',
    'http://www.163.com/',
    'http://www.baidu.com/',
    'http://www.qq.com/',
    'http://www.google.com/',
    'http://www.xiaomi.com',
    'http://taobao.com',
    'http://tmall.com',
    'http://jd.com',
    'http://apple.com',
    'http://bing.com',
]

# Retrieve a single page and report the URL and contents


def load_url(url, timeout=30):
    """Download ``url`` and return the page body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout value forwarded to ``requests.get``.

    Returns:
        The decoded page text when the server answers with status 200.
        In every other case an empty string is returned, after printing
        the URL together with the status code or the request error.
    """
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
    except requests.RequestException as exc:
        # Best effort: a failed download is reported rather than silently
        # swallowed, and the caller still gets '' as before.
        print(url, exc)
        return ''

    if response.status_code != 200:
        print(url, response.status_code)
        return ''

    # Decode using the encoding guessed from the body instead of the one
    # taken from the response headers.
    response.encoding = response.apparent_encoding
    return response.text


def save(path, text):
    """Write ``text`` to the file at ``path`` using UTF-8 encoding.

    The file is opened in ``'w'`` mode, so any existing content is replaced.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)


# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit one download per URL and remember which URL each future is for.
    future_to_url = {}
    for target in URLS:
        future_to_url[executor.submit(load_url, target, 60)] = target

    # Print each URL with the length of its downloaded text as the
    # downloads finish.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        body = future.result()
        print(url, len(body))


    # for url, txt in zip(URLS, executor.map(load_url, URLS)):
    #     print(url, len(txt))

    #     try:
    #         data = future.result()
    #     except Exception as exc:
    #         print('%r generated an exception: %s' % (url, exc))
    #     else:
    #         print('%r page is %d length' % (url, len(data)))
    #         save(os.path.join(r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp',
    #                           url.split('.')[1] + '.html'), data)
	import concurrent.futures
	import urllib.request
	import requests
	import os

	# Request headers passed to every requests.get call made by load_url; the
	# User-Agent string identifies the client as Chrome 22 on Windows.
	headers = {
		'User-Agent': (
			"Mozilla/5.0 (Windows NT 6.1; WOW64) "
			"AppleWebKit/537.1 (KHTML, like Gecko) "
			"Chrome/22.0.1207.1 Safari/537.1"
		),
	}


	# Pages that the script downloads concurrently.
	URLS = [
		'http://www.jianshu.com/',
		'http://www.163.com/',
		'http://www.baidu.com/',
		'http://www.qq.com/',
		'http://www.google.com/',
		'http://www.xiaomi.com',
		'http://taobao.com',
		'http://tmall.com',
		'http://jd.com',
		'http://apple.com',
		'http://bing.com',
	]

	# Retrieve a single page and report the URL and contents


	def load_url(url, timeout=30):
		"""Download ``url`` and return the page body as text.

		Args:
			url: Address of the page to fetch.
			timeout: Timeout value forwarded to ``requests.get``.

		Returns:
			The decoded page text when the server answers with status 200.
			In every other case an empty string is returned, after printing
			the URL together with the status code or the request error.
		"""
		try:
			response = requests.get(url, timeout=timeout, headers=headers)
		except requests.RequestException as exc:
			# Best effort: a failed download is reported rather than
			# silently swallowed, and the caller still gets ''.
			print(url, exc)
			return ''

		if response.status_code != 200:
			print(url, response.status_code)
			return ''

		# Decode using the encoding guessed from the body instead of the
		# one taken from the response headers.
		response.encoding = response.apparent_encoding
		return response.text


	def save(path, text):
		"""Write ``text`` to the file at ``path`` using UTF-8 encoding.

		The file is opened in ``'w'`` mode, so any existing content is
		replaced.
		"""
		with open(path, 'w', encoding='utf-8') as f:
			f.write(text)


	# We can use a with statement to ensure threads are cleaned up promptly
	with concurrent.futures.ThreadPoolExecutor() as executor:
		# Start the load operations and mark each future with its URL
		future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
		# Print each URL with the length of its downloaded text as the
		# downloads finish.
		for future in concurrent.futures.as_completed(future_to_url):
			url = future_to_url[future]
			print(url, len(future.result()))


	# for url, txt in zip(URLS, executor.map(load_url, URLS)):
	# print(url, len(txt))

	# try:
	# data = future.result()
	# except Exception as exc:
	# print('%r generated an exception: %s' % (url, exc))
	# else:
	# print('%r page is %d length' % (url, len(data)))
	# save(os.path.join(r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp',
	# url.split('.')[1] + '.html'), data)