# lttzzlll/crawl_jianshu.py

## crawl_jianshu.py
import concurrent.futures
import urllib.request
import requests
import os

# Request headers sent with every download; the User-Agent string mimics a
# desktop Chrome browser.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
}

# Home pages the script downloads.
URLS = [
    'http://www.jianshu.com/',
    'http://www.163.com/',
    'http://www.baidu.com/',
    'http://www.qq.com/',
    'http://www.google.com/',
]

# Retrieve a single page and report the URL and contents


def load_url(url, timeout):
    """Download *url* and return its decoded body text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout value passed through to ``requests.get``.

    Returns:
        The response text when the server answers with status 200,
        otherwise an empty string. Non-200 statuses and request failures
        are printed rather than raised.
    """
    try:
        resp = requests.get(url, timeout=timeout, headers=headers)
    except requests.RequestException as exc:
        # Keep the download best-effort, but report the failure instead of
        # silently discarding it. Catching only the requests base exception
        # lets KeyboardInterrupt/SystemExit and programming errors surface.
        print(url, exc)
        return ''

    if resp.status_code != 200:
        print(url, resp.status_code)
        return ''

    # Decode using the encoding requests detects from the body content.
    resp.encoding = resp.apparent_encoding
    return resp.text


def save(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file.

    Args:
        path: Destination file path. Missing parent directories are created.
        text: String content to write.
    """
    parent = os.path.dirname(path)
    if parent:
        # open() does not create directories, so make sure the target exists.
        os.makedirs(parent, exist_ok=True)
    # newline='' writes the text's own line endings untranslated. The default
    # text mode rewrites each "\n" as os.linesep, which on Windows turns a
    # CRLF already present in a downloaded page into CR CR LF.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.write(text)


# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Queue every download up front, remembering which URL each future
    # belongs to so results can be labelled as they arrive.
    future_to_url = {}
    for url in URLS:
        future_to_url[executor.submit(load_url, url, 60)] = url

    out_dir = r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp'

    # Handle results in completion order rather than submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue

        print('%r page is %d length' % (url, len(data)))
        # The file name is the second dot-separated piece of the URL,
        # e.g. 'jianshu' for 'http://www.jianshu.com/'.
        file_name = url.split('.')[1] + '.html'
        save(os.path.join(out_dir, file_name), data)
	import concurrent.futures
	import urllib.request
	import requests
	import os

	# Request headers sent with every download; the User-Agent string mimics a
	# desktop Chrome browser.
	headers = {
		'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
	}

	# Home pages the script downloads.
	URLS = [
		'http://www.jianshu.com/',
		'http://www.163.com/',
		'http://www.baidu.com/',
		'http://www.qq.com/',
		'http://www.google.com/',
	]

	# Retrieve a single page and report the URL and contents


	def load_url(url, timeout):
		"""Download *url* and return its decoded body text.

		Args:
			url: Address of the page to fetch.
			timeout: Timeout value passed through to ``requests.get``.

		Returns:
			The response text when the server answers with status 200,
			otherwise an empty string. Non-200 statuses and request failures
			are printed rather than raised.
		"""
		try:
			resp = requests.get(url, timeout=timeout, headers=headers)
		except requests.RequestException as exc:
			# Keep the download best-effort, but report the failure instead
			# of silently discarding it. Catching only the requests base
			# exception lets KeyboardInterrupt/SystemExit and programming
			# errors surface.
			print(url, exc)
			return ''

		if resp.status_code != 200:
			print(url, resp.status_code)
			return ''

		# Decode using the encoding requests detects from the body content.
		resp.encoding = resp.apparent_encoding
		return resp.text


	def save(path, text):
		"""Write *text* to *path* as UTF-8, replacing any existing file.

		Args:
			path: Destination file path. Missing parent directories are created.
			text: String content to write.
		"""
		parent = os.path.dirname(path)
		if parent:
			# open() does not create directories, so make sure the target exists.
			os.makedirs(parent, exist_ok=True)
		# newline='' writes the text's own line endings untranslated. The
		# default text mode rewrites each "\n" as os.linesep, which on Windows
		# turns a CRLF already present in a downloaded page into CR CR LF.
		with open(path, 'w', encoding='utf-8', newline='') as f:
			f.write(text)


	# We can use a with statement to ensure threads are cleaned up promptly
	with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
		# Start the load operations and mark each future with its URL
		future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
		# Handle results in completion order rather than submission order.
		for future in concurrent.futures.as_completed(future_to_url):
			url = future_to_url[future]
			try:
				data = future.result()
			except Exception as exc:
				print('%r generated an exception: %s' % (url, exc))
			else:
				print('%r page is %d length' % (url, len(data)))
				# The file name is the second dot-separated piece of the
				# URL, e.g. 'jianshu' for 'http://www.jianshu.com/'.
				save(os.path.join(r'\\ccpsofsep\am_s1\users\v-taotli\LearnPython\tmp',
								  url.split('.')[1] + '.html'), data)