Skip to content

Instantly share code, notes, and snippets.

@swshan
Created October 21, 2015 08:57
Show Gist options
  • Save swshan/4b7562d09a64da59e1a3 to your computer and use it in GitHub Desktop.
Save swshan/4b7562d09a64da59e1a3 to your computer and use it in GitHub Desktop.
#-*- coding:utf-8 -*-
import sys
import re
import requests
import gevent
from bs4 import BeautifulSoup
import urlparse
import time
import json
reload(sys) # reload() is required before sys.setdefaultencoding becomes callable (Python 2 hack)
sys.setdefaultencoding('utf-8') # force the process-wide default encoding to utf-8
# Browser-like User-Agent header so the target site serves normal pages.
# NOTE: the original had a module-level ``global header_info`` statement,
# which is a no-op at module scope and has been removed.
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
}
# root_url = 'http://wap.douban.com'
def crawler(list_url):
''' '''
lele_url = "http://m.leleketang.com/lib/%s.shtml" % list_url
r = requests.get(lele_url, timeout = 0.2, headers=header_info)
print lele_url
print "http status_code is " + str(r.status_code)
''' soup extract html '''
soup = BeautifulSoup(r.text, 'html.parser',from_encoding="utf-8" )
question = soup.findAll('div', attrs={'class':"uc_q"})
print "beautifulsoup " + str(soup.original_encoding)
raw_answer = soup.findAll('li', attrs={'class':" ucqo_g_solution"})
''' dict str convert '''
question = str(question)
raw_answer = str(raw_answer)
print
#print extract_2
print
""" to dict """
newdict = {"Question": question,
"Answer": raw_answer,}
print type(newdict)
print newdict
'''
repr(extract_2)
json_data_2 = json.dumps(extract_2)
print json_data_2
'''
try:
with open("%s.json" % list_url, "wb") as f:
json.dump(newdict, f, indent=4, encoding="UTF-8", ensure_ascii=False)
except IOError:
print("Oops, file error...")
def do(counts):
    """Greenlet entry point: crawl a single hard-coded page.

    NOTE(review): the ``counts`` argument is currently ignored -- the
    caller passes the string "counts" and a single hard-coded page id is
    crawled.  The parameter is kept so the existing caller still works.
    """
    list_url = 493174
    crawler(list_url)
# Run the crawl inside a gevent greenlet and block until it finishes.
job = gevent.spawn(do, "counts")
gevent.joinall([job])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment