parser
# -*- coding: utf-8 -*-
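# Scraper for wenda.so.com (360 Q&A): starting from a category rank page it
# collects user profile links, then downloads every answer each user has
# posted. Each user's answers are written to n360/<user>.txt, and finished
# users are recorded in users.txt so they are skipped on later runs.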
from multiprocessing.dummy import Pool as ThreadPool
from lxml import html
import requests
import string
THREADS=10
punctuation = '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
users = []
# Load users that were already scraped in previous runs; new users are
# appended to this file as they are finished.
userf = open('users.txt', 'r+')
for line in userf.readlines():
    users.append(line.rstrip())
def scrapeQA(url):
    """Fetch one question page and return its answer nodes (accepted + others)."""
    global headers
    try:
        qa_page = requests.get(url, headers=headers)
        qa_tree = html.fromstring(qa_page.content)
        # Both the resolved (accepted) answer and any other answers.
        content = qa_tree.xpath('//div[@class="resolved-cnt"] | //div[@class="other-ans-cnt"]')
        return content
    except Exception:
        return []
def scrapeUser(userN):
    """Scrape every answer posted by one user into n360/<user>.txt."""
    i = 0
    cnt = 0
    pre_url = 'http://wenda.so.com'
    userN = userN.split('/')[-1]
    pre_list_url = pre_url + '/u/an/' + userN + '?pn='
    try:
        with open('n360/' + userN.translate(None, string.punctuation) + '.txt', 'w') as f:
            # Page through the user's answer list until an empty page is returned.
            while True:
                list_url = pre_list_url + str(i)
                list_page = requests.get(list_url)
                list_tree = html.fromstring(list_page.content)
                items = list_tree.xpath('//li[@class="item" and @ans_id]/a')
                answers = []
                if len(items) == 0:
                    break
                for item_a in items:
                    qurl = pre_url + item_a.get('href')
                    answers = answers + scrapeQA(qurl)
                for ans in answers:
                    # One answer per output line; flatten embedded newlines.
                    f.write(ans.text_content().encode('utf-8').replace('\n', ' ') + '\n')
                cnt += len(items)
                print 'ready for a list, now %d items from user %s' % (cnt, userN)
                i += 1
    except Exception:
        return
def scrapeCate(url):
    """Walk the category rank pages and scrape newly seen users, THREADS at a time."""
    global users
    cnt = 0
    i = 1
    usrs = []
    while True:
        tar_url = url + str(i)
        page = requests.get(tar_url)
        tree = html.fromstring(page.content)
        people = tree.xpath('//span[@class="name"]/a')
        if len(people) == 0:
            break
        for a in people:
            usr = a.get('href')
            if usr in users:
                continue  # already scraped on a previous run
            usrs.append(usr)
            if len(usrs) == THREADS:
                # Scrape the collected batch of users concurrently.
                pool = ThreadPool(processes=THREADS)
                pool.map(scrapeUser, usrs)
                pool.close()
                pool.join()
                #scrapeUser(usr)
                users += usrs
                # Record the finished users so later runs skip them.
                for usr in usrs:
                    userf.write(usr + '\n')
                cnt += THREADS
                print 'ready for %d users' % cnt
                usrs = []
        i += 1

scrapeCate('http://wenda.so.com/rank?type=5&cid=-1&pn=')
userf.close()
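To run this (assuming the file is saved as parser.py, which the gist does not state), a Python 2 interpreter with requests and lxml installed is needed; users.txt must already exist (it is opened with 'r+'), and an n360/ directory must be present in the working directory for the per-user output files.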