Last active
October 21, 2016 03:22
-
-
Save SymeonChen/8fd61b895bdf0e1e471802fe054e123a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pprint
import re
from collections import Counter

import requests
from lxml import etree
# HTTP session used only by the (currently disabled) download step below.
s = requests.Session()

# Disabled one-off download step: fetches pages 1..11 of the thread and stores
# each one as page<N>.html, which is what the parsing code reads from disk.
# for num in range(1, 12):
#     url = 'https://www.v2ex.com/t/313225?p=' + str(num)
#     r = s.get(url)
#     with open('page' + str(num) + '.html', 'w') as f:
#         f.write(r.text)
#     print('Download page', num, 'successful!')
# Walk the locally saved thread pages (page1.html .. page11.html) and collect
# at most one numeric guess per user.
name_list = []  # users whose guess has already been recorded
num_list = []   # accepted guesses (ints, 0..100), one per user

# Compiled once instead of on every reply.
num_re = re.compile(r'[0-9]+')

for num in range(1, 12):
    with open('page' + str(num) + '.html', 'rb') as f:
        r = f.read()
    page = etree.HTML(r)
    user_name = page.xpath(u'//*[@id="Main"]/div[4]//table/tr/td[3]/strong/a')
    user_comment = page.xpath(u'//*[@id="Main"]/div[4]//table/tr/td[3]/div[4]')
    user_floor = page.xpath(u'//*[@id="Main"]/div[4]//table/tr/td[3]/div[1]/span')

    for name, comment, floor in zip(user_name, user_comment, user_floor):
        # .text is None when the element has no leading text; treat that the
        # same as a reply without any number instead of raising TypeError.
        found = num_re.findall(comment.text or '')
        if not found:
            # This reply contains no number, so it is not a guess.
            continue

        # Only the first number in the reply is considered.
        comment_num = int(found[0])

        # Ignore useless answers like 999, 66666666, 233333.
        if comment_num > 100:
            continue

        # Count only the first valid guess of each user. The original check
        # ended in `pass`, so repeated guesses were silently counted again.
        if name.text in name_list:
            continue

        name_list.append(name.text)
        num_list.append(comment_num)

        # Report who guessed 84 and on which floor.
        if comment_num == 84:
            print(name.text)
            print(floor.text)
# How many times each number was guessed.
result = Counter(num_list)
pprint.pprint(result)

# Tally for every possible guess in numeric order, zeros included.
# Guesses up to and including 100 are accepted by the filter above, so the
# table needs 101 slots (0..100); with only 100 slots a guess of exactly 100
# raised IndexError.
max_guess = 100
guess_counts = result
# Counter returns 0 for numbers nobody guessed, without inserting them.
result = [guess_counts[n] for n in range(0, max_guess + 1)]
pprint.pprint(dict(zip(range(0, max_guess + 1), result)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment