Skip to content

Instantly share code, notes, and snippets.

@582033
Created July 6, 2016 07:32
Show Gist options
  • Save 582033/22bf2693b1125e754f4cd84663cda178 to your computer and use it in GitHub Desktop.
Save 582033/22bf2693b1125e754f4cd84663cda178 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import json
import requests
from bs4 import BeautifulSoup
class crawler(object):
    """Scrape a VPN Gate listing page and report the servers it advertises.

    The page is requested with a POST carrying ``form_data``; every distinct
    IP address found in the response is collected together with its rank,
    session count and throughput.
    """

    # Seconds to wait for the server; without a timeout requests.post() can
    # block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    # Patterns are compiled once here instead of on every loop iteration.
    _IP_RE = re.compile(r'\d+\.\d+\.\d+\.\d+')
    _RANK_RE = re.compile(r'\d+,\d+')
    _SESSION_RE = re.compile(r'\d+ .*')
    _THROUGHPUT_RE = re.compile(r'\d+ Mbps')

    def __init__(self, host):
        """Remember the page URL and build the form payload sent with it.

        Args:
            host: URL of the listing page that ``get_l2tp`` will POST to.
        """
        self.host = host
        # NOTE(review): the three "__" values look like ASP.NET WebForms
        # state tokens captured from one page load; they are hard-coded and
        # may go stale if the site changes - confirm against the live page.
        self.form_data = {
            '__VIEWSTATEGENERATOR': '69F4E07F',
            '__VIEWSTATE': '/wEPDwULLTE1ODQzMDg1NDMPZBYCAgEPZBYKZg8PFgIeBFRleHQFyQHkvaDnmoQgSVA6IDEwNy4xOTEuMTE2LjI1PEJSPjxpbWcgc3JjPScuLi9pbWFnZXMvZmxhZ3MvVVMucG5nJyB3aWR0aD0nMzInIGhlaWdodD0nMzInIC8+PEJSPuS9oOeahOWbveWutuaIluWcsOWMujogVW5pdGVkIFN0YXRlczxCUj48YSBocmVmPScjTElTVCc+6YCa6L+H5L2/55SoIFZQTiBHYXRlIOadpeabtOaUueS9oOeahCBJUCDlnLDlnYAgITwvYT5kZAIBDw8WAh8ABXI8Yj7lnKggMjQg5bCP5pe25LmL5YaF55qEOiAxLDUzMSwzNjYg55So5oi377yM57Sv6K6h55So5oi35pWwOiAxLDIwNSwxNzEsMzExIOeUqOaIt++8jOmAmuS/oemHjzogMjAsOTg4Ljc3IFRCLjwvYj5kZAIDDw8WAh8ABQUzLDI0OWRkAgQPDxYCHwAFPDxiPuacieadpeiHqiAyMjcg5Liq5Zu95a6255qEIDEsMjA1LDE3MSwzMTEg5Liq55So5oi344CCPC9iPmRkAgYPDxYCHwAFBDY5NzFkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBAULQ19Tb2Z0RXRoZXIFBkNfTDJUUAUJQ19PcGVuVlBOBQZDX1NTVFBNhR6QKODAughVK7og96/pnNOg+MsSRdBoTwK8F1yUzw==',
            '__EVENTVALIDATION': '/wEdAAeOnCU/WmcukVQ1Rszt4PpuZSmLidaMQ3gg2jFmkkuEoSCbR2H52ATFMg5mk6aQHX3LISMg9/mywZPt3Ki4BVA7RhcLWIOHmHJ6h2VtXvwLieWw6g9beu/2J/0raZOGI2E/WMskeKo19Gyidl+m11dTlWT5u5EoXokaDMPJeszCVBDwGRQM2BJm5pkQt2UxSGc=',
            'C_L2TP': 'on',
            # Other checkbox fields, left disabled; presumably they select
            # additional protocols - verify against the page's form.
            # 'C_SoftEther': 'on',
            # 'C_OpenVPN': 'on',
            # Filler entry so the optional lines above can be toggled freely;
            # it has no meaning of its own.
            'foo': 'bar',
        }

    @staticmethod
    def _parse_rank(text):
        """Turn a comma-grouped number such as ``'1,234,567'`` into an int."""
        return int(text.replace(',', ''))

    @staticmethod
    def _session_count(text):
        """Cut everything after the first run of digits, e.g. ``'57 x'`` -> ``'57'``."""
        return re.sub(r'(\d+).*', r'\1', text)

    @staticmethod
    def _throughput_value(text):
        """Strip the `` Mbps`` unit, keeping the number as a string.

        Handles both decimal (``'123.45 Mbps'``) and integer (``'5 Mbps'``)
        figures; text without the unit is returned unchanged.
        """
        return re.sub(r'(\d+(?:\.\d+)?) Mbps', r'\1', text)

    @staticmethod
    def _sorted_by_rank(servers):
        """Return a new list of server dicts ordered by ``rank``, highest first."""
        return sorted(servers, key=lambda server: server['rank'], reverse=True)

    @staticmethod
    def _first_text(row, pattern):
        """Return the text of the first ``<span>`` in *row* matching *pattern*.

        Returns ``None`` when no span matches, so callers can skip the row
        instead of failing with ``IndexError``.
        """
        match = row.find('span', string=pattern)
        return match.string if match is not None else None

    def get_l2tp(self):
        """Fetch the listing, print it as JSON and return it.

        Each distinct IP address contributes one dict with the keys ``ip``,
        ``rank`` (int), ``session`` (str) and ``throughput`` (str). Entries
        are ordered by ``rank`` in descending order. Rows for which rank,
        session or throughput cannot be found are skipped.

        Returns:
            The list of server dicts that was printed.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                exceeds ``REQUEST_TIMEOUT``.
        """
        response = requests.post(
            self.host, data=self.form_data, timeout=self.REQUEST_TIMEOUT)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        servers = []
        seen_ips = set()
        for span in soup.find_all('span', attrs={'style': 'font-size: 12pt;'}):
            ip = span.string
            if not ip or not self._IP_RE.search(ip) or ip in seen_ips:
                continue

            # The remaining fields are looked up three levels above the IP
            # span; presumably that ancestor is the table row - TODO confirm.
            row = span.parent.parent.parent
            rank_text = self._first_text(row, self._RANK_RE)
            session_text = self._first_text(row, self._SESSION_RE)
            throughput_text = self._first_text(row, self._THROUGHPUT_RE)
            if None in (rank_text, session_text, throughput_text):
                continue

            servers.append({
                'ip': ip,
                'rank': self._parse_rank(rank_text),
                'session': self._session_count(session_text),
                # Throughput is reported in Mbps.
                'throughput': self._throughput_value(throughput_text),
            })
            seen_ips.add(ip)

        # sorted() returns a new list, so its result must be kept.
        servers = self._sorted_by_rank(servers)
        print(json.dumps(servers))
        return servers
if __name__ == '__main__':
    # Script entry point: query the Chinese-language VPN Gate listing and
    # let the crawler print what it finds.
    crawler('http://www.vpngate.net/cn/').get_l2tp()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment