Skip to content

Instantly share code, notes, and snippets.

@recall704
Created May 7, 2015 13:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save recall704/578cc46ab191557c2424 to your computer and use it in GitHub Desktop.
Save recall704/578cc46ab191557c2424 to your computer and use it in GitHub Desktop.
爬取 http://pachong.org/ 中的代理ip 和 port
#coding:utf-8
import re
import requests
url = 'http://pachong.org/high.html'
req = requests.get(url)
if req.status_code == 200:
html = req.text
else:
html = ''
# 匹配ip 和 端口对应 js
p = re.compile(r'''<tr data-id="\d+" data-type=\"high\">.*?<td.*?</td>.*?<td>(?P<ip>.*?)</td>.*?write\((?P<port>.*?)\);</script>''',re.S)
p2 = re.compile(r'var.*?;')
l1 = p.findall(html)
l2 = p2.findall(html)
l3 = []
for l in l2:
temp = l.strip('var')
temp = temp.strip()
l3.append(temp)
for l in l3:
exec(l)
# print l1
result = []
for l_t in l1:
if isinstance(l_t[1],basestring):
port_str = 'port=' + l_t[1]
exec(port_str)
t = (l_t[0],port)
result.append(t)
else:
pass
print result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment