Skip to content

Instantly share code, notes, and snippets.

@swshan
Last active October 24, 2015 13:39
Show Gist options
  • Save swshan/e6eb5a71153ed72ca6fb to your computer and use it in GitHub Desktop.
Save swshan/e6eb5a71153ed72ca6fb to your computer and use it in GitHub Desktop.
pyquery.example.001.py
''' http://blog.csdn.net/CUMT_GPF/article/details/46390509 '''
import re
import sys
import requests
from pyquery import PyQuery as pq
# Python 2 only: the original script switched the interpreter-wide default
# codec to GBK so implicit unicode -> bytes conversions of the page text would
# not raise. Python 3 has neither the ``reload`` builtin nor
# ``sys.setdefaultencoding``, so this is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin exists on Python 2 only
    sys.setdefaultencoding('GBK')

url = 'http://xlfans.com/'
headers = {
    'Accept-Language': 'zh-cn',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/4.0 (compatible MSIE 6.00 Windows NT 5.1 SV1)',
}

# Seconds to wait for each HTTP request; without a timeout requests can block
# forever on an unresponsive server.
REQUEST_TIMEOUT = 30

BR_TAG = '<br />'
# Greedy and per-line ('.' does not cross newlines): captures everything on a
# line up to and including the last '<br />' on that line.
BR_LINE_RE = re.compile(r'(.*<br />)')


def extract_lines(html):
    """Return the text of every ``<br />``-terminated line in *html*.

    Lines whose captured text contains ``href`` are skipped, and the
    trailing ``<br />`` is removed from each kept line.

    Args:
        html: Page markup as a text string.

    Returns:
        A list of strings, in document order; empty when nothing matches.
    """
    return [
        chunk[:-len(BR_TAG)]
        for chunk in BR_LINE_RE.findall(html)
        if 'href' not in chunk
    ]


def main():
    """Fetch the home page, follow the ``.focus`` link and print its lines.

    Prints the response encoding and body of the home page, then the
    extracted lines of the linked page (one per line), then a closing
    message. Exits with an error message when no link can be found.
    """
    response = requests.post(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page = pq(response.text)
    print(response.encoding)
    print(response.text)

    # The link is read from the root element of the '.focus' inner HTML.
    focus_html = page('.focus').html()
    if not focus_html:
        sys.exit("No '.focus' element found on %s" % url)
    target_url = pq(focus_html).attr('href')
    if not target_url:
        sys.exit("The '.focus' element on %s has no href" % url)

    response = requests.get(target_url, headers=headers,
                            timeout=REQUEST_TIMEOUT)

    # Every kept line is newline-terminated, exactly as the original
    # accumulated string was.
    result = ''.join(line + '\n' for line in extract_lines(response.text))
    print(result)
    print("Please see the result...")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment