Skip to content

Instantly share code, notes, and snippets.

@borgle
Created November 12, 2013 02:55
Show Gist options
  • Save borgle/c846d07cdaa70933fd30 to your computer and use it in GitHub Desktop.
Save borgle/c846d07cdaa70933fd30 to your computer and use it in GitHub Desktop.
我的 Python HTML 抓取代码。data 为一个 dict 值,是需要 POST 的数据字典。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import socket,urllib,time,urllib2
import re,StringIO,gzip
def _inflate(raw, wbits):
    """Decompress as much of ``raw`` as possible with the given zlib ``wbits``.

    Returns a ``(data, ok)`` pair. ``ok`` is False when zlib reported a
    corrupt stream; ``data`` then holds whatever was decoded before the
    error, so a damaged response still yields its readable prefix.
    """
    import zlib  # local import: the file's import block does not pull in zlib

    inflater = zlib.decompressobj(wbits)
    pieces = []
    ok = True
    try:
        # Feed the stream in chunks so output produced before a corrupt
        # region is kept instead of being lost with the exception.
        for start in range(0, len(raw), 8192):
            pieces.append(inflater.decompress(raw[start:start + 8192]))
        pieces.append(inflater.flush())
    except zlib.error:
        ok = False
    return b''.join(pieces), ok


def _decompress_body(raw, encoding):
    """Undo the HTTP ``Content-Encoding`` applied to ``raw``.

    ``encoding`` is the lower-cased header value. ``gzip`` and ``deflate``
    are decoded (best effort: a truncated or corrupt stream returns the part
    that could be decoded); any other value returns ``raw`` unchanged.
    """
    import zlib

    if encoding == 'gzip':
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        return _inflate(raw, 16 + zlib.MAX_WBITS)[0]
    if encoding == 'deflate':
        # The spec says zlib-wrapped, but some servers send raw deflate.
        # A wrong guess fails on the header, i.e. before any output.
        body, ok = _inflate(raw, zlib.MAX_WBITS)
        if not ok and not body:
            body = _inflate(raw, -zlib.MAX_WBITS)[0]
        return body
    return raw


def gethtml(pageurl, data=None, max_retries=5):
    """Fetch ``pageurl``, print the decoded page and return it.

    Parameters:
        pageurl: URL to request.
        data: optional POST payload. A dict is url-encoded before sending;
            an already encoded string is sent as is. ``None`` issues a GET.
        max_retries: how many times a failed request is retried (with a
            0.8 second pause) before the error is re-raised.

    Returns:
        The response body with gzip/deflate content encoding removed. It is
        decoded to unicode when the Content-Type header mentions utf-8,
        otherwise the raw byte string is returned.

    Raises:
        IOError: the request still fails after ``max_retries`` retries
            (urllib2.URLError, urllib2.HTTPError and socket timeouts are all
            IOError subclasses in Python 2).
    """
    payload = data
    if isinstance(payload, dict):
        # urllib2.Request needs an application/x-www-form-urlencoded string.
        payload = urllib.urlencode(payload)

    theheaders = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Accept-Encoding': 'gzip, deflate',
    }

    failures = 0
    while True:
        # A loop instead of recursion: a permanently failing URL can no
        # longer exhaust the interpreter's recursion limit.
        try:
            req = urllib2.Request(pageurl, payload, theheaders)
            handle = urllib2.build_opener().open(req, timeout=10)
            try:
                headers = handle.headers
                bodystr = handle.read()
            finally:
                handle.close()  # release the socket even if read() fails
            break
        except IOError as e:
            print('Open failed: %s. %s' % (pageurl, data))
            if hasattr(e, 'code'):
                print('Error code - %s.' % e.code)
            failures += 1
            if failures > max_retries:
                raise
            time.sleep(0.8)

    encoding = (headers.get('content-encoding') or '').strip().lower()
    html = _decompress_body(bodystr, encoding)

    # Content-Type may be missing entirely; treat that as "charset unknown".
    if 'utf-8' in (headers.get('content-type') or '').lower():
        html = html.decode('utf-8')

    print(html)
    return html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment