Skip to content

Instantly share code, notes, and snippets.

@laohyx
Created January 23, 2012 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save laohyx/1660611 to your computer and use it in GitHub Desktop.
Save laohyx/1660611 to your computer and use it in GitHub Desktop.
LaoWeb - Laohyx web processor in Python
#!/usr/bin/python
# coding=utf-8
import urllib2,cookielib,urllib,re,time
'''
LaoWeb HTTP封装类 v0.2 2012/1/22 除夕夜
Http操作模拟类,支持正则等简易功能。
那些GPL就不废话了,转载修改请注明原作者。
laohyx@163.com
By Laohyx.
'''
def reSearch(pattern, string, split=0):
    '''
    Search ``string`` for ``pattern`` and return the first captured group.

    pattern : regular expression (source text or an already compiled pattern).
    string  : text to search.
    split   : when equal to 1, all whitespace is stripped out of ``string``
              before searching.

    Returns group 1 of the first match, or the whole match when the pattern
    defines no capture group.  When nothing matches, "NOT MATCHED <pattern>"
    is printed and "" is returned.
    '''
    regex = re.compile(pattern)
    haystack = "".join(string.split()) if split == 1 else string
    found = regex.search(haystack)
    if not found:
        # Single-argument parenthesised form: prints the same text whether
        # ``print`` is the Python 2 statement or the print function.
        print("NOT MATCHED %s" % (pattern,))
        return ""
    # group(1) raises IndexError for a pattern without capture groups, so
    # fall back to the complete match in that case.
    return found.group(1) if regex.groups else found.group(0)
class NoExceptionCookieProcesser(urllib2.HTTPCookieProcessor):
    '''
    Cookie processor that does not raise when an HTTP error occurs.

    For each status code listed below, the matching ``http_error_<code>``
    hook returns the response object it was given instead of raising.
    '''

    def _passthrough(self, req, fp, code, msg, hdrs):
        # Hand the response back unchanged so the caller can still read it.
        return fp

    # One hook per status code; all share the same pass-through behaviour.
    http_error_110 = _passthrough
    http_error_403 = _passthrough
    http_error_404 = _passthrough
    http_error_400 = _passthrough
    http_error_500 = _passthrough
    http_error_501 = _passthrough
    http_error_502 = _passthrough
    http_error_503 = _passthrough
class LaoWeb():
    '''
    Small HTTP client supporting GET, POST and multipart file upload.

    Cookies are stored in a jar owned by the instance and reused for every
    request.  After each request the body is kept in ``self.html``, the
    response headers in ``self.info`` and the response object itself in
    ``self.response``.
    '''

    # File extension -> MIME type used for uploaded files.
    _CONTENT_TYPES = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpe': 'image/jpeg',
    }

    def __init__(self):
        '''
        Create the cookie jar and the opener, and reset the request state.
        '''
        self.cookie = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(NoExceptionCookieProcesser(self.cookie))
        self.opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)')]
        self._headers = []
        self.html = ""
        self.info = None
        self.response = None

    def add_headers(self, headers):
        '''
        Add custom request headers.

        headers : list of (key, value) pairs.
        '''
        self._headers.extend(headers)

    def remove_headers(self, headers):
        '''
        Remove custom request headers.

        headers : list of (key, value) pairs; every stored header equal to
                  one of them is dropped, pairs that are not stored are
                  ignored.
        '''
        unwanted = [tuple(pair) for pair in headers]
        # Build the filtered list first: removing items from a list while
        # iterating over it skips elements.
        self._headers[:] = [pair for pair in self._headers
                            if tuple(pair) not in unwanted]

    def post(self, url, postfield=None):
        '''
        Submit a form with POST; the fields are url-encoded automatically.

        Returns the response body.
        '''
        params = urllib.urlencode(postfield if postfield is not None else {})
        return self._request(url, params)

    def get(self, url, feild=None):
        '''
        Submit a form with GET; the fields are url-encoded and appended to
        the query string of ``url``.

        Returns the response body.
        '''
        params = urllib.urlencode(feild if feild is not None else {})
        return self._request(self._append_query(url, params))

    def upload(self, url, data):
        '''
        Upload with POST using multipart/form-data encoding.

        data : dict of field name -> value; values with a ``read`` method
               are sent as files.

        Returns the response body.
        '''
        body, boundary = self._encode_multipart(data)
        content_type = 'multipart/form-data; boundary=%s' % boundary
        return self._request(url, body, [('Content-Type', content_type)])

    def _request(self, url, data=None, extra_headers=()):
        '''
        Send one request and record the response on the instance.

        Custom headers registered with add_headers are applied first, then
        ``extra_headers`` so that request-specific headers win.
        '''
        req = urllib2.Request(url, data)
        for key, val in self._headers:
            req.add_header(key, val)
        for key, val in extra_headers:
            req.add_header(key, val)
        self.response = self.opener.open(req)
        self.info = self.response.info()
        self.html = self.response.read()
        return self.html

    def _append_query(self, url, params):
        '''
        Return ``url`` with the encoded ``params`` string attached.
        '''
        if not params:
            return url
        if '?' not in url:
            return url + '?' + params
        # The URL already carries a query string: extend it rather than
        # adding a second '?'.
        if url.endswith('?') or url.endswith('&'):
            return url + params
        return url + '&' + params

    def _encode_multipart(self, kw):
        '''
        Build a multipart/form-data body.

        The boundary is derived from the current time in milliseconds.
        Returns a (body, boundary) tuple.
        '''
        boundary = '----------%s' % hex(int(time.time() * 1000))
        data = []
        for k, v in kw.items():
            data.append('--%s' % boundary)
            if hasattr(v, 'read'):
                # File-like object; it may not have a ``name`` attribute.
                filename = getattr(v, 'name', '')
                dot = filename.rfind('.')
                ext = filename[dot:].lower() if dot != -1 else ''
                content = v.read()
                data.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (k, filename))
                data.append('Content-Length: %d' % len(content))
                # The trailing CRLF yields the blank line that separates the
                # part headers from the part content once joined.
                data.append('Content-Type: %s\r\n' % self._guess_content_type(ext))
                data.append(content)
            else:
                data.append('Content-Disposition: form-data; name="%s"\r\n' % k)
                # ``unicode`` is the Python 2 text type; encode it to bytes.
                data.append(v.encode('utf-8') if isinstance(v, unicode) else v)
        data.append('--%s--\r\n' % boundary)
        return '\r\n'.join(data), boundary

    def _guess_content_type(self, ext):
        '''
        Map a lower-case file extension (with leading dot) to a MIME type,
        defaulting to application/octet-stream.
        '''
        return self._CONTENT_TYPES.get(ext, 'application/octet-stream')
def unescape(s):
    '''
    Decode the HTML entities &lt;, &gt; and &amp; found in ``s``.
    '''
    # Order matters: "&amp;" is handled last so that "&amp;lt;" decodes to
    # the literal text "&lt;" rather than to "<".
    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
        s = s.replace(entity, char)
    return s
def main():
    '''
    Demo: register two custom headers, remove one of them, fetch a page
    and print its body.
    '''
    client = LaoWeb()
    for header in (("ad", "dfd"), ("Author", "Laohyx")):
        client.add_headers([header])
    client.remove_headers([("ad", "dfd")])
    client.get("http://renren.com")
    # Parenthesised single value: same output as the bare print statement.
    print(client.html)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment