Created
January 23, 2012 04:37
-
-
Save laohyx/1660611 to your computer and use it in GitHub Desktop.
LaoWeb - Laohyx web processor in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# coding=utf-8 | |
import urllib2,cookielib,urllib,re,time | |
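'''
LaoWeb HTTP wrapper class v0.2, 2012/1/22 (Chinese New Year's Eve)
A class that simulates HTTP operations, with simple helpers such as regex matching.
No GPL boilerplate here; if you repost or modify this code, please credit the original author.
laohyx@163.com
By Laohyx.
'''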
def reSearch(pattern, string, split=0):
    '''
    Search ``string`` for ``pattern`` and return the captured text.

    pattern : regular expression; its first capture group is returned.
              If the pattern has no capture group, the whole match is
              returned instead of raising IndexError.
    string  : text to search.
    split   : when equal to 1, all whitespace is removed from ``string``
              before searching.

    Returns the matched text, or "" when nothing matches (a
    "NOT MATCHED <pattern>" line is printed in that case).
    '''
    if split == 1:
        # Splitting on whitespace and re-joining drops every whitespace run.
        haystack = "".join(string.split())
    else:
        haystack = string

    match = re.compile(pattern).search(haystack)
    if match is None:
        # Single-argument print(...) behaves the same as a print statement.
        print("NOT MATCHED %s" % (pattern,))
        return ""

    if match.re.groups == 0:
        # No capture group defined: group(1) would raise IndexError.
        return match.group(0)
    return match.group(1)
class NoExceptionCookieProcesser(urllib2.HTTPCookieProcessor):
    '''
    Cookie processor that returns the response for selected HTTP error
    codes instead of handing them on to another error handler.

    Each ``http_error_<code>`` attribute below returns the response object
    it is given, for codes 110, 400, 403, 404, 500, 501, 502 and 503.
    '''

    def _pass_through(self, req, fp, code, msg, hdrs):
        # Hand the response object back unchanged.
        return fp

    # One shared implementation bound under every handled status code.
    http_error_110 = _pass_through
    http_error_403 = _pass_through
    http_error_404 = _pass_through
    http_error_400 = _pass_through
    http_error_500 = _pass_through
    http_error_501 = _pass_through
    http_error_502 = _pass_through
    http_error_503 = _pass_through
class LaoWeb():
    '''
    Small HTTP client supporting GET, POST and multipart file upload.

    Cookies are stored in ``self.cookie`` and handled by ``self.opener``.
    After every request the response object is kept in ``self.response``
    and its body in ``self.html``.
    '''

    # File extension -> MIME type used for uploaded files.
    _CONTENT_TYPES = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpe': 'image/jpeg',
    }

    def __init__(self):
        '''
        Create the cookie jar and the opener, and set the default User-Agent.
        '''
        self.cookie = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(NoExceptionCookieProcesser(self.cookie))
        self.opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)')]
        self._headers = []
        self.response = None
        self.html = ""

    def add_headers(self, headers):
        '''
        Register custom request headers.

        headers : list of (key, value) pairs.
        '''
        self._headers.extend(headers)

    def remove_headers(self, headers):
        '''
        Remove previously registered custom headers.

        headers : list of (key, value) pairs to remove. Pairs that were
                  never registered are ignored.
        '''
        unwanted = list(headers)
        # Build a new list rather than removing while iterating, which
        # skips elements.
        self._headers = [pair for pair in self._headers if pair not in unwanted]

    def post(self, url, postfield=None):
        '''
        Submit a form with POST; the fields are URL-encoded automatically.

        url       : target URL.
        postfield : dict (or list of pairs) of form fields; defaults to none.

        Returns the response body, also stored in ``self.html``.
        '''
        params = urllib.urlencode(postfield or {})
        return self._fetch(self._new_request(url, params))

    def get(self, url, feild=None):
        '''
        Fetch ``url`` with GET; the fields are URL-encoded into the query string.

        url   : target URL; it may already contain a query string.
        feild : dict (or list of pairs) of query parameters; defaults to none.

        Returns the response body, also stored in ``self.html``.
        '''
        params = urllib.urlencode(feild or {})
        if params:
            if "?" not in url:
                separator = "?"
            elif url.endswith(("?", "&")):
                separator = ""
            else:
                # The URL already has a query string: extend it.
                separator = "&"
            url = url + separator + params
        return self._fetch(self._new_request(url))

    def upload(self, url, data):
        '''
        POST ``data`` encoded as multipart/form-data (used for file uploads).

        url  : target URL.
        data : dict mapping field names to string values or to file-like
               objects (anything with a ``read`` method).

        Returns the response body, also stored in ``self.html``.
        '''
        params, boundary = self._encode_multipart(data)
        req = self._new_request(url, params)
        # Set after the custom headers so the multipart Content-Type wins.
        req.add_header('Content-Type', 'multipart/form-data; boundary=%s' % boundary)
        return self._fetch(req)

    def _new_request(self, url, data=None):
        '''
        Build a Request for ``url`` carrying every registered custom header.
        '''
        req = urllib2.Request(url, data)
        for key, value in self._headers:
            req.add_header(key, value)
        return req

    def _fetch(self, req):
        '''
        Open ``req``, remember the response and return its body.
        '''
        self.response = self.opener.open(req)
        self.html = self.response.read()
        return self.html

    def _encode_multipart(self, kw):
        '''
        Build a multipart/form-data body.

        kw : dict mapping field names to string values or file-like objects.

        Returns ``(body, boundary)``. The boundary is derived from the
        current time in milliseconds.
        '''
        boundary = '----------%s' % hex(int(time.time() * 1000))
        parts = []
        for k, v in kw.items():
            parts.append('--%s' % boundary)
            if hasattr(v, 'read'):
                # File-like object: it may have no ``name`` (e.g. StringIO),
                # so fall back to an empty filename instead of failing.
                filename = getattr(v, 'name', '')
                ext = ''
                dot = filename.rfind('.')
                if dot != -1:
                    ext = filename[dot:].lower()
                content = v.read()
                parts.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (k, filename))
                parts.append('Content-Length: %d' % len(content))
                # The trailing CRLF produces the blank line that ends the part headers.
                parts.append('Content-Type: %s\r\n' % self._guess_content_type(ext))
                parts.append(content)
            else:
                parts.append('Content-Disposition: form-data; name="%s"\r\n' % k)
                parts.append(v.encode('utf-8') if isinstance(v, unicode) else v)
        parts.append('--%s--\r\n' % boundary)
        return '\r\n'.join(parts), boundary

    def _guess_content_type(self, ext):
        '''
        Map a lower-case file extension (with its leading dot) to a MIME type.

        Unknown extensions map to ``application/octet-stream``.
        '''
        return self._CONTENT_TYPES.get(ext, 'application/octet-stream')
def unescape(s):
    '''
    Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``s``.

    s : text containing HTML entities.

    Returns the text with those three entities replaced by ``<``, ``>``
    and ``&``.
    '''
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    # This has to be last, so that "&amp;lt;" decodes to "&lt;" and not "<".
    s = s.replace("&amp;", "&")
    return s
def main():
    '''
    Demo: register two custom headers, ask for the first one to be removed,
    fetch http://renren.com and print the response body.
    '''
    client = LaoWeb()
    for header in (("ad", "dfd"), ("Author", "Laohyx")):
        client.add_headers([header])
    client.remove_headers([("ad", "dfd")])
    client.get("http://renren.com")
    # The body of the last response is kept on the client.
    print(client.html)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment