Skip to content

Instantly share code, notes, and snippets.

@yono
Created December 18, 2009 02:41
Show Gist options
  • Save yono/259244 to your computer and use it in GitHub Desktop.
Save yono/259244 to your computer and use it in GitHub Desktop.
Webからhtml文書を取得
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import socket
import urllib2
import re
from html5lib import HTMLParser,treebuilders
# HTTP headers sent with every request made by get_htmlinfo().  The
# User-Agent value is an Internet Explorer 7 / Windows XP browser string.
headers = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
}

# Process-wide default: sockets created after this point time out after
# 30 seconds instead of blocking indefinitely.
socket.setdefaulttimeout(30)

# get_htmlinfo() returns the 5-tuple (isexist, istext, body, url, title).
def get_htmlinfo(orgurl):
    """Fetch *orgurl* and return information about the HTML document.

    Returns a 5-tuple ``(isexist, istext, body, url, title)``:

    * ``isexist`` -- False when opening the URL raised any exception.
    * ``istext``  -- False when the response's Content-Type does not start
      with ``text`` (a missing header is treated as non-text).
    * ``body``    -- the parsed document re-serialised with ``prettify()``,
      or ``''`` when nothing was parsed.
    * ``url``     -- the URL reported by the response (``geturl()``), or
      *orgurl* unchanged when the request failed.
    * ``title``   -- the ``<title>`` element with its markup stripped, or
      ``''`` when the document has no title.

    Errors while opening the URL are printed and reported through the
    return value rather than raised.
    """
    request = urllib2.Request(orgurl, headers=headers)
    try:
        result = urllib2.urlopen(request)
    except Exception as e:
        # Deliberate best effort: any failure to open the URL is reported
        # to the caller as "does not exist" instead of propagating.
        print('%s %s' % (e.__class__, e))
        return (False, False, '', orgurl, '')

    # Always release the connection, including on the early non-text return
    # and when reading the body fails.
    try:
        url = result.geturl()
        # A response without a Content-Type header must not raise here;
        # media types are case-insensitive, so normalise before comparing.
        contenttype = result.info().get('Content-Type') or ''
        if not contenttype.lower().startswith('text'):
            print('%s is not text' % (url))
            return (True, False, '', url, '')
        html = result.read()
    finally:
        result.close()

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    html = soup.prettify()

    # Without this guard a page lacking <title> would yield the literal
    # string u'None' (unicode(None)) as its title.
    head = soup.head
    title_tag = head.title if head is not None else None
    if title_tag is None:
        title = ''
    else:
        # Drop the surrounding <title>...</title> markup, keeping the text.
        title = re.sub(r"<.+?>", "", unicode(title_tag))
    return (True, True, html, url, title)
if __name__ == '__main__':
    # Command-line usage: fetch the URL given as the first argument and
    # print the requested URL, the page title, and the URL reported by the
    # response, one per line.
    import sys

    if len(sys.argv) < 2:
        # Previously a missing argument surfaced as a bare IndexError.
        sys.stderr.write('usage: %s URL\n' % sys.argv[0])
        sys.exit(1)

    url = sys.argv[1]
    print(url)
    isexist, istext, body, final_url, title = get_htmlinfo(url)
    print(title)
    print(final_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment