Skip to content

Instantly share code, notes, and snippets.

@lrvick
Created February 23, 2011 01:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save lrvick/839795 to your computer and use it in GitHub Desktop.
Save lrvick/839795 to your computer and use it in GitHub Desktop.
Extract the HTML <title> from a URL and universally return it as Unicode
import re

try:  # Python 2 module names
    import httplib
    import urllib2
except ImportError:  # Python 3: same API, relocated modules, bound to the old names
    import http.client as httplib
    import urllib.request as urllib2
# Finds the first <title> element in the raw, still-undecoded response bytes.
# Case-insensitive, tolerates attributes on the opening tag, and uses DOTALL
# so a title wrapped over several lines is still found.  The body is matched
# non-greedily so a later </title> in the page cannot extend the match.
_TITLE_RE = re.compile(br'<title(?:\s[^>]*)?>(.*?)</title\s*>', re.I | re.S)

# Extracts the charset parameter from a Content-Type header value, ignoring
# case, optional quotes and any parameters that follow it.
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.I)

# Codec applied when the server declares no charset at all.
# NOTE(review): 'unicode-escape' is kept from the original implementation so
# the reported encoding label does not change; it reads bytes as Latin-1 and
# interprets backslash escapes, which may not be what a given page needs.
_FALLBACK_ENCODING = 'unicode-escape'


def url_title(url, **kwargs):
    """Fetch *url* and report its HTML title as ``"<encoding> | <title>"``.

    The encoding is taken from the ``charset`` parameter of the response's
    Content-Type header; when none is declared, ``'unicode-escape'`` is used.
    The title bytes are decoded with that codec (undecodable bytes are
    replaced rather than aborting) and runs of whitespace are collapsed to
    single spaces.

    Args:
        url: The address to fetch.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        ``"<encoding> | <title>"`` when the page contains a ``<title>``
        element.  If the declared encoding is unknown to Python, the lookup
        error is printed and the title part is ``None``.  Returns ``None``
        when the fetch fails with ``URLError`` (including ``HTTPError``),
        ``BadStatusLine`` or ``InvalidURL``, or when no title is present.

    Any other exception raised while building the request or fetching the
    page is not caught and propagates to the caller.
    """
    request = urllib2.Request(url)
    response = None
    try:
        response = urllib2.urlopen(request)
        # A missing header yields None from get(); normalise to ''.
        content_type = response.info().get('content-type') or ''
        data = response.read()
    except (urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL):
        # HTTPError is a URLError subclass, so it is covered here too.
        return None
    finally:
        if response is not None:
            response.close()

    found = _TITLE_RE.search(data)
    if found is None:
        return None

    declared = _CHARSET_RE.search(content_type)
    encoding = declared.group(1) if declared else _FALLBACK_ENCODING

    try:
        # 'replace' keeps a usable title even when the page's bytes do not
        # match the charset the server claims.
        title = found.group(1).decode(encoding, 'replace')
    except LookupError as e:
        # The server named a codec Python does not know.
        print(e)
        title = None
    else:
        title = ' '.join(title.split())

    return "%s | %s" % (encoding, title)
# Pages used to exercise url_title(); the note above each entry records the
# encoding situation that URL was chosen to represent.
sample_urls = [
    # ISO-8859-1
    'http://news.cnet.com/8301-31021_3-20035058-260.html?part=rss&subj=news&tag=2547-1_3-0-20',
    # UTF-8
    'http://www.androidphonethemes.com/nec-medias-n-04c-is-7-7mm-think-%e2%80%93-runs-android-2-2/',
    # EUC-JP
    'http://www.voiceblog.jp/semaasa/',
    # Shift_JIS
    'http://taisyo.seesaa.net/article/187352622.html',
    # text/html (does not contain unicode chars)
    'http://www.tuaw.com/2011/02/22/element-vapor-pro-iphone-4-case-exclusive-first-look-and-giveaw/',
    # text/html (contains unicode)
    'http://www.atpress.ne.jp/view/19160',
    # text/html (contains unicode)
    'http://we-newsing.com/archives/128',
    # text/html (contains unicode)
    'http://live.nicovideo.jp/watch/lv41334754',
    # utf8 (claims utf8, but the page bytes are not)
    'http://www.superdownloads.com.br/materias/acionistas-da-apple-querem-substituto-steve-jobs.html',
    # None. Broken url
    'http://http://g3dfsdf.com',
    # None. Dead url
    'http://g3dfsdf.com',
]

# Fetch every sample in order and print whatever url_title() reports for it.
for url in sample_urls:
    print(url_title(url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment