Skip to content

Instantly share code, notes, and snippets.

@vstoykov
Last active December 19, 2015 22:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vstoykov/6028987 to your computer and use it in GitHub Desktop.
Save vstoykov/6028987 to your computer and use it in GitHub Desktop.
Simple parser to get all meta tags
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fetch SEO data from given URL
"""
from HTMLParser import HTMLParser
from urllib2 import build_opener
class SeoParser(HTMLParser):
    """
    Parser which will get all SEO meta tags information

    While markup is fed in, the following attributes are filled:

    * ``title``     - text of the ``<title>`` element seen inside ``<head>``
    * ``meta_tags`` - ``name -> content`` for every ``<meta name=...>``
    * ``og_tags``   - ``<meta property="og:...">`` values, keyed without
      the ``og:`` prefix
    * ``content``   - markup built from the ``CONTENT_TAGS`` elements found
      inside a ``div``/``section``/``article`` whose class contains
      ``content`` or ``body`` (or whose id contains ``content``)
    * ``charset``   - value of ``<meta charset=...>`` or the default
    """
    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')
    # Elements that never have an end tag.  They must not take part in the
    # open-tag depth counting, otherwise a single "<br>" keeps the counter
    # above zero and the parser never leaves the content container.
    _VOID_TAGS = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                  'link', 'meta', 'param', 'source', 'track', 'wbr')

    def __init__(self, url, default_charset='utf-8'):
        """
        :param url: address of the page fetched by ``get_seo_tags``
        :param default_charset: charset used for decoding until the page
            declares its own through ``<meta charset=...>``
        """
        HTMLParser.__init__(self)
        self.url = url
        self._last_tag = None
        self._in_head = False
        self._in_title = False
        self._in_body = False
        # Number of currently open (non-void) tags inside the content
        # container, the container itself included.
        self._content_stack = 0
        self._in_content = False
        self._in_content_tag = False
        # Raw text chunks of the <title> currently being read.
        self._title_parts = []
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """ Choose what to do when open a tag """
        attrs = dict(attrs)
        tag = tag.lower()
        if tag == 'body':
            # Checked first: "</head>" is optional in HTML, so <body> may
            # show up while the parser still believes it is inside <head>.
            self._in_head = False
            self._in_body = True
        elif self._in_head:
            if tag == 'title':
                self._in_title = True
                self._title_parts = []
            elif tag == 'meta':
                if 'charset' in attrs:
                    self.charset = attrs['charset']
                else:
                    meta_property = attrs.get('property') or ''
                    meta_name = attrs.get('name') or ''
                    meta_content = attrs.get('content')
                    if meta_property.startswith('og:'):
                        # Store Open Graph tags without the "og:" prefix
                        self.og_tags[meta_property[3:]] = meta_content
                    elif meta_name:
                        self.meta_tags[meta_name] = meta_content
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in ['div', 'section', 'article']:
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if 'content' in tag_class or 'body' in tag_class:
                    self._in_content = True
                elif 'content' in tag_id:
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True
        if self._in_content and tag not in self._VOID_TAGS:
            self._content_stack += 1
        self._last_tag = tag

    def handle_data(self, data):
        """ Chose what to do with text nodes """
        if self._in_title:
            # The parser may deliver one text node in several chunks, so
            # collect them instead of keeping only the last one.  Runs of
            # whitespace (new lines included) collapse to a single space.
            self._title_parts.append(data)
            title = ' '.join(''.join(self._title_parts).split())
            if title:
                self.title = title
        data = data.strip()
        if not data:
            return
        if self._in_content_tag:
            # Add space before content in some circumstances
            if self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """ Choose what to do when close a tag """
        tag = tag.lower()
        if self._in_head:
            # Any end tag seen inside <head> terminates the title
            self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            # Void tags were not counted on open ("<br/>" reports both a
            # start and an end), so they must not be counted on close.
            if self._in_content and tag not in self._VOID_TAGS:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag
                self._content_stack -= 1
            if tag == 'body':
                self._content_stack = 0
                self._in_body = False
            if self._content_stack == 0:
                self._in_content = False

    def get_seo_tags(self):
        """
        Download ``self.url``, parse it and return the collected data.

        :return: dict with the keys ``title``, ``meta``, ``og`` and
            ``content``
        Errors raised by the opener while fetching the page propagate to
        the caller.
        """
        opener = build_opener()
        # Add custom header to identify the parser
        opener.addheaders = [('User-agent', 'Mozilla/5.0 SEO meta tags parser (https://gist.github.com/vstoykov/6028987)')]
        response = opener.open(self.url)
        try:
            markup = response.read()
        finally:
            # Release the connection even when reading fails
            response.close()
        self.feed(markup)
        # Force processing of whatever the parser is still buffering
        self.close()
        self._fix_encoding()
        return {
            'title': self.title,
            'meta': self.meta_tags,
            'og': self.og_tags,
            'content': self.content,
        }

    def unescape(self, s):
        """
        Unescape entities in ``s`` without letting encoding problems escape.

        Byte strings are decoded with the current charset first; when that
        is impossible (undecodable bytes or an unknown charset name) the
        raw value is unescaped instead, and returned untouched as a last
        resort.
        """
        # Make unescaping more exception safe
        super_unescape = HTMLParser.unescape
        if isinstance(s, bytes):
            try:
                return super_unescape(self, s.decode(self.charset))
            except (UnicodeDecodeError, LookupError):
                pass
        try:
            return super_unescape(self, s)
        except UnicodeDecodeError:
            return s

    def _to_text(self, value):
        """
        Return ``value`` as text.

        Byte strings are decoded with ``self.charset``; undecodable bytes
        are replaced rather than raising, and an unknown charset name falls
        back to UTF-8.  Values which are already text are returned as is.
        """
        if not isinstance(value, bytes):
            return value
        try:
            return value.decode(self.charset, 'replace')
        except LookupError:
            return value.decode('utf-8', 'replace')

    def _fix_encoding(self):
        """
        Convert every collected value to text, in place.

        Values of ``meta_tags`` and ``og_tags`` additionally get their
        non-breaking spaces replaced with plain spaces.  Empty values are
        left untouched.
        """
        for attr in ('title', 'content'):
            val = getattr(self, attr)
            if val:
                setattr(self, attr, self._to_text(val))
        for items in (self.meta_tags, self.og_tags):
            # Iterate over a snapshot of the keys while values are replaced
            for key in list(items):
                val = items[key]
                if not val:
                    continue
                # Decode first: replacing a unicode character inside a
                # non-ASCII byte string raises on Python 2.
                items[key] = self._to_text(val).replace(u'\xa0', u' ')
def get_seo_tags(url):
    """
    Fetch ``url`` and return the SEO data collected by ``SeoParser``.

    Convenience wrapper which builds a parser for ``url`` and returns the
    result of its ``get_seo_tags`` method.
    """
    return SeoParser(url).get_seo_tags()
def main(argv=None):
    """
    Main Function

    Print the SEO data of one page.

    :param argv: command line arguments without the program name; when
        omitted, ``sys.argv[1:]`` is used.  The first argument is the URL
        to inspect; without it the original demo URL is fetched.
    """
    import sys
    args = sys.argv[1:] if argv is None else list(argv)
    url = args[0] if args else 'http://magicsolutions.bg/'
    # Parenthesised single-argument form prints the same on Python 2 and 3
    print(get_seo_tags(url))
if __name__ == '__main__':
    # The bare ``exit`` builtin is installed by the ``site`` module for
    # interactive sessions and is not guaranteed to exist (e.g. under
    # ``python -S``); ``sys.exit`` is the supported way to set the status.
    import sys
    sys.exit(main() or 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment