vstoykov/seo_parser.py

## seo_parser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fetch SEO data from given URL
"""
from HTMLParser import HTMLParser
from urllib2 import build_opener


class SeoParser(HTMLParser):
    """
    Parser which will get all SEO meta tags information

    After a document has been fed to the parser the results are available as:

    * ``title`` - text of the ``<title>`` element found in ``<head>``
    * ``meta_tags`` - ``content`` of every ``<meta name="...">`` found in
      ``<head>``, keyed by the name
    * ``og_tags`` - ``content`` of every ``<meta property="og:...">`` found
      in ``<head>``, keyed by the property without the ``og:`` prefix
    * ``content`` - text of the ``CONTENT_TAGS`` elements found inside a
      ``CONTAINER_TAGS`` element whose ``class`` contains ``content`` or
      ``body`` or whose ``id`` contains ``content``

    """
    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')
    # Elements which may open the content region
    CONTAINER_TAGS = ('div', 'section', 'article')
    # Void elements never get an end tag, so counting them would leave the
    # nesting depth unbalanced and the content region would never be closed
    VOID_TAGS = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                 'link', 'meta', 'param', 'source', 'track', 'wbr')

    def __init__(self, url, default_charset='utf-8'):
        """
        Prepare an empty parser.

        ``url`` is the address fetched by ``get_seo_tags``.
        ``default_charset`` is used to decode values unless the document
        declares another one with ``<meta charset="...">``.
        """
        # HTMLParser is an old-style class on Python 2, so no super() here
        HTMLParser.__init__(self)
        self.url = url
        self._last_tag = None
        self._in_head = False
        self._in_title = False
        self._in_body = False
        # Number of currently open (non-void) tags inside the content region
        self._content_stack = 0
        self._in_content = False
        self._in_content_tag = False
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """ Choose what to do when open a tag """
        attrs = dict(attrs)
        tag = tag.lower()
        if tag == 'body' and not self._in_body:
            # Checked first so that a document without ``</head>`` still
            # leaves the head section when the body starts
            self._in_head = False
            self._in_title = False
            self._in_body = True
        elif self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                self._handle_meta(attrs)
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in self.CONTAINER_TAGS:
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if ('content' in tag_class or 'body' in tag_class
                        or 'content' in tag_id):
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True

        # The container itself is counted too, so the region ends exactly
        # when the matching end tag brings the depth back to zero
        if self._in_content and tag not in self.VOID_TAGS:
            self._content_stack += 1

        self._last_tag = tag

    def _handle_meta(self, attrs):
        """ Store the information carried by the attributes of a meta tag """
        if 'charset' in attrs:
            # An empty declaration keeps the charset which is already set
            if attrs['charset']:
                self.charset = attrs['charset']
            return

        meta_property = attrs.get('property') or ''
        meta_name = attrs.get('name') or ''
        meta_content = attrs.get('content')

        if meta_property.startswith('og:'):
            self.og_tags[meta_property[3:]] = meta_content
        elif meta_name:
            self.meta_tags[meta_name] = meta_content

    def handle_data(self, data):
        """ Choose what to do with text nodes """
        data = data.strip()
        if not data:
            return

        if self._in_title:
            # TODO: Remove new lines in title
            self.title = data

        if self._in_content_tag:
            # Add space before content in some circumstances
            # (``content`` is never empty here: the opening tag was appended
            # when ``_in_content_tag`` was switched on)
            if self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """ Choose what to do when close a tag """
        tag = tag.lower()
        if self._in_head:
            # Any end tag inside the head terminates the title
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag

                # Mirror handle_starttag: void tags were never counted, and
                # ``<br/>`` reaches this method through handle_startendtag
                if tag not in self.VOID_TAGS:
                    self._content_stack -= 1

            if tag == 'body':
                self._content_stack = 0
                self._in_body = False

        if self._content_stack == 0:
            self._in_content = False

    def get_seo_tags(self):
        """
        Download ``self.url``, parse it and return the collected data.

        Returns a dict with the keys ``title``, ``meta``, ``og`` and
        ``content``. Errors raised while opening the URL are not handled
        here.
        """
        from contextlib import closing

        opener = build_opener()
        # Add custom header to identify the parser
        opener.addheaders = [(
            'User-agent',
            'Mozilla/5.0 SEO meta tags parser '
            '(https://gist.github.com/vstoykov/6028987)',
        )]
        # Release the connection even when reading the body fails
        with closing(opener.open(self.url)) as response:
            html = response.read()
        self.feed(html)
        self._fix_encoding()

        return {
            'title': self.title,
            'meta': self.meta_tags,
            'og': self.og_tags,
            'content': self.content,
        }

    def unescape(self, s):
        """
        Unescape HTML entities in ``s`` without failing on encoding errors.

        The value is decoded with the current charset first; when that fails
        the raw value is unescaped, and when that fails as well the value is
        returned untouched.
        """
        # Make unescaping more exception safe
        super_unescape = HTMLParser.unescape
        try:
            return super_unescape(self, s.decode(self.charset))
        except UnicodeError:
            # UnicodeError covers both wrongly encoded bytes and text which
            # is already decoded (implicit ASCII encode on Python 2)
            try:
                return super_unescape(self, s)
            except UnicodeError:
                return s

    def _fix_encoding(self):
        """ Convert every collected value to text using the page charset """
        for attr in ['title', 'content']:
            val = getattr(self, attr)
            if val:
                setattr(self, attr, self._to_text(val))

        for attr in ['meta_tags', 'og_tags']:
            items = getattr(self, attr)
            # Walk over a snapshot of the keys because values are replaced
            for key in list(items):
                val = items[key]
                if not val:
                    continue
                # Decode first: replacing a unicode NBSP inside a non-ASCII
                # byte string raises UnicodeDecodeError on Python 2
                items[key] = self._to_text(val).replace(u'\xa0', ' ')

    def _to_text(self, val):
        """ Decode byte strings with the page charset, keep text as it is """
        if isinstance(val, bytes):
            # Bytes which are invalid for the declared charset become U+FFFD
            # instead of aborting the whole extraction
            return val.decode(self.charset, 'replace')
        return val

def get_seo_tags(url):
    """ Fetch ``url`` and return the SEO data collected by ``SeoParser`` """
    return SeoParser(url).get_seo_tags()

def main(url='http://magicsolutions.bg/'):
    """
    Main Function

    Fetch the SEO data of ``url`` and print it to standard output. The
    default is the address the script has always used, so ``main()`` without
    arguments behaves as before.
    """
    # The parenthesised form is valid both as a Python 2 print statement and
    # as a call of the Python 3 print function
    print(get_seo_tags(url))

if __name__ == '__main__':
    # ``exit`` is a helper added by the ``site`` module for interactive use
    # and is missing under ``python -S``; ``sys.exit`` is always available
    import sys
    sys.exit(main() or 0)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	"""
	Fetch SEO data from given URL
	"""
	from HTMLParser import HTMLParser
	from urllib2 import build_opener



	class SeoParser(HTMLParser):
	    """
	    Parser which will get all SEO meta tags information

	    After a document has been fed to the parser the results are available as:

	    * ``title`` - text of the ``<title>`` element found in ``<head>``
	    * ``meta_tags`` - ``content`` of every ``<meta name="...">`` found in
	      ``<head>``, keyed by the name
	    * ``og_tags`` - ``content`` of every ``<meta property="og:...">`` found
	      in ``<head>``, keyed by the property without the ``og:`` prefix
	    * ``content`` - text of the ``CONTENT_TAGS`` elements found inside a
	      ``CONTAINER_TAGS`` element whose ``class`` contains ``content`` or
	      ``body`` or whose ``id`` contains ``content``

	    """
	    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
	    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')
	    # Elements which may open the content region
	    CONTAINER_TAGS = ('div', 'section', 'article')
	    # Void elements never get an end tag, so counting them would leave the
	    # nesting depth unbalanced and the content region would never be closed
	    VOID_TAGS = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	                 'link', 'meta', 'param', 'source', 'track', 'wbr')

	    def __init__(self, url, default_charset='utf-8'):
	        """
	        Prepare an empty parser.

	        ``url`` is the address fetched by ``get_seo_tags``.
	        ``default_charset`` is used to decode values unless the document
	        declares another one with ``<meta charset="...">``.
	        """
	        # HTMLParser is an old-style class on Python 2, so no super() here
	        HTMLParser.__init__(self)
	        self.url = url
	        self._last_tag = None
	        self._in_head = False
	        self._in_title = False
	        self._in_body = False
	        # Number of currently open (non-void) tags inside the content region
	        self._content_stack = 0
	        self._in_content = False
	        self._in_content_tag = False
	        self.charset = default_charset
	        self.title = None
	        self.meta_tags = {}
	        self.og_tags = {}
	        self.content = ''

	    def handle_starttag(self, tag, attrs):
	        """ Choose what to do when open a tag """
	        attrs = dict(attrs)
	        tag = tag.lower()
	        if tag == 'body' and not self._in_body:
	            # Checked first so that a document without ``</head>`` still
	            # leaves the head section when the body starts
	            self._in_head = False
	            self._in_title = False
	            self._in_body = True
	        elif self._in_head:
	            if tag == 'title':
	                self._in_title = True
	            elif tag == 'meta':
	                self._handle_meta(attrs)
	        elif self._in_body:
	            if self._in_content:
	                if tag in self.CONTENT_TAGS:
	                    self._in_content_tag = True
	                    self.content += "<%s>" % tag
	            elif tag in self.CONTAINER_TAGS:
	                tag_class = (attrs.get('class') or '').lower()
	                tag_id = (attrs.get('id') or '').lower()
	                if ('content' in tag_class or 'body' in tag_class
	                        or 'content' in tag_id):
	                    self._in_content = True
	        elif tag == 'head':
	            self._in_head = True

	        # The container itself is counted too, so the region ends exactly
	        # when the matching end tag brings the depth back to zero
	        if self._in_content and tag not in self.VOID_TAGS:
	            self._content_stack += 1

	        self._last_tag = tag

	    def _handle_meta(self, attrs):
	        """ Store the information carried by the attributes of a meta tag """
	        if 'charset' in attrs:
	            # An empty declaration keeps the charset which is already set
	            if attrs['charset']:
	                self.charset = attrs['charset']
	            return

	        meta_property = attrs.get('property') or ''
	        meta_name = attrs.get('name') or ''
	        meta_content = attrs.get('content')

	        if meta_property.startswith('og:'):
	            self.og_tags[meta_property[3:]] = meta_content
	        elif meta_name:
	            self.meta_tags[meta_name] = meta_content

	    def handle_data(self, data):
	        """ Choose what to do with text nodes """
	        data = data.strip()
	        if not data:
	            return

	        if self._in_title:
	            # TODO: Remove new lines in title
	            self.title = data

	        if self._in_content_tag:
	            # Add space before content in some circumstances
	            # (``content`` is never empty here: the opening tag was appended
	            # when ``_in_content_tag`` was switched on)
	            if self.content[-1] not in '> ' and data[0] not in ',."':
	                data = ' ' + data
	            self.content += data

	    def handle_endtag(self, tag):
	        """ Choose what to do when close a tag """
	        tag = tag.lower()
	        if self._in_head:
	            # Any end tag inside the head terminates the title
	            if self._in_title:
	                self._in_title = False
	            if tag == 'head':
	                self._in_head = False
	        elif self._in_body:
	            if self._in_content:
	                if tag in self.CONTENT_TAGS:
	                    self._in_content_tag = False
	                    self.content += "</%s>\n" % tag

	                # Mirror handle_starttag: void tags were never counted, and
	                # ``<br/>`` reaches this method through handle_startendtag
	                if tag not in self.VOID_TAGS:
	                    self._content_stack -= 1

	            if tag == 'body':
	                self._content_stack = 0
	                self._in_body = False

	        if self._content_stack == 0:
	            self._in_content = False

	    def get_seo_tags(self):
	        """
	        Download ``self.url``, parse it and return the collected data.

	        Returns a dict with the keys ``title``, ``meta``, ``og`` and
	        ``content``. Errors raised while opening the URL are not handled
	        here.
	        """
	        from contextlib import closing

	        opener = build_opener()
	        # Add custom header to identify the parser
	        opener.addheaders = [(
	            'User-agent',
	            'Mozilla/5.0 SEO meta tags parser '
	            '(https://gist.github.com/vstoykov/6028987)',
	        )]
	        # Release the connection even when reading the body fails
	        with closing(opener.open(self.url)) as response:
	            html = response.read()
	        self.feed(html)
	        self._fix_encoding()

	        return {
	            'title': self.title,
	            'meta': self.meta_tags,
	            'og': self.og_tags,
	            'content': self.content,
	        }

	    def unescape(self, s):
	        """
	        Unescape HTML entities in ``s`` without failing on encoding errors.

	        The value is decoded with the current charset first; when that fails
	        the raw value is unescaped, and when that fails as well the value is
	        returned untouched.
	        """
	        # Make unescaping more exception safe
	        super_unescape = HTMLParser.unescape
	        try:
	            return super_unescape(self, s.decode(self.charset))
	        except UnicodeError:
	            # UnicodeError covers both wrongly encoded bytes and text which
	            # is already decoded (implicit ASCII encode on Python 2)
	            try:
	                return super_unescape(self, s)
	            except UnicodeError:
	                return s

	    def _fix_encoding(self):
	        """ Convert every collected value to text using the page charset """
	        for attr in ['title', 'content']:
	            val = getattr(self, attr)
	            if val:
	                setattr(self, attr, self._to_text(val))

	        for attr in ['meta_tags', 'og_tags']:
	            items = getattr(self, attr)
	            # Walk over a snapshot of the keys because values are replaced
	            for key in list(items):
	                val = items[key]
	                if not val:
	                    continue
	                # Decode first: replacing a unicode NBSP inside a non-ASCII
	                # byte string raises UnicodeDecodeError on Python 2
	                items[key] = self._to_text(val).replace(u'\xa0', ' ')

	    def _to_text(self, val):
	        """ Decode byte strings with the page charset, keep text as it is """
	        if isinstance(val, bytes):
	            # Bytes which are invalid for the declared charset become U+FFFD
	            # instead of aborting the whole extraction
	            return val.decode(self.charset, 'replace')
	        return val


	def get_seo_tags(url):
	    """ Fetch ``url`` and return the SEO data collected by ``SeoParser`` """
	    parser = SeoParser(url)
	    return parser.get_seo_tags()

	def main(url='http://magicsolutions.bg/'):
	    """
	    Main Function

	    Fetch the SEO data of ``url`` and print it to standard output. The
	    default is the address the script has always used, so ``main()`` without
	    arguments behaves as before.
	    """
	    # The parenthesised form is valid both as a Python 2 print statement and
	    # as a call of the Python 3 print function
	    print(get_seo_tags(url))

	if __name__ == '__main__':
	    # ``exit`` is a helper added by the ``site`` module for interactive use
	    # and is missing under ``python -S``; ``sys.exit`` is always available
	    import sys
	    sys.exit(main() or 0)