Instantly share code, notes, and snippets.
Created
February 28, 2024 16:00
-
Save vadimkantorov/35e90d61fc6d4d0e73ce1b14669caf08 to your computer and use it in GitHub Desktop.
Fetches meta/OpenGraph social media tags from a URL (based on https://gist.github.com/vstoykov/6028987, updated for Python 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# based on https://gist.github.com/vstoykov/6028987 | |
# python socialmediacard.py 'https://meduza.io/feature/2024/02/28/ya-sdelayu-vse-chtoby-zlo-otstupilo-a-prekrasnoe-buduschee-prishlo' | |
import html.parser | |
import urllib.request | |
class SeoParser(html.parser.HTMLParser):
    """Extract SEO-relevant data from an HTML document.

    After ``feed()``-ing a document, results are available as attributes:

    - ``title``:     text of the ``<title>`` element (``None`` if absent),
                     with runs of whitespace/newlines collapsed to single spaces
    - ``charset``:   charset declared via ``<meta charset=...>`` (or the default)
    - ``meta_tags``: ``{name: content}`` for ``<meta name=...>`` tags
    - ``og_tags``:   ``{suffix: content}`` for ``<meta property="og:...">`` tags,
                     keyed without the ``og:`` prefix
    - ``content``:   simplified HTML of CONTENT_TAGS found inside a
                     ``<div>``/``<section>``/``<article>`` whose class/id
                     suggests it holds the main content
    """

    # Tags whose text is collected as page content.
    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    # Inline tags considered acceptable inside content.
    # NOTE(review): currently not consulted anywhere in the parsing logic.
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')

    def __init__(self, default_charset='utf-8'):
        super().__init__()
        self._last_tag = None         # most recently opened tag name
        self._in_head = False         # currently inside <head>
        self._in_title = False        # currently inside <title>
        self._in_body = False         # currently inside <body>
        self._content_stack = 0       # depth of open tags inside the content container
        self._in_content = False      # inside a container that looks like main content
        self._in_content_tag = False  # inside one of CONTENT_TAGS within content
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """Track document structure and harvest <meta> info on opening tags."""
        attrs = dict(attrs)
        tag = tag.lower()
        if self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                if 'charset' in attrs:
                    self.charset = attrs['charset']
                else:
                    meta_property = attrs.get('property') or ''
                    meta_name = attrs.get('name') or ''
                    meta_content = attrs.get('content')  # may be None for valueless metas
                    if meta_property.startswith('og:'):
                        # Store OpenGraph tags keyed without the 'og:' prefix.
                        self.og_tags[meta_property[3:]] = meta_content
                    elif meta_name:
                        self.meta_tags[meta_name] = meta_content
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in ('div', 'section', 'article'):
                # Heuristic: a container whose class/id mentions
                # 'content'/'body' is assumed to hold the main article text.
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if 'content' in tag_class or 'body' in tag_class:
                    self._in_content = True
                elif 'content' in tag_id:
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True
        elif tag == 'body':
            self._in_head = False
            self._in_body = True
        if self._in_content:
            self._content_stack += 1
        self._last_tag = tag

    def handle_data(self, data):
        """Collect text nodes for the title and the main content."""
        data = data.strip()
        if not data:
            return
        if self._in_title:
            # Collapse internal newlines/whitespace runs to single spaces
            # (resolves the old "Remove new lines in title" TODO).
            self.title = ' '.join(data.split())
        if self._in_content_tag:
            # Insert a space between adjacent text chunks unless the previous
            # character already separates them or the new chunk starts with
            # punctuation. Guard against an empty buffer to avoid IndexError.
            if self.content and self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """Maintain structural state when tags close."""
        tag = tag.lower()
        if self._in_head:
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag
                self._content_stack -= 1
            if tag == 'body':
                self._content_stack = 0
                self._in_body = False
            if self._content_stack == 0:
                self._in_content = False

    def get_seo_dict(self, translate={ord('\xa0'): ' '}):
        """Return the parsed results as a dict, applying a char translation table.

        By default non-breaking spaces are replaced with regular spaces.
        ``None`` values (missing title, valueless meta content) pass through
        untouched instead of raising AttributeError.

        Bug fix: the original referenced the module-global ``seo`` instead of
        ``self``, which broke any use outside the demo script.
        """
        # 'translate' is a read-only default; it is never mutated, so the
        # shared-mutable-default pitfall does not apply here.
        def tr(s):
            return s.translate(translate) if isinstance(s, str) else s

        return dict(
            title=tr(self.title),
            content=tr(self.content),
            meta_tags={k: tr(v) for k, v in self.meta_tags.items()},
            og_tags={k: tr(v) for k, v in self.og_tags.items()},
        )
if __name__ == '__main__':
    import sys

    # Usage: python socialmediacard.py <url>
    url = sys.argv[1]
    response_bytes = urllib.request.urlopen(url).read()

    # First pass: decode as UTF-8 with replacement characters so that a
    # non-UTF-8 page cannot abort parsing with UnicodeDecodeError before we
    # have a chance to read its declared <meta charset=...>.
    seo = SeoParser(default_charset='utf-8')
    seo.feed(response_bytes.decode('utf-8', errors='replace'))

    # If the page declared a different charset, re-decode and re-parse.
    # Normalize the name so 'UTF-8' / 'utf_8' / 'utf8' all compare equal.
    declared = seo.charset or 'utf-8'
    if declared.lower().replace('-', '').replace('_', '') != 'utf8':
        seo = SeoParser(default_charset='utf-8')
        seo.feed(response_bytes.decode(declared, errors='replace'))

    print(seo.get_seo_dict())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment