Skip to content

Instantly share code, notes, and snippets.

@vadimkantorov
Created February 28, 2024 16:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vadimkantorov/35e90d61fc6d4d0e73ce1b14669caf08 to your computer and use it in GitHub Desktop.
Save vadimkantorov/35e90d61fc6d4d0e73ce1b14669caf08 to your computer and use it in GitHub Desktop.
Fetches <meta> and Open Graph (og:) social media tags from a URL (based on https://gist.github.com/vstoykov/6028987 and upgraded for Python 3)
# based on https://gist.github.com/vstoykov/6028987
# python socialmediacard.py 'https://meduza.io/feature/2024/02/28/ya-sdelayu-vse-chtoby-zlo-otstupilo-a-prekrasnoe-buduschee-prishlo'
import html.parser
import urllib.request
class SeoParser(html.parser.HTMLParser):
    """Collect SEO / social-card data from an HTML document.

    Feed the decoded HTML text with ``feed()``; afterwards the following
    attributes are populated:

    * ``charset``   - value of ``<meta charset=...>`` if present, otherwise
      the ``default_charset`` given to the constructor.
    * ``title``     - text of the ``<title>`` element, or ``None``.
    * ``meta_tags`` - ``{name: content}`` for ``<meta name=...>`` tags.
    * ``og_tags``   - ``{key: content}`` for ``<meta property="og:key">``
      tags (the ``og:`` prefix is removed from the key).
    * ``content``   - text of paragraph/heading tags found inside a
      ``div``/``section``/``article`` whose class contains ``content`` or
      ``body`` or whose id contains ``content``, re-wrapped in bare tags.
    """

    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')

    # Elements that may open a content region.
    _CONTAINER_TAGS = ('div', 'section', 'article')
    # Void elements never have an end tag, so they must not take part in
    # the open/close depth counting that delimits the content region.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))
    # Default character mapping for get_seo_dict(): NBSP -> plain space.
    _DEFAULT_TRANSLATE = {ord('\xa0'): ' '}

    def __init__(self, default_charset='utf-8'):
        super().__init__()
        self._last_tag = None
        self._in_head = False
        self._in_title = False
        self._in_body = False
        # Depth of open tags since the content container was entered.
        self._content_stack = 0
        self._in_content = False
        self._in_content_tag = False
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """Update parser state and collect meta data for an opening tag."""
        attrs = dict(attrs)
        tag = tag.lower()
        if self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                if 'charset' in attrs:
                    # A valueless ``<meta charset>`` yields None; keep the
                    # previous charset rather than storing None.
                    if attrs['charset']:
                        self.charset = attrs['charset']
                else:
                    meta_property = attrs.get('property') or ''
                    meta_name = attrs.get('name') or ''
                    meta_content = attrs.get('content')
                    if meta_property.startswith('og:'):
                        self.og_tags[meta_property[3:]] = meta_content
                    elif meta_name:
                        self.meta_tags[meta_name] = meta_content
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in self._CONTAINER_TAGS:
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if 'content' in tag_class or 'body' in tag_class:
                    self._in_content = True
                elif 'content' in tag_id:
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True
        elif tag == 'body':
            self._in_head = False
            self._in_body = True

        # Count nesting depth inside the content container.  Void elements
        # are skipped: they get no end tag, so counting them would leave
        # the depth permanently above zero and the region would never end.
        if self._in_content and tag not in self._VOID_TAGS:
            self._content_stack += 1
        self._last_tag = tag

    def handle_data(self, data):
        """Store text nodes that belong to the title or to a content tag."""
        data = data.strip()
        if not data:
            return
        if self._in_title:
            # Collapse internal newlines / runs of whitespace in the title.
            self.title = ' '.join(data.split())
        if self._in_content_tag:
            # Add a separating space unless the text directly follows a tag
            # or a space, or starts with punctuation.
            if self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """Update parser state for a closing tag."""
        tag = tag.lower()
        if self._in_head:
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag
                # Mirror handle_starttag: void elements were never counted
                # (this also covers self-closing forms such as ``<br/>``,
                # for which HTMLParser emits a start and an end event).
                if tag not in self._VOID_TAGS:
                    self._content_stack -= 1
            if tag == 'body':
                self._content_stack = 0
                self._in_body = False
            if self._content_stack == 0:
                self._in_content = False

    def get_seo_dict(self, translate=None):
        """Return the collected data as a plain dict.

        ``translate`` is a ``str.translate`` table applied to every string
        value; when omitted, non-breaking spaces are replaced by regular
        spaces.  Values that were never found (a missing title, a
        ``<meta>`` without ``content``) are returned as ``None``.

        Returns a dict with the keys ``title``, ``content``, ``meta_tags``
        and ``og_tags``.
        """
        if translate is None:
            translate = self._DEFAULT_TRANSLATE

        def clean(value):
            # Missing values stay None instead of raising AttributeError.
            return None if value is None else value.translate(translate)

        return dict(
            title=clean(self.title),
            content=clean(self.content),
            meta_tags={k: clean(v) for k, v in self.meta_tags.items()},
            og_tags={k: clean(v) for k, v in self.og_tags.items()},
        )
if __name__ == '__main__':
    # Command-line entry point: fetch the URL given as the first argument,
    # parse it and print the collected title / content / meta / og data.
    import sys

    if len(sys.argv) < 2:
        sys.exit('usage: python socialmediacard.py URL')
    url = sys.argv[1]

    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(url) as response:
        response_bytes = response.read()

    # First pass: assume UTF-8 but tolerate invalid bytes, so that a page in
    # another encoding still gets parsed far enough to reveal its
    # <meta charset> instead of aborting with UnicodeDecodeError.
    response_text_utf8 = response_bytes.decode('utf-8', errors='replace')
    seo = SeoParser(default_charset='utf-8')
    seo.feed(response_text_utf8)
    seo.close()

    # Charset labels are case-insensitive ("UTF-8" == "utf-8").
    declared_charset = (seo.charset or 'utf-8').strip().lower()
    if declared_charset not in ('utf-8', 'utf8'):
        try:
            response_text_decoded = response_bytes.decode(declared_charset, errors='replace')
        except LookupError:
            # Unknown charset label: keep the results of the UTF-8 pass.
            pass
        else:
            # Second pass with the encoding the document declares.
            seo = SeoParser(default_charset='utf-8')
            seo.feed(response_text_decoded)
            seo.close()

    print(seo.get_seo_dict())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment