Instantly share code, notes, and snippets.
Created
February 28, 2024 16:00
-
Save vadimkantorov/35e90d61fc6d4d0e73ce1b14669caf08 to your computer and use it in GitHub Desktop.
Fetches meta/OpenGraph social media tags from a URL (based on https://gist.github.com/vstoykov/6028987, updated for Python 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# based on https://gist.github.com/vstoykov/6028987 | |
# python socialmediacard.py 'https://meduza.io/feature/2024/02/28/ya-sdelayu-vse-chtoby-zlo-otstupilo-a-prekrasnoe-buduschee-prishlo' | |
import html.parser | |
import urllib.request | |
class SeoParser(html.parser.HTMLParser):
    """Extract SEO-relevant data from an HTML document.

    After ``feed()``-ing a document, results are available as attributes:

    - ``title``:     text of the ``<title>`` element (``None`` if absent),
                     with runs of whitespace/newlines collapsed to single spaces
    - ``charset``:   charset declared via ``<meta charset=...>`` (or the default)
    - ``meta_tags``: ``{name: content}`` for ``<meta name=...>`` tags
    - ``og_tags``:   ``{suffix: content}`` for ``<meta property="og:...">`` tags,
                     keyed without the ``og:`` prefix
    - ``content``:   simplified HTML of CONTENT_TAGS found inside a
                     ``<div>``/``<section>``/``<article>`` whose class/id
                     suggests it holds the main content
    """

    # Tags whose text is collected as page content.
    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    # Inline tags considered acceptable inside content.
    # NOTE(review): currently not consulted anywhere in the parsing logic.
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')

    def __init__(self, default_charset='utf-8'):
        super().__init__()
        self._last_tag = None         # most recently opened tag name
        self._in_head = False         # currently inside <head>
        self._in_title = False        # currently inside <title>
        self._in_body = False         # currently inside <body>
        self._content_stack = 0       # depth of open tags inside the content container
        self._in_content = False      # inside a container that looks like main content
        self._in_content_tag = False  # inside one of CONTENT_TAGS within content
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """Track document structure and harvest <meta> info on opening tags."""
        attrs = dict(attrs)
        tag = tag.lower()
        if self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                if 'charset' in attrs:
                    self.charset = attrs['charset']
                else:
                    meta_property = attrs.get('property') or ''
                    meta_name = attrs.get('name') or ''
                    meta_content = attrs.get('content')  # may be None for valueless metas
                    if meta_property.startswith('og:'):
                        # Store OpenGraph tags keyed without the 'og:' prefix.
                        self.og_tags[meta_property[3:]] = meta_content
                    elif meta_name:
                        self.meta_tags[meta_name] = meta_content
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in ('div', 'section', 'article'):
                # Heuristic: a container whose class/id mentions
                # 'content'/'body' is assumed to hold the main article text.
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if 'content' in tag_class or 'body' in tag_class:
                    self._in_content = True
                elif 'content' in tag_id:
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True
        elif tag == 'body':
            self._in_head = False
            self._in_body = True
        if self._in_content:
            self._content_stack += 1
        self._last_tag = tag

    def handle_data(self, data):
        """Collect text nodes for the title and the main content."""
        data = data.strip()
        if not data:
            return
        if self._in_title:
            # Collapse internal newlines/whitespace runs to single spaces
            # (resolves the old "Remove new lines in title" TODO).
            self.title = ' '.join(data.split())
        if self._in_content_tag:
            # Insert a space between adjacent text chunks unless the previous
            # character already separates them or the new chunk starts with
            # punctuation. Guard against an empty buffer to avoid IndexError.
            if self.content and self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """Maintain structural state when tags close."""
        tag = tag.lower()
        if self._in_head:
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag
                self._content_stack -= 1
            if tag == 'body':
                self._content_stack = 0
                self._in_body = False
            if self._content_stack == 0:
                self._in_content = False

    def get_seo_dict(self, translate={ord('\xa0'): ' '}):
        """Return the parsed results as a dict, applying a char translation table.

        By default non-breaking spaces are replaced with regular spaces.
        ``None`` values (missing title, valueless meta content) pass through
        untouched instead of raising AttributeError.

        Bug fix: the original referenced the module-global ``seo`` instead of
        ``self``, which broke any use outside the demo script.
        """
        # 'translate' is a read-only default; it is never mutated, so the
        # shared-mutable-default pitfall does not apply here.
        def tr(s):
            return s.translate(translate) if isinstance(s, str) else s

        return dict(
            title=tr(self.title),
            content=tr(self.content),
            meta_tags={k: tr(v) for k, v in self.meta_tags.items()},
            og_tags={k: tr(v) for k, v in self.og_tags.items()},
        )
if __name__ == '__main__':
    import sys

    # Usage: python socialmediacard.py <url>
    url = sys.argv[1]
    response_bytes = urllib.request.urlopen(url).read()

    # First pass: decode as UTF-8 with replacement characters so that a
    # non-UTF-8 page cannot abort parsing with UnicodeDecodeError before we
    # have a chance to read its declared <meta charset=...>.
    seo = SeoParser(default_charset='utf-8')
    seo.feed(response_bytes.decode('utf-8', errors='replace'))

    # If the page declared a different charset, re-decode and re-parse.
    # Normalize the name so 'UTF-8' / 'utf_8' / 'utf8' all compare equal.
    declared = seo.charset or 'utf-8'
    if declared.lower().replace('-', '').replace('_', '') != 'utf8':
        seo = SeoParser(default_charset='utf-8')
        seo.feed(response_bytes.decode(declared, errors='replace'))

    print(seo.get_seo_dict())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment