Skip to content

Instantly share code, notes, and snippets.

@AlmogBaku
Last active July 7, 2024 10:00
Show Gist options
  • Save AlmogBaku/baffe45fed8188a28c6238806ae6ae58 to your computer and use it in GitHub Desktop.
Save AlmogBaku/baffe45fed8188a28c6238806ae6ae58 to your computer and use it in GitHub Desktop.
Medium to Markdown
import mimetypes
import os
import markdownify
import requests
# Initialise the mimetypes module once at import time; convert_img later
# calls mimetypes.guess_extension to choose the extension of saved images.
mimetypes.init()
class MediumConverter(markdownify.MarkdownConverter):
    """Markdown converter tailored to Medium article pages.

    On top of the stock ``markdownify`` conversion it:

    * renders ``<figure>`` elements as a Markdown image plus italic caption;
    * drops ``.speechify-ignore`` elements and turns
      ``div[role="separator"]`` elements into ``<hr>`` before converting;
    * when ``save`` is true, downloads every image and writes the Markdown
      to ``out/<name>/output.md``.

    Args:
        save: When true, write the document and its images to disk.
        name: Sub-directory name under ``out/``. When empty it is derived
            from the page ``<title>`` on the first conversion.
        *args, **kwargs: Forwarded to ``markdownify.MarkdownConverter``.
            ``heading_style`` defaults to ``markdownify.ATX`` but may be
            overridden by the caller.
    """

    # Running index used to name downloaded images (0.png, 1.jpg, ...).
    image_i = 0

    # Seconds to wait for an image download before giving up.
    _REQUEST_TIMEOUT = 30

    # Rewrites applied to the page title when it is used as a folder name:
    # spaces, colons and path separators become "_", punctuation is removed.
    _NAME_TRANSLATION = str.maketrans({
        ' ': '_', ':': '_', '/': '_', '\\': '_',
        '?': None, '!': None,
        '’': None, '‘': None,
        '“': None, '”': None,
    })

    def __init__(self, save=False, name=None, *args, **kwargs):
        self.save = save
        self.name = name
        # Tracks whether the article folder was already appended to
        # base_save_path, so reusing the converter does not nest folders.
        self._name_in_path = False
        if save:
            self.base_save_path = 'out/'
            os.makedirs(self.base_save_path, exist_ok=True)
        # setdefault (instead of an explicit keyword) avoids a
        # "multiple values for keyword argument" TypeError when the caller
        # passes heading_style themselves.
        kwargs.setdefault('heading_style', markdownify.ATX)
        super().__init__(*args, **kwargs)

    @staticmethod
    def _srcset_url(source):
        """Return the first URL of a ``<source srcset=...>`` tag, or None."""
        if source is None:
            return None
        candidates = (source.get('srcset') or '').split()
        return candidates[0] if candidates else None

    def _name_from_title(self, soup):
        """Derive a folder name from the page ``<title>`` ('' if absent)."""
        title = soup.select_one('title')
        if title is None:
            return ''
        # Keep only the part before the first "|" of the title text.
        headline = title.get_text(strip=True).split('|')[0].strip()
        return headline.translate(self._NAME_TRANSLATION)

    def _download_image(self, url):
        """Download ``url`` into the save directory.

        Returns:
            The local file name, or None when the server answered with a
            non-success status (nothing is written in that case).

        Raises:
            requests.RequestException: on connection errors or timeout.
        """
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        if not response.ok:
            # Do not store an HTML error page under an image file name.
            print(f'Image download failed ({response.status_code}): {url}')
            return None

        # Strip parameters such as "; charset=..." which guess_extension
        # does not understand, and tolerate a missing header.
        mime_type = response.headers.get('Content-Type', '').split(';')[0].strip()
        ext = mimetypes.guess_extension(mime_type) if mime_type else None
        if not ext:
            # Fall back to the extension in the URL path (may be empty).
            ext = os.path.splitext(url.split('?', 1)[0])[1]

        filename = f"{self.image_i}{ext}"
        self.image_i += 1
        with open(f"{self.base_save_path}{filename}", 'wb') as f:
            f.write(response.content)
        print(f'Image saved as {self.base_save_path}{filename}')
        return filename

    def convert_figure(self, el, text, convert_as_inline):
        """Render a ``<figure>`` as ``![alt](src)`` plus an italic caption.

        Returns an empty string when the figure has no ``<img>`` or no
        usable image URL. The alt text is the image's ``alt``, else its
        ``title``, else the caption.
        """
        img = el.find('img')
        if img is None:
            return ''
        src = img.attrs.get('src') or self._srcset_url(el.select_one('source'))
        if not src:
            return ''

        figcaption = el.find('figcaption')
        caption = figcaption.get_text(strip=True) if figcaption is not None else ''
        alt = img.attrs.get('alt') or img.attrs.get('title') or caption

        image_md = f'![{alt}]({src})'
        return f'{image_md}\n*{caption}*' if caption else image_md

    def convert_soup(self, soup):
        """Convert the article ``<section>`` of ``soup`` to Markdown.

        Falls back to the whole document when no ``<section>`` exists.
        When saving is enabled the result is also written to
        ``<base_save_path>output.md`` as UTF-8.

        Returns:
            The Markdown text.
        """
        section = soup.select_one('section')
        if section is None:
            section = soup

        for ignored in section.select('.speechify-ignore'):
            ignored.decompose()
        for separator in section.select('div[role="separator"]'):
            # Replace with an hr tag so it is rendered as a horizontal rule.
            separator.replace_with(soup.new_tag('hr'))

        if self.save:
            if not self.name:
                self.name = self._name_from_title(soup)
            # Append the article folder only once per converter instance.
            if self.name and not self._name_in_path:
                self.base_save_path += self.name + '/'
                self._name_in_path = True
            os.makedirs(self.base_save_path, exist_ok=True)

        converted = super().convert_soup(section)

        if self.save:
            filename = 'output.md'
            # Explicit encoding: articles routinely contain non-ASCII text.
            with open(f"{self.base_save_path}{filename}", 'w', encoding='utf-8') as f:
                f.write(converted)
            print(f'Document saved as {self.base_save_path}{filename}')
        return converted

    def convert_img(self, el, text, convert_as_inline):
        """Convert an ``<img>``, downloading it first when saving is enabled.

        The image URL is the ``src`` attribute or, failing that, the first
        ``srcset`` URL of the last ``<source>`` under the parent element.
        After a successful download ``src`` is rewritten to the local file
        name; otherwise the remote URL is kept.
        """
        if self.save:
            src = el.get('src')
            if not src and el.parent is not None:
                sources = el.parent.select('source')
                src = self._srcset_url(sources[-1]) if sources else None
            if src:
                el.attrs['src'] = self._download_image(src) or src
        return super().convert_img(el, text, convert_as_inline)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment