Skip to content

Instantly share code, notes, and snippets.

@AlmogBaku
Last active July 7, 2024 10:00
Show Gist options
  • Save AlmogBaku/baffe45fed8188a28c6238806ae6ae58 to your computer and use it in GitHub Desktop.
Save AlmogBaku/baffe45fed8188a28c6238806ae6ae58 to your computer and use it in GitHub Desktop.
Medium to Markdown
import mimetypes
import os
import markdownify
import requests
# Load the MIME type tables once at import time; MediumConverter.convert_img
# later relies on mimetypes.guess_extension() to pick image file extensions.
mimetypes.init()
class MediumConverter(markdownify.MarkdownConverter):
    """Convert the HTML of a Medium article to Markdown.

    With ``save=True`` the converted Markdown is written to
    ``out/<name>/output.md`` and every ``<img>`` encountered is downloaded
    next to it, with the image reference rewritten to the local file name.

    Note: when saving, ``convert_soup`` appends the article name to
    ``base_save_path`` on every call, so use one converter instance per
    article.

    Args:
        save: When true, write the Markdown and the images to disk.
        name: Directory name (under ``out/``) for the saved article. When
            empty, it is derived from the page ``<title>``.
        *args, **kwargs: Forwarded to ``markdownify.MarkdownConverter``.
            ``heading_style`` defaults to ``markdownify.ATX`` but may be
            overridden by the caller.
    """

    # Characters rewritten when a directory name is derived from the title:
    # spaces and colons become underscores; question/exclamation marks and
    # curly quotes (U+2019, U+2018, U+201C, U+201D) are dropped everywhere.
    _NAME_TRANSLATION = str.maketrans({
        ' ': '_',
        ':': '_',
        '?': None,
        '!': None,
        '\u2019': None,
        '\u2018': None,
        '\u201c': None,
        '\u201d': None,
    })

    # Seconds to wait for an image download before giving up.
    _IMAGE_TIMEOUT = 30

    # Running counter used to name downloaded images (0.png, 1.jpg, ...).
    image_i = 0

    def __init__(self, save=False, name=None, *args, **kwargs):
        self.save = save
        self.name = name
        if save:
            self.base_save_path = 'out/'
            os.makedirs(self.base_save_path, exist_ok=True)
        # setdefault (instead of a hard-coded keyword) keeps ATX as the
        # default while letting callers pass their own heading_style.
        kwargs.setdefault('heading_style', markdownify.ATX)
        super().__init__(*args, **kwargs)

    @staticmethod
    def _srcset_url(source):
        """Return the first URL in a <source> element's srcset, or None."""
        if source is None:
            return None
        candidates = (source.get('srcset') or '').split()
        return candidates[0] if candidates else None

    def _download_image(self, src):
        """Download *src* into the save directory.

        Returns the local file name, or None when the download fails.
        """
        try:
            img_resp = requests.get(src, timeout=self._IMAGE_TIMEOUT)
            img_resp.raise_for_status()
        except requests.RequestException as exc:
            print(f'Could not download image {src}: {exc}')
            return None

        # Content-Type may be absent or carry parameters ("image/png; ...").
        mime = img_resp.headers.get('Content-Type', '').split(';')[0].strip()
        # Fall back to the extension in the URL path when the MIME type is
        # unknown, so the file is never named "<n>None".
        ext = (mimetypes.guess_extension(mime)
               or os.path.splitext(src.split('?', 1)[0])[1])
        filename = f"{self.image_i}{ext}"
        self.image_i += 1
        with open(f"{self.base_save_path}{filename}", 'wb') as f:
            f.write(img_resp.content)
        print(f'Image saved as {self.base_save_path}{filename}')
        return filename

    def convert_figure(self, el, text, convert_as_inline):
        """Render a <figure> as a Markdown image plus an italic caption.

        Returns an empty string when the figure has no <img> or no usable
        image URL.
        """
        img_element = el.find('img')
        if img_element is None:
            return ''

        caption_element = el.find('figcaption')
        caption = (caption_element.get_text(strip=True)
                   if caption_element is not None else '')

        # Prefer the <img> src; otherwise take the first srcset candidate of
        # the figure's first <source>, if there is one.
        src = (img_element.attrs.get('src')
               or self._srcset_url(el.select_one('source')))
        if not src:
            return ''

        # Use the already-computed caption as the last alt fallback so a
        # figure without <figcaption> no longer raises AttributeError.
        alt = (img_element.attrs.get('alt')
               or img_element.attrs.get('title')
               or caption)

        if caption:
            return f'![{alt}]({src})\n*{caption}*'
        return f'![{alt}]({src})'

    def convert_soup(self, soup):
        """Convert the article <section> of *soup* and optionally save it.

        Elements with class ``speechify-ignore`` are removed and
        ``div[role="separator"]`` elements are replaced by ``<hr>`` before
        conversion. Returns the Markdown text.
        """
        section = soup.select_one('section')
        if section is None:
            # No <section> found: convert the whole document instead of
            # failing with AttributeError.
            section = soup

        for ignored in section.select('.speechify-ignore'):
            ignored.decompose()
        for separator in section.select('div[role="separator"]'):
            separator.replace_with(soup.new_tag('hr'))

        if self.save:
            if not self.name:
                title = soup.select_one('title')
                if title is not None:
                    # Keep only the part before the first "|" of the title.
                    self.name = (
                        title.get_text(strip=True)
                        .split('|')[0].strip()
                        .translate(self._NAME_TRANSLATION)
                    )
            if self.name:
                self.base_save_path += self.name + '/'
                os.makedirs(self.base_save_path, exist_ok=True)

        converted = super().convert_soup(section)

        if self.save:
            output_path = f"{self.base_save_path}output.md"
            # Explicit UTF-8: articles routinely contain non-ASCII text.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(converted)
            print(f'Document saved as {output_path}')
        return converted

    def convert_img(self, el, text, convert_as_inline):
        """Convert an <img>; when saving, download it and point src locally.

        If the download fails, src is set to the remote URL so the Markdown
        still references the image.
        """
        if self.save:
            src = el.get('src')
            if not src:
                # Fall back to the first srcset candidate of the last
                # <source> sibling, when any exists.
                sources = (el.parent.select('source')
                           if el.parent is not None else [])
                src = self._srcset_url(sources[-1] if sources else None)
            if src:
                filename = self._download_image(src)
                el.attrs['src'] = filename or src
        return super().convert_img(el, text, convert_as_inline)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Medium to Markdown\n",
"This notebook converts a given Medium article to Markdown format."
],
"id": "3eb10085964fdf50"
},
{
"metadata": {},
"cell_type": "code",
"source": "!pip install markdownify bs4 requests",
"id": "initial_id",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Cookies\n",
"To bypass the paywall or to get access to draft articles, you need to provide the cookies.\n",
"\n",
"The cookies available to JavaScript don't contain all the necessary information, so you should copy them from the request headers by inspecting the Network tab in the developer tools, then serialize them using the code below:\n",
"\n",
"```javascript\n",
"var cookiesHeader = 'cookie1=value1; cookie2=value2; cookie3=value3'; // paste the Cookies Header value here\n",
"\n",
"var cookieMap = Object.fromEntries(cookiesHeader.split(';').map(cookie => {\n",
" const [name, value] = cookie.split('=');\n",
" return [name.trim(), value.trim()];\n",
"}));\n",
"console.log(JSON.stringify(cookieMap));\n",
"```\n",
"\n",
"After that, you can copy the output and paste it in the cell below."
],
"id": "2ccfd525333f0b3d"
},
{
"metadata": {},
"cell_type": "code",
"source": "cookies = {} # optional: paste the cookies here",
"id": "e9bbdcd918950f1c",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Convert to markdown\n",
"The code below will convert the article content to markdown."
],
"id": "41b65e845c337050"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import requests\n",
"from MediumConverter import MediumConverter\n",
"\n",
"url = \"https://towardsdatascience.com/building-llm-apps-a-clear-step-by-step-guide-1fe1e6ef60fd\" # for draft articles, use the preview URL\n",
"name = 'llm-development-proccess'\n",
"\n",
"resp = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})\n",
"converter = MediumConverter(save=True, name=name)\n",
"md = converter.convert(resp.text)"
],
"id": "b6e196daa9592311",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Display the markdown content\n",
"The markdown content will be displayed below."
],
"id": "c3520ea3ce59d6e3"
},
{
"metadata": {},
"cell_type": "code",
"source": "print(md) # display the Markdown code",
"id": "e2b26309c5337aef",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"from IPython import display\n",
"\n",
"display.display_markdown(md, raw=True) # display the markdown content"
],
"id": "cb506361c5dfcbad",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment