Skip to content

Instantly share code, notes, and snippets.

@ujovlado
Forked from sk-t3ch/medium-to-markdown.py
Created January 31, 2022 16:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ujovlado/e16e9326942aac1572adafb213cf0e42 to your computer and use it in GitHub Desktop.
Save ujovlado/e16e9326942aac1572adafb213cf0e42 to your computer and use it in GitHub Desktop.
Medium-to-Markdown script for the https://medium-to-markdown.t3chflicks.org service
import re
from datetime import datetime
import json
from io import BytesIO
import requests
# Base URL for Medium-hosted images; the renderers in this file append
# "<width>/<image id>" to it to build the final image source.
MEDIUM_IMG_CDN = "https://cdn-images-1.medium.com/max/"
def MediumToMarkdownBuilder(request_get):
    """Build a converter that turns a Medium post URL into Markdown.

    Args:
        request_get: Callable taking a URL and returning a response object.
            It is passed on to ``load_medium_json`` and ``process_paragraph``
            so that all HTTP access goes through the injected fetcher.

    Returns:
        A function ``convert(url) -> str`` that returns the Markdown text.
    """

    def _func(url):
        """Fetch the post at *url* and render it as one Markdown string.

        Raises:
            Exception: If the payload's ``type`` is not ``"Post"``.
        """
        medium_post = load_medium_json(request_get)(url)
        if medium_post["type"] != "Post":
            raise Exception('Not a Medium Article')

        body_model = medium_post["content"]["bodyModel"]
        paragraphs = body_model["paragraphs"]
        sections = [process_section(s) for s in body_model["sections"]]
        render_paragraph = process_paragraph(request_get)

        # Every line of a multi-line title becomes its own H1 line.
        markdown = ["# " + medium_post["title"].replace('\n', '\n# ')]

        # The second paragraph is used as the subtitle; its first run of '#'
        # characters is removed so it cannot render as a heading.
        subtitle = paragraphs[1]["text"] if len(paragraphs) > 1 else ""
        if subtitle:
            markdown.append("\n" + re.sub(r'#+', '', subtitle, count=1))

        # The first two paragraphs are not rendered in the body: paragraph 1
        # was emitted above as the subtitle, and paragraph 0 is presumably
        # the title -- confirm against a real payload.
        for para_idx, paragraph in enumerate(paragraphs):
            if para_idx < 2:
                continue
            # NOTE(review): the section list is indexed by paragraph index
            # here; whether sections line up with paragraph positions is not
            # visible in this file -- confirm against the payload schema.
            if para_idx < len(sections):
                markdown.append(sections[para_idx])
            markdown.append(render_paragraph(paragraph))

        return '\n'.join(markdown)

    return _func
def load_medium_json(fetch):
    """Build a loader that fetches the JSON representation of a Medium URL.

    Args:
        fetch: Callable taking a URL and returning an object with a ``text``
            attribute holding the response body.

    Returns:
        A function ``load(url)`` returning ``payload.value`` of the decoded
        JSON document.
    """

    def _func(url):
        """Request *url* with ``format=json`` and return ``payload.value``.

        Raises:
            ValueError: If the response body contains no ``{`` or the JSON
                cannot be decoded.
            KeyError: If the document has no ``payload``/``value`` entry.
        """
        # Extend an existing query string instead of starting a second one.
        separator = "&" if "?" in url else "?"
        response = fetch(url + separator + "format=json")
        text = response.text
        # Everything before the first '{' is discarded before parsing
        # (presumably a non-JSON guard prefix added by Medium -- confirm).
        document = json.loads(text[text.index('{'):])
        return document["payload"]["value"]

    return _func
def process_section(s):
    """Return the Markdown image line for a section's background image.

    Args:
        s: Section mapping; only its optional ``backgroundImage`` entry
            (with ``originalWidth`` and ``id``) is read.

    Returns:
        ``"\\n![](<image url>)"`` when a background image is present,
        otherwise an empty string.
    """
    background_image = s.get("backgroundImage", False)
    if not background_image:
        return ""

    original_width = int(background_image["originalWidth"])
    # Request twice the original width, but never less than 2000.
    requested_width = max(original_width * 2, 2000)
    image_url = f"{MEDIUM_IMG_CDN}{requested_width}/{background_image['id']}"
    return "\n![](" + image_url + ")"
def get_embed(fetch):
    """Build a resolver that renders a Medium media resource as an embed.

    Args:
        fetch: Callable taking a URL and returning a response object; used
            to load the media description and handed to the GitHub renderer.

    Returns:
        A function ``resolve(url) -> str`` returning the embed markup, or an
        empty string when the media's domain is not supported.
    """

    def _func(url):
        """Load the media JSON at *url* and dispatch on its ``domain``."""
        embed_json = load_medium_json(fetch)(url)
        # A payload without a domain is treated as unsupported rather than
        # aborting the whole conversion with a KeyError.
        domain = embed_json.get("domain")
        if domain in ("www.github.com", "gist.github.com"):
            return get_GitHub_embed(fetch)(embed_json)
        if domain == "www.youtube.com":
            return get_YouTube_embed(embed_json)
        print(f"Unsupported embed domain: {domain!r} ({url})")
        return ""

    return _func
def get_GitHub_embed(fetch):
    """Build a renderer that inlines a GitHub gist as fenced code blocks.

    Args:
        fetch: Callable taking a URL and returning a response object that
            offers ``json()`` (gist metadata) and ``text`` (raw file body).

    Returns:
        A function ``render(embed_json) -> str``. It returns one fenced code
        block per gist file, or an empty string when the payload has no gist
        or anything goes wrong while fetching/decoding (best effort: the
        error is printed, not raised).
    """

    def _func(embed_json):
        """Render the gist referenced by ``embed_json["gist"]["gistId"]``."""
        try:
            gist = embed_json.get("gist")
            if not gist:
                return ""

            gist_url = f"https://api.github.com/gists/{gist['gistId']}"
            gist_json = fetch(gist_url).json()

            blocks = []
            for gist_file in gist_json["files"].values():
                # The language may be None; use it lower-cased as fence tag.
                language = (gist_file.get("language") or "").lower()
                # Each tab becomes a single space, matching the replacement
                # string the original code intended to apply.
                code = fetch(gist_file["raw_url"]).text.replace('\t', ' ')
                blocks.append('\n```' + language + '\n' + code + '\n```\n')

            # Drop the final newline of the last fence.
            return ''.join(blocks)[:-1]
        except Exception as err:
            # Deliberately broad: a failing embed must not abort the whole
            # article conversion.
            print("ERR: ", err)
            return ""

    return _func
def get_YouTube_embed(embed_json):
    """Return iframe markup for a YouTube media resource.

    Args:
        embed_json: Mapping with an ``iframeSrc`` string. When that string
            contains a URL-encoded ``youtube.com/embed/<id>?`` fragment, the
            video id is extracted from it.

    Returns:
        A centered YouTube ``<iframe>`` when a video id is found; otherwise
        a plain ``<iframe>`` pointing at ``iframeSrc`` itself.
    """
    body = embed_json["iframeSrc"]
    # %2F and %3F are the URL-encoded '/' and '?'.
    match = re.search(r"youtube\.com%2Fembed%2F([^%]+)%3F", body)
    if match:
        video_id = match.group(1)
        return f"<center><iframe width='560' height='315' src ='https://www.youtube.com/embed/{video_id}' frameborder='0' allowfullscreen></iframe></center>"
    return f"<iframe src='{body}' frameborder=0></iframe>"
def process_paragraph(fetch):
    """Build a renderer that converts one Medium paragraph to Markdown.

    Args:
        fetch: Callable taking a URL and returning a response object; only
            used for embedded media (paragraph type 11).

    Returns:
        A function ``render(p) -> str``. ``p`` is a paragraph mapping with
        ``type`` and ``text`` plus, depending on the type, ``markups``,
        ``metadata``, ``iframe`` and ``alignment``. The mapping is not
        modified.
    """
    # Paragraph types rendered as a fixed prefix in front of the text.
    prefixes = {1: "\n", 6: "> ", 9: "\n* ", 10: "\n1. ", 13: "\n### "}

    def _func(p):
        """Render paragraph *p* and return the Markdown fragment."""
        paragraph_type = p["type"]
        text = p.get("text", "")

        # Splice inline markup (bold, links, ...) into the text. Slot i of
        # the array holds the tags to insert in front of character i.
        markups_array = create_markups_array(p.get("markups"))
        if markups_array:
            tokens = []
            previous_index = 0
            for index, markup in enumerate(markups_array):
                if markup is not None:
                    tokens.append(text[previous_index:index])
                    tokens.append(markup)
                    previous_index = index
            # Resume from the last emitted position so that no text is lost
            # when the final slots of the array are empty.
            tokens.append(text[previous_index:])
            text = ''.join(tokens)

        if paragraph_type == 2:
            text = "\n# " + text.replace('\n', '\n# ')
        elif paragraph_type == 3:
            text = "\n## " + text.replace('\n', '\n## ')
        elif paragraph_type == 4:
            # Image with optional caption.
            img_width = int(p["metadata"]["originalWidth"])
            img_src = f"{MEDIUM_IMG_CDN}{max(img_width * 2, 2000)}/{p['metadata']['id']}"
            image = "\n![" + text + "](" + img_src + ")"
            if text:
                # Caption goes in its own paragraph, wrapped in emphasis.
                image += "\n\n*" + text + "*"
            text = image
        elif paragraph_type == 7:
            # Quote rendered as a heading inside a blockquote.
            text = "> # " + text.replace('\n', '\n> # ')
        elif paragraph_type == 8:
            # NOTE(review): a Markdown indented code block needs four leading
            # spaces; the one-space indent is kept exactly as found --
            # confirm whether whitespace was lost in this string.
            text = "\n " + text.replace('\n', '\n ')
        elif paragraph_type == 11:
            # Embedded media is resolved remotely and returned as-is, without
            # the alignment handling below.
            media_url = f"https://medium.com/media/{p['iframe']['mediaResourceId']}"
            embed = get_embed(fetch)(media_url)
            return f"\n{embed}"
        elif paragraph_type == 15:
            # Caption for a section image.
            text = "*" + text + "*"
        else:
            text = prefixes.get(paragraph_type, "") + text

        # Alignment 2 is wrapped in <center>, except for the two quote types.
        if p.get("alignment", False) == 2 and paragraph_type not in (6, 7):
            text = "<center>" + text + "</center>"
        return text

    return _func
def add_markup(markups_array, open, close, start, end):
    """Record an opening and a closing tag in *markups_array* (in place).

    Args:
        markups_array: List whose slot ``i`` holds the tag text to insert at
            position ``i``, or a falsy value when nothing is recorded yet.
        open: Tag text for position ``start``. (The name shadows the builtin
            but is kept because it is part of the signature.)
        close: Tag text for position ``end``.
        start: Index receiving ``open``.
        end: Index receiving ``close``.

    Returns:
        The same ``markups_array`` object.
    """
    # Process start before end so that, when both indices coincide, the
    # closing tag is appended after the opening one.
    for position, tag in ((start, open), (end, close)):
        existing = markups_array[position]
        markups_array[position] = existing + tag if existing else tag
    return markups_array
def create_markups_array(markups):
    """Turn a list of inline markups into a position-indexed tag array.

    Args:
        markups: Iterable of mappings with ``type``, ``start`` and ``end``
            (and ``href`` for type 3), or ``None``/empty.

    Returns:
        ``[]`` when there are no markups. Otherwise a list of length
        ``max(end) + 1`` whose slot ``i`` is ``None`` or the Markdown tag
        text to insert at position ``i``. Markups of an unknown type are
        reported on stdout and otherwise ignored.
    """
    if not markups:
        return []

    # Markup types whose tags do not depend on the markup's own data.
    simple_tags = {
        1: ("**", "**"),    # bold
        2: ("*", "*"),      # italic
        8: ("```", "```"),  # code tag
        10: ("`", "`"),     # code tag
    }

    markups_array = [None] * (max(m["end"] for m in markups) + 1)
    for m in markups:
        markup_type = m["type"]
        if markup_type == 3:
            # Anchor tag.
            add_markup(markups_array, "[", "](" + m["href"] + ")",
                       m["start"], m["end"])
        elif markup_type in simple_tags:
            opening, closing = simple_tags[markup_type]
            add_markup(markups_array, opening, closing, m["start"], m["end"])
        else:
            # Formatted rather than concatenated: the type is not a string.
            print(f"Unknown markup type {markup_type}", m)
    return markups_array
def process_event(event):
    """Decode the JSON ``body`` of the first entry in ``event['Records']``.

    Args:
        event: Mapping with a ``Records`` sequence whose first item has a
            JSON-encoded ``body`` string.

    Returns:
        The object decoded from that body.
    """
    records = event['Records']
    body = records[0]['body']
    return json.loads(body)
# Ready-to-use converter that fetches over HTTP with ``requests.get``.
MediumToMarkdown = MediumToMarkdownBuilder(requests.get)

# Sample article used when the file is run as a script.
url = "https://medium.com/nerd-for-tech/smart-buoy-summary-602f9db544bb"

if __name__ == "__main__":
    # Guarded so that importing this module performs no network request.
    medium_post = MediumToMarkdown(url)
    print(medium_post)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment