Skip to content

Instantly share code, notes, and snippets.

@Tantalus13A98B5F
Last active March 19, 2023 01:06
Show Gist options
  • Save Tantalus13A98B5F/0eea45404fe0a2f96f0160f457c6b46e to your computer and use it in GitHub Desktop.
Save Tantalus13A98B5F/0eea45404fe0a2f96f0160f457c6b46e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
from contextlib import contextmanager
import subprocess as subp
# In[2]:
# Fetch the page index of the notes site.  The timeout keeps the script from
# blocking forever on a stalled connection, and raise_for_status() reports an
# HTTP error directly instead of letting it surface later as a JSON/KeyError.
rsp = requests.get('http://tiarkrompf.github.io/notes/page-data.json', timeout=30)
rsp.raise_for_status()
# One row per entry of result.data.allMarkdownRemark.edges.
content = pd.json_normalize(rsp.json()['result']['data']['allMarkdownRemark']['edges'])
# Flattened columns carry dotted paths; keep only the last path component.
content.columns = content.columns.str.split('.').map(lambda x: x[-1])
# Remove the literal '/Public/Generic/' from each slug.  regex=False makes the
# literal match explicit rather than dependent on the pandas version default.
content['slug'] = content['slug'].str.replace('/Public/Generic/', '', regex=False)
# Notebook cell output: shows the inferred dtypes (no effect when run as a script).
content.convert_dtypes().dtypes
# In[3]:
# Notebook cell output: preview of the four columns of interest.
content['title date slug description'.split()]
# In[4]:
def parse_soup(content, idx, fmt):
    """Export one note as a directory tree of HTML or Markdown pages.

    The note's HTML is read from ``content.loc[idx, 'html']`` and the output is
    written under the directory named by ``content.loc[idx, 'slug']`` (relative
    to the current working directory).  Within each page:

    * every element carrying ``popout="true"`` is popped out of the page.  All
      ``<aside>`` elements are given that attribute here; a ``<code>`` element
      is only handled if it already carries it in the input.
      - a pop-out ``<aside>`` becomes its own page in a sub-directory named
        after its ``id`` (or ``aside<N>`` when it has none) and is replaced
        by a paragraph linking to ``<dir>/index.<fmt>``;
      - a pop-out ``<code>`` is written to ``aside<N>.js`` and replaced by a
        paragraph linking to that file;
    * every ``<code class="runScript">`` is turned into
      ``<pre class="javascript">``, preceded by a link to ``inline<K>.js``,
      and its text is written to that file;
    * the page itself is written as ``index.html`` or, for ``fmt == 'md'``,
      piped through ``pandoc`` to produce ``index.md``.

    Args:
        content: table indexed by ``idx`` with at least ``html`` and ``slug``
            columns (accessed through ``.loc``).
        idx: row label of the note to export.
        fmt: ``'html'`` or ``'md'``.

    Raises:
        ValueError: if ``fmt`` is neither ``'html'`` nor ``'md'``.

    Note:
        ``fmt == 'md'`` runs the external ``pandoc`` executable; its exit
        status is not checked.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if fmt not in ('html', 'md'):
        raise ValueError(f"fmt must be 'html' or 'md', got {fmt!r}")

    soup = BeautifulSoup(content.loc[idx, 'html'], 'lxml')
    for item in soup.find_all('aside'):
        item.attrs['popout'] = 'true'
    if fmt == 'md':
        # Replace each rendered math element by the first tag found inside its
        # 'katex-mathml' descendant before the page is handed to pandoc.
        for item in soup.find_all(class_='math'):
            if sub := item.find(class_='katex-mathml'):
                item.replace_with(sub.find())

    def add_utf8(html):
        """Insert ``<head><meta charset="utf-8"></head>`` as first child."""
        head = soup.new_tag('head')
        head.append(soup.new_tag('meta', charset='utf-8'))
        html.insert(0, head)

    cnt_aside = 0

    def inc_aside(sub):
        """Return the next aside number, or None for an ``<aside>`` with an id.

        The counter advances by one plus the number of whitespace-separated
        tokens in the element's ``asides`` attribute (presumably reserving
        numbers for asides listed there -- TODO confirm).
        """
        nonlocal cnt_aside
        if sub.name == 'aside' and 'id' in sub.attrs:
            # Named asides use their id as directory name; no number needed.
            return None
        cur_aside = cnt_aside + 1
        cnt_aside += 1 + len(sub.attrs.get('asides', '').split())
        return cur_aside

    def wrap_aside(sub):
        """Rename ``sub`` to ``<body>`` and wrap it in a new ``<html>`` tag."""
        newsub = soup.new_tag('html')
        sub.name = 'body'
        newsub.append(sub)
        return newsub

    def extract_aside(cur_aside, sub):
        """Replace ``sub`` by a link paragraph; return (link, standalone page)."""
        p = soup.new_tag('p')
        fn = sub.attrs.get('id', f'aside{cur_aside}')
        a = soup.new_tag('a', href=f'{fn}/index.{fmt}')
        # Link text is the text of the aside's first child tag; fall back to
        # the directory name when the aside contains no tag at all.
        first = sub.find()
        a.string = first.text if first is not None else fn
        p.append(a)
        sub.replace_with(p)
        return p, wrap_aside(sub)

    def ref_code(fname, sub):
        """Build ``<p><a href=fname>fname => {other attrs}</a></p>``."""
        p = soup.new_tag('p')
        a = soup.new_tag('a', href=fname)
        attrs = sub.attrs.copy()
        # A pop-out <code> element may carry no class attribute at all.
        attrs.pop('class', None)
        a.string = f'{fname} => {attrs}'
        p.append(a)
        return p

    def extract_code(cur_aside, sub):
        """Replace a pop-out code element by a link to ``aside<N>.js``."""
        p = ref_code(f'aside{cur_aside}.js', sub)
        sub.replace_with(p)
        return p, sub

    def annotate_inline(idx, sub):
        """Turn ``sub`` into ``<pre class="javascript">`` preceded by a link."""
        sub.name = 'pre'
        sub.attrs['class'] = 'javascript'
        p = ref_code(f'inline{idx}.js', sub)
        sub.insert_before(p)
        return p, sub

    # Current output directory; temporarily rebound by chdir_aside().
    path = Path(content.loc[idx, 'slug'])
    # parents=True: the slug may contain more than one path component.
    path.mkdir(parents=True, exist_ok=True)
    with (path / 'jsconfig.json').open('w', encoding='utf-8') as f:
        print('{}', file=f)

    def dump_code(ref, sub):
        """Write the text of ``sub`` to the file that ``ref`` links to."""
        with (path / ref.a.attrs['href']).open('w', encoding='utf-8') as f:
            f.write(sub.text)

    @contextmanager
    def chdir_aside(ref):
        """Temporarily make the directory of ``ref``'s target the output dir."""
        nonlocal path
        orig, path = path, (path / ref.a.attrs['href']).parent
        try:
            path.mkdir(parents=True, exist_ok=True)
            yield
        finally:
            path = orig

    def dump_top(top):
        """Write ``top`` as ``index.html`` or convert it to ``index.md``."""
        if fmt == 'html':
            # The page declares charset=utf-8, so the bytes must match.
            with (path / 'index.html').open('w', encoding='utf-8') as f:
                f.write(str(top))
        elif fmt == 'md':
            cmd = 'pandoc -f html -t gfm --wrap auto --columns 80 -o'.split()
            # pandoc reads UTF-8; do not rely on the locale's default encoding.
            subp.run([*cmd, path / 'index.md'], input=str(top), text=True,
                     encoding='utf-8')

    def recur(top):
        """Pop out asides/code from ``top`` (recursing into asides), then dump it."""
        add_utf8(top)
        # Each iteration removes the 'popout' attribute from the element it
        # found, so the search eventually comes up empty.
        while sub := top.find(popout='true'):
            cur = inc_aside(sub)
            sub.attrs.pop('popout')
            if sub.name == 'code':
                ref, sub = extract_code(cur, sub)
                dump_code(ref, sub)
            elif sub.name == 'aside':
                ref, sub = extract_aside(cur, sub)
                with chdir_aside(ref):
                    recur(sub)
        for inline_no, sub in enumerate(top.find_all('code', class_='runScript')):
            ref, sub = annotate_inline(inline_no, sub)
            dump_code(ref, sub)
        dump_top(top)

    recur(soup.html)
# In[5]:
# Export every fetched note as a Markdown directory tree.
for row_label in content.index:
    parse_soup(content, row_label, fmt='md')
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment