Last active
March 19, 2023 01:06
-
-
Save Tantalus13A98B5F/0eea45404fe0a2f96f0160f457c6b46e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
from pathlib import Path | |
from contextlib import contextmanager | |
import subprocess as subp | |
# In[2]:

# Download the site's pre-rendered Gatsby page data and flatten the list of
# article edges (allMarkdownRemark) into a DataFrame.
rsp = requests.get('http://tiarkrompf.github.io/notes/page-data.json')
rsp.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
content = pd.json_normalize(rsp.json()['result']['data']['allMarkdownRemark']['edges'])

# json_normalize produces dotted column names (e.g. 'node.frontmatter.title');
# keep only the last path segment.
content.columns = content.columns.str.split('.').map(lambda x: x[-1])
# Strip the shared path prefix so 'slug' can serve as a relative directory name.
# regex=False: this is a plain substring replacement (and is stable across
# pandas versions, where the regex default has changed).
content['slug'] = content['slug'].str.replace('/Public/Generic/', '', regex=False)
content.convert_dtypes().dtypes

# In[3]:

# Quick look at the article metadata columns.
content['title date slug description'.split()]
# In[4]:

def parse_soup(content, idx, fmt):
    """Render one article (row *idx* of *content*) to disk as HTML or Markdown.

    Parses the row's 'html' column with BeautifulSoup, pops every marked
    <aside> (and popout <code>) out into its own file/sub-directory —
    replacing it in the parent document with a link — then writes the
    document either verbatim ('html') or converted via pandoc ('md').

    Side effects: creates directories and files under the row's 'slug'
    path; for fmt='md' it invokes the external `pandoc` binary.
    """
    assert fmt in ['html', 'md']
    soup = BeautifulSoup(content.loc[idx, 'html'], 'lxml')
    # Mark every aside for extraction by the popout loop in recur() below.
    for item in soup.find_all('aside'):
        item.attrs['popout'] = 'true'
    if fmt == 'md':
        # For Markdown output keep only the MathML form of each formula;
        # pandoc can consume MathML, the katex HTML markup would be noise.
        for item in soup.find_all(class_='math'):
            if sub := item.find(class_='katex-mathml'):
                item.replace_with(sub.find())

    def add_utf8(html):
        # Prepend <head><meta charset="utf-8"></head> so consumers decode
        # the written fragment correctly.
        head = soup.new_tag('head')
        head.append(soup.new_tag('meta', charset='utf-8'))
        html.insert(0, head)

    cnt_aside = 0  # running count of numbered asides across the document

    def inc_aside(sub):
        # Advance the aside counter and return this element's number.
        # An <aside> with an explicit id is named by that id instead and
        # consumes no number — return None for it.
        nonlocal cnt_aside
        cur_aside = None  # fix: was unbound (UnboundLocalError) for <aside id=...>
        if not (sub.name == 'aside' and 'id' in sub.attrs):
            cur_aside = cnt_aside + 1
            # An element may reserve extra numbers via a whitespace-separated
            # 'asides' attribute.
            cnt_aside += 1 + len(sub.attrs.get('asides', '').split())
        return cur_aside

    def wrap_aside(sub):
        # Promote the extracted element into a standalone <html><body> document.
        newsub = soup.new_tag('html')
        sub.name = 'body'
        newsub.append(sub)
        return newsub

    def extract_aside(cur_aside, sub):
        # Replace the aside in place with a link paragraph; return the link
        # and the aside wrapped as its own document.
        p = soup.new_tag('p')
        fn = sub.attrs.get('id', f'aside{cur_aside}')
        a = soup.new_tag('a', href=f'{fn}/index.{fmt}')
        a.string = sub.find().text  # link text: the aside's first child element
        p.append(a)
        sub.replace_with(p)
        return p, wrap_aside(sub)

    def ref_code(fname, sub):
        # Build a link paragraph pointing at an extracted code file, echoing
        # the element's remaining attributes in the link text.
        p = soup.new_tag('p')
        a = soup.new_tag('a', href=fname)
        attrs = sub.attrs.copy()
        attrs.pop('class', None)  # fix: tolerate code tags without a class attribute
        a.string = f'{fname} => {attrs}'
        p.append(a)
        return p

    def extract_code(cur_aside, sub):
        # Replace a popout code block with a link to aside<N>.js.
        p = ref_code(f'aside{cur_aside}.js', sub)
        sub.replace_with(p)
        return p, sub

    def annotate_inline(idx, sub):
        # Keep an inline runScript block in the document (as <pre>) but also
        # dump it to inline<idx>.js, inserting a reference link before it.
        sub.name = 'pre'
        sub.attrs['class'] = 'javascript'
        p = ref_code(f'inline{idx}.js', sub)
        sub.insert_before(p)
        return p, sub

    path = Path(content.loc[idx, 'slug'])
    path.mkdir(exist_ok=True)
    # Empty jsconfig.json so editors treat the dumped .js files as one project.
    with (path / 'jsconfig.json').open('w') as f:
        print('{}', file=f)

    def dump_code(ref, sub):
        # Write the extracted element's text to the file its link points at.
        with (path / ref.a.attrs['href']).open('w') as f:
            f.write(sub.text)

    @contextmanager
    def chdir_aside(ref):
        # Temporarily descend into the aside's directory (creating it),
        # restoring the previous path even on error.
        nonlocal path
        orig, path = path, (path / ref.a.attrs['href']).parent
        try:
            path.mkdir(exist_ok=True)
            yield
        finally:
            path = orig

    def dump_top(top):
        # Write the document: verbatim HTML, or through pandoc for Markdown.
        if fmt == 'html':
            with (path / 'index.html').open('w') as f:
                f.write(str(top))
        elif fmt == 'md':
            cmd = 'pandoc -f html -t gfm --wrap auto --columns 80 -o'.split()
            subp.run([*cmd, path / 'index.md'], input=str(top), text=True)

    def recur(top):
        # Depth-first extraction: pop out asides/code (recursing into each
        # extracted aside), then write the now link-only document itself.
        add_utf8(top)
        while sub := top.find(popout='true'):
            cur = inc_aside(sub)
            sub.attrs.pop('popout')
            if sub.name == 'code':
                ref, sub = extract_code(cur, sub)
                dump_code(ref, sub)
            elif sub.name == 'aside':
                ref, sub = extract_aside(cur, sub)
                with chdir_aside(ref):
                    recur(sub)
        for idx, sub in enumerate(top.find_all('code', class_='runScript')):
            ref, sub = annotate_inline(idx, sub)
            dump_code(ref, sub)
        dump_top(top)

    recur(soup.html)
# In[5]:

# Render every article to Markdown; parse_soup creates one directory tree
# per row's 'slug' and shells out to pandoc for the conversion.
for i in content.index:
    parse_soup(content, i, 'md')

# In[ ]:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment