Skip to content

Instantly share code, notes, and snippets.

@vgel
Created November 8, 2023 20:34
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vgel/d2fa6c8261fcc6098f6ecc314b0fe8ba to your computer and use it in GitHub Desktop.
Save vgel/d2fa6c8261fcc6098f6ecc314b0fe8ba to your computer and use it in GitHub Desktop.
ar5iv
# pip install beautifulsoup4 requests markdownify mistletoe
# command line usage: python chunkpaper.py 'https://ar5iv.org/abs/1910.06709'
# will dump the HTML, Markdown, and finally the chunk JSON
# note that (as of Nov 8 '23) ar5iv only has arxiv papers converted up to the end of October,
# but if you need something more recent you could probably do the LaTeX→HTML conversion yourself
# library usage: call `chunk` with some ar5iv HTML. will probably choke on anything else
import dataclasses
import re
from typing import Any
import bs4
import requests
from markdownify import markdownify
import mistletoe
import mistletoe.block_token
import mistletoe.markdown_renderer
# Rendered blocks shorter than this many characters are candidates for being
# merged into the previous chunk (when both sit under the same headings).
MIN_BLOCK_SIZE = 128
# A merge only happens while the combined chunk (previous content + blank line
# + new block) stays below this many characters.
MAX_BLOCK_SIZE = 2048
@dataclasses.dataclass
class Chunk:
    """A piece of Markdown text together with the headings it appears under."""

    # Titles of the enclosing headings, outermost first.
    headings: list[str]
    # Markdown text of the chunk; merged blocks are separated by a blank line.
    content: str
@dataclasses.dataclass
class ChunkingResult:
    """Bundle of the intermediate and final outputs of a chunking run."""

    # HTML document as a string.
    html: str
    # Markdown text the chunks were derived from.
    markdown: str
    # Chunks in document order.
    chunks: list[Chunk]

    def chunks_jsonable(self) -> list[dict[str, Any]]:
        """Return the chunks as plain dicts (via `dataclasses.asdict`)."""
        return [dataclasses.asdict(c) for c in self.chunks]
def _flatten_article(article: "bs4.Tag") -> None:
    """Simplify *article* in place so that markdownify emits clean Markdown."""
    # Replace each <math> element with its `alttext` attribute wrapped in
    # `$...$`.
    for math_tag in article.select("math"):
        latex = math_tag.attrs.get("alttext")
        if latex is None:
            # No alttext on this element: fall back to the tag's own text
            # instead of aborting the whole document with a KeyError.
            latex = math_tag.get_text()
        math_tag.replace_with("$" + latex.strip() + "$")
    # Citations become plain text.
    for cite in article.select("cite"):
        cite.replace_with(cite.text.strip())
    # Headings are reduced to their text so no inline markup ends up in titles.
    for header in article.select("h1,h2,h3,h4,h5,h6"):
        text = header.text.strip()
        header.clear()
        header.append(text)
    # Drop links with `data:` hrefs entirely; strip `title` from the others.
    for a in article.select("a"):
        if a.attrs.get("href", "").startswith("data:"):
            a.decompose()
        elif "title" in a.attrs:
            del a.attrs["title"]


def _heading_title(block: Any, rendered: str) -> str:
    """Extract the title text from a heading token and its rendered Markdown."""
    if isinstance(block, mistletoe.block_token.SetextHeading):
        return rendered.replace(block.underline, "").strip()
    # ATX heading: the rendered line starts with `level` hash characters.
    # Using the rendered text (rather than only the first child token) keeps
    # titles intact when they contain escapes or several inline spans.
    title = rendered[block.level:]
    closing = getattr(block, "closing_sequence", None)
    if closing and title.endswith(closing):
        title = title[: -len(closing)]
    return title.strip()


def chunk(html: str) -> ChunkingResult:
    """Split an ar5iv HTML page into heading-annotated Markdown chunks.

    The first ``<article>`` element is simplified, converted to Markdown and
    parsed into top-level blocks. Heading blocks update the current heading
    path; every other non-empty block becomes a chunk, except that blocks
    shorter than ``MIN_BLOCK_SIZE`` are appended to the previous chunk when
    both share the same headings and the result stays below
    ``MAX_BLOCK_SIZE``.

    Args:
        html: HTML source of the page.

    Returns:
        A ``ChunkingResult`` holding the prettified (already simplified) HTML,
        the intermediate Markdown, and the list of chunks.

    Raises:
        ValueError: If the document contains no ``<article>`` element.
    """
    soup = bs4.BeautifulSoup(html, features="html.parser")
    article = soup.select_one("article")
    if article is None:
        raise ValueError("missing article")
    _flatten_article(article)

    markdown = markdownify(str(article))
    # Collapse runs of blank lines to a single blank line.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

    # Stack of (level, title) for the headings enclosing the current position.
    # Tracking the level explicitly keeps the path correct even when heading
    # levels are skipped or the document does not start at level 1.
    heading_stack: list[tuple[int, str]] = []
    chunks: list[Chunk] = []

    mistletoe.block_token.reset_tokens()
    # The renderer alters mistletoe's global token tables while alive; the
    # context manager restores them afterwards. The document must be parsed
    # while the renderer is active.
    with mistletoe.markdown_renderer.MarkdownRenderer() as renderer:
        parsed = mistletoe.Document(markdown)
        for block in parsed.children:
            rendered = renderer.render(block).strip()
            if not rendered:
                continue

            is_heading = isinstance(
                block,
                (mistletoe.block_token.SetextHeading, mistletoe.block_token.Heading),
            )
            if is_heading:
                title = _heading_title(block, rendered)
                # Leave every section at the same or a deeper level.
                while heading_stack and heading_stack[-1][0] >= block.level:
                    heading_stack.pop()
                heading_stack.append((block.level, title))
                continue

            headings = [title for _, title in heading_stack]
            previous = chunks[-1] if chunks else None
            should_merge = (
                previous is not None
                and len(rendered) < MIN_BLOCK_SIZE
                and previous.headings == headings
                # +2 accounts for the "\n\n" separator.
                and len(previous.content) + 2 + len(rendered) < MAX_BLOCK_SIZE
            )
            if should_merge:
                chunks[-1] = Chunk(
                    headings=headings,
                    content=previous.content + "\n\n" + rendered,
                )
            else:
                chunks.append(Chunk(headings=headings, content=rendered))

    return ChunkingResult(
        html=soup.prettify(),
        markdown=markdown,
        chunks=chunks,
    )
if __name__ == "__main__":
    # Command-line entry point: fetch the URL given as the first argument and
    # dump the HTML, the Markdown, and the chunks, separated by ruled gaps.
    import pprint
    import sys

    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} <ar5iv-url>")

    # Without a timeout, requests may wait forever on an unresponsive server.
    r = requests.get(sys.argv[1], timeout=30)
    r.raise_for_status()
    result = chunk(r.text)

    separator = "\n" * 5 + "-" * 40 + "\n" * 5
    print(result.html)
    print(separator)
    print(result.markdown)
    print(separator)
    pprint.pprint(result.chunks_jsonable())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment