Skip to content

Instantly share code, notes, and snippets.

@berggren
Created March 5, 2024 20:18
Show Gist options
  • Save berggren/b744a3ac7510207db6a2951e91312121 to your computer and use it in GitHub Desktop.
Save berggren/b744a3ac7510207db6a2951e91312121 to your computer and use it in GitHub Desktop.
import requests
from markdownify import markdownify
from bs4 import BeautifulSoup
from collections.abc import Generator
from api.datastores.chroma import VectorStore
BASE_URL = "https://forensics.wiki/tags/"
def crawler(timeout: float = 30.0) -> Generator[str, None, None]:
    """Crawl the Forensics Wiki tag index and yield links to the articles.

    Fetches ``BASE_URL``, takes the first ``<article>`` element of the page
    and yields one URL for every ``<li>`` inside it that contains an anchor
    carrying an ``href`` attribute. Each URL is ``BASE_URL`` concatenated
    with the raw ``href`` value; no path normalization is done here.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Yields:
        The URL string for each linked list item, in document order.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
        IndexError: If the page contains no ``<article>`` element.
    """
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(BASE_URL, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    html = BeautifulSoup(response.content, "html.parser")

    article = html.find("article")
    if article is None:
        raise IndexError(f"No <article> element found at {BASE_URL}")

    for item in article.find_all("li"):
        anchor = item.find("a")
        if anchor is None:
            continue
        href = anchor.get("href")
        if href is None:
            # An anchor without href cannot be turned into a URL.
            continue
        yield BASE_URL + href
def to_markdown(link: str, timeout: float = 30.0) -> str:
    """Fetch an article page and convert its content to markdown.

    Downloads ``link``, takes the first ``<article>`` element of the page,
    removes its first ``<nav>`` element if there is one, and converts the
    remaining HTML to markdown with ``markdownify``.

    Args:
        link: URL of the article page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The article content as a markdown string.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
        IndexError: If the page contains no ``<article>`` element.
    """
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(link, timeout=timeout)
    # Fail on 4xx/5xx so an error page is never converted as an article.
    response.raise_for_status()
    parsed_page = BeautifulSoup(response.content, "html.parser")

    article = parsed_page.find("article")
    if article is None:
        raise IndexError(f"No <article> element found at {link}")

    # Remove the "nav" element; find() returns None when the article has none.
    nav = article.find("nav")
    if nav is not None:
        nav.extract()

    return markdownify(str(article))
if __name__ == "__main__":
    # Fetch every article linked from the tag index, convert each one to
    # markdown, and embed them all into the "forensics" collection in one call.
    vector_store = VectorStore(collection_name="forensics")
    docs = []
    metadatas = []
    ids = []

    # Materialize the generator so the total is known for progress output.
    links = list(crawler())
    total = len(links)
    for index, link in enumerate(links):
        # Crawled URLs are BASE_URL + href; drop the "tags/../" segment for
        # the source recorded in the metadata and shown to the user.
        source = link.replace("tags/../", "")
        # Report progress before the fetch, with a 1-based counter, so the
        # URL in flight is visible if the request hangs or fails.
        print(f"Fetching and converting: [{index + 1}/{total}] {source}")
        markdown = to_markdown(link)
        docs.append(markdown)
        metadatas.append({"source": source})
        # Stored ids stay 0-based.
        ids.append(str(index))

    vector_store.embed_documents(docs, metadatas, ids, refresh=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment