Created
March 5, 2024 20:18
-
-
Save berggren/b744a3ac7510207db6a2951e91312121 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from markdownify import markdownify | |
from bs4 import BeautifulSoup | |
from collections.abc import Generator | |
from api.datastores.chroma import VectorStore | |
BASE_URL = "https://forensics.wiki/tags/" | |
def crawler() -> Generator[str, None, None]:
    """Crawl the Forensics Wiki tag index and yield links to the articles.

    Fetches ``BASE_URL``, takes the first ``<article>`` element of the page
    and walks every ``<li>`` inside it. For each list item whose first anchor
    carries an ``href``, the href is appended verbatim to ``BASE_URL`` and
    yielded; relative hrefs are not resolved here.

    Yields:
        ``BASE_URL`` concatenated with the anchor's ``href`` value.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the index page contains no ``<article>`` element.
    """
    # A timeout keeps a stalled server from hanging the crawl forever.
    response = requests.get(BASE_URL, timeout=30)
    # Do not parse an error page as if it were the tag index.
    response.raise_for_status()
    html = BeautifulSoup(response.content, "html.parser")
    articles = html.find_all("article")
    if not articles:
        raise IndexError(f"No <article> element found at {BASE_URL}")
    for item in articles[0].find_all("li"):
        link = item.find("a")
        if link is None:
            continue
        # Tag.get() returns None for anchors without an href attribute;
        # skip those instead of failing on ``str + None``.
        href = link.get("href")
        if href:
            yield BASE_URL + href
def to_markdown(link: str) -> str:
    """Fetch an article page and convert its main content to Markdown.

    Args:
        link: URL of the article page to download.

    Returns:
        The Markdown produced by ``markdownify`` from the page's first
        ``<article>`` element, with that element's first ``<nav>``
        descendant (if any) removed beforehand.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no ``<article>`` element.
    """
    # A timeout keeps a stalled server from hanging the whole run.
    response = requests.get(link, timeout=30)
    # Do not convert an HTTP error page as if it were the article.
    response.raise_for_status()
    parsed_page = BeautifulSoup(response.content, "html.parser")
    articles = parsed_page.find_all("article")
    if not articles:
        raise IndexError(f"No <article> element found at {link}")
    article = articles[0]
    # Remove the "nav" element when present; find() returns None for pages
    # without one, so guard before calling extract().
    nav = article.find("nav")
    if nav is not None:
        nav.extract()
    return markdownify(str(article))
if __name__ == "__main__":
    # Script entry point: crawl the article links, convert each page to
    # Markdown, then hand the whole batch to the "forensics" collection.
    vector_store = VectorStore(collection_name="forensics")
    docs, metadatas, ids = [], [], []

    # Materialise the generator up front so the total is known for the
    # progress message.
    links = list(crawler())
    for index, raw_link in enumerate(links):
        # The page is fetched with the link exactly as the crawler built it.
        markdown = to_markdown(raw_link)
        # Strip the "tags/../" segment before reporting and storing the URL.
        source = raw_link.replace("tags/../", "")
        print(f"Fetching and converting: [{index}/{len(links)}] {source}")
        docs.append(markdown)
        metadatas.append({"source": source})
        # The position in the crawl order doubles as the document id.
        ids.append(str(index))

    vector_store.embed_documents(docs, metadatas, ids, refresh=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment