Skip to content

Instantly share code, notes, and snippets.

@bborn
Created February 7, 2023 20:24
Show Gist options
  • Save bborn/d33634fef0a2b8551a4ac88c514d4a09 to your computer and use it in GitHub Desktop.
Save bborn/d33634fef0a2b8551a4ac88c514d4a09 to your computer and use it in GitHub Desktop.
import mimetypes
import os
import re
import tempfile

import html2text
import magic
import requests
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, SimpleWebPageReader
from gpt_index.readers.schema.base import Document
def remote_url_loader(url, query, timeout=30):
    """Download *url*, index its contents, and answer *query* about them.

    The response body is sniffed with libmagic. HTML is converted to plain
    text with ``html2text``; any other content is written to a temporary
    file and loaded through ``SimpleDirectoryReader``. The resulting
    documents are put into a throwaway ``GPTSimpleVectorIndex`` and queried.

    Args:
        url: Address of the remote document to fetch.
        query: Question to ask of the document. When empty (or ``None``),
            a tree-summarized summary of the text is requested instead.
        timeout: Seconds passed to ``requests.get`` so an unresponsive
            server cannot block the caller forever.

    Returns:
        The index's answer as a string, or a ``"Failed to retrieve file"``
        message when the HTTP status is not 200.

    Exceptions raised by ``requests.get`` or by the reader/index are not
    caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve file: {response.reason} ({response.status_code})"

    content = response.content
    mime_type = magic.from_buffer(content, mime=True)
    # [-1] rather than [1]: never raises if the sniffed value has no slash.
    subtype = mime_type.split("/")[-1]

    if subtype == "html":
        # errors="replace" keeps pages that are not valid UTF-8 from
        # aborting the whole call with UnicodeDecodeError.
        markup = content.decode("utf-8", errors="replace")
        docs = [Document(html2text.html2text(markup))]
    else:
        # Prefer a conventional extension (".pdf", ".docx", ...) so the
        # reader can choose a parser by suffix; fall back to the raw
        # MIME subtype when the type is unknown to ``mimetypes``.
        suffix = mimetypes.guess_extension(mime_type) or f".{subtype}"
        # delete=False so the file can be closed and then reopened by name
        # by the reader; it is removed explicitly in ``finally`` below.
        tmp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            with tmp_file:
                tmp_file.write(content)
            # The file is closed (and therefore flushed) before it is read.
            docs = SimpleDirectoryReader(input_files=[tmp_file.name]).load_data()
        finally:
            os.unlink(tmp_file.name)

    tmp_index = GPTSimpleVectorIndex(docs)
    if query:
        return str(tmp_index.query(query))
    return str(
        tmp_index.query("Summarize the text.", response_mode="tree_summarize")
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment