Skip to content

Instantly share code, notes, and snippets.

@DaveOkpare
Last active June 6, 2024 17:17
Show Gist options
  • Save DaveOkpare/f9d0321d941637ff081ce2fb311d2fcd to your computer and use it in GitHub Desktop.
Save DaveOkpare/f9d0321d941637ff081ce2fb311d2fcd to your computer and use it in GitHub Desktop.
A script that scrapes a website and produces structured output from it.

Here's a sample request and response.

curl --location '127.0.0.1:8000/process/' \
--header 'Content-Type: application/json' \
--data-raw '{"url": "https://news.ycombinator.com/item?id=40224213",
"prompt": "Extract 20 jobs",
"xpath": "//div[@class='\''comment'\'']",
"fields": {
    "company_name": "str",
    "location": "str",
    "skills": "str",
    "contact": "str",
    "index": "int"
}
}'

The immediate response is a message saying that the URL is being processed; meanwhile, a background task scrapes the webpage and extracts the structured output.

Here are two responses.

  1. Server Response
{"message":"Processing URL"}
  2. Background Task Response
data=[DataModel(company_name='Internet Archive', location='Remote PT-ET Hours, USA', skills='Web crawlers, preservation, public access services', contact='https://app.trinethire.com/companies/32967-internet-archive/..', index=1), DataModel(company_name='Common Crawl Foundation', location='REMOTE', skills='Python, Java, cloud systems, Spark/PySpark', contact='jobs@commoncrawl.org', index=2), DataModel(company_name='PostHog', location='REMOTE (GMT-8 to GMT+1)', skills='Product engineers, ex technical founders, engineers with clickhouse experience', contact='posthog.com/careers', index=3), DataModel(company_name='Overleaf', location='REMOTE, UK, Germany, Spain, Romania', skills='Senior Full Stack JS Engineers, backend, frontend', contact='https://digitalscience.pinpointhq.com/postings/43b03318-3e96...', index=4), DataModel(company_name='Kanary', location='US-Only', skills='Python, JS, Go', contact='rachel[at]kanary[dot]com', index=5), DataModel(company_name='SerpApi', location='REMOTE, Austin, TX', skills='Fullstack Engineer, Customer Success Engineer, Ruby, Rails, MongoDB, React.JS', contact='https://serpapi.com/careers', index=6), DataModel(company_name='Rinse', location='REMOTE', skills='Logistics, Django, Python, Optimization, React, React Native, Postgres, Mobile Engineer', contact='jobs@rinse.com', index=7), DataModel(company_name='Y Combinator', location='San Francisco, CA ONSITE', skills='Ruby on Rails, React/Typescript', contact='casey@ycombinator.com', index=8), DataModel(company_name='LiveEO', location='Hybrid or REMOTE in EU', skills='Data Engineer, Full Stack Engineers, Backend Engineers, Data Scientists', contact='sven dot mesecke @ company domain', index=9), DataModel(company_name='Reef Technologies', location='REMOTE', skills='Backend, Python', contact='https://careers.reef.pl/', index=10), DataModel(company_name='Rune Labs', location='REMOTE - US, Canada', skills='iOS Engineers', contact='https://boards.greenhouse.io/runelabs', index=11), 
DataModel(company_name='Sentry.io', location='REMOTE, San Francisco, Seattle, Toronto, Vienna, NYC & Boulder', skills='Software Engineers, Eng Managers, SRE, debugging context, github, jira, slack', contact='https://sentry.io/careers', index=12), DataModel(company_name='Column', location='San Francisco, CA', skills='Software Eng (React, Go), Software Eng (Infra, Go, Kubernetes, AWS), Junior eng (JS, Go)', contact='https://column.com/careers', index=13), DataModel(company_name='Lovable', location='London / Stockholm Hybrid', skills='Product engineers, CREATIVE, algorithmic', contact='https://lovable.dev/careers', index=14), DataModel(company_name='Arcadia', location='US REMOTE', skills='Senior Software Engineer (energy analytics), Staff Software Engineer, Staff SRE', contact='https://www.arcadia.com/careers', index=15), DataModel(company_name='Sensei Ag', location='Remote within US', skills='Software Engineer', contact='https://boards.greenhouse.io/senseiag/jobs/5966132003', index=16), DataModel(company_name='AuxHealth', location='USA REMOTE', skills='backend+infra SWE, fastapi/python, redis, postgres, azure', contact='ilya@auxhealth.io', index=17), DataModel(company_name='Videx', location='NYC Preferred & Remote', skills='System engineering backgrounds, Rust, C++, C , Golang, Compiler engineer', contact='https://boards.greenhouse.io/monad', index=18), DataModel(company_name='Pickleball Vision AI', location='REMOTE', skills='Senior Python Engineer', contact='hckr[at]pb.vision', index=19), DataModel(company_name='Qventus', location='REMOTE (USA, INDIA)', skills='Software engineers, Sr. Data Platform, Data Eng II, Sr. Data Scientist, Analytics Engineer', contact='https://jobs.ashbyhq.com/qventus?utm_source=DQpNjMYvxb', index=20)]
from typing import Dict, Optional
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from utils import extract, scrape
# FastAPI application instance; endpoints are registered on it via @app.post.
app = FastAPI()
class Body(BaseModel):
    """Request payload accepted by the ``/process`` endpoint."""

    # Address of the page to fetch.
    url: str
    # Instruction forwarded to the model as the system message.
    prompt: str
    # Output schema: attribute name -> type name (e.g. "str", "int").
    fields: Dict[str, str]
    # Optional XPath narrowing the scraped text; omitted means the whole page.
    xpath: Optional[str] = None
def process_website(url: str, prompt: str, xpath: Optional[str], fields: dict):
    """Scrape *url* and extract structured data from the scraped text.

    Args:
        url: Page to fetch.
        prompt: Instruction passed to ``extract`` as the system message.
        xpath: XPath expression limiting which parts of the page are used,
            or ``None`` to use the text of the whole page.
        fields: Mapping of output attribute name to type name.

    Returns:
        Whatever ``extract`` returns. The value is ignored when this runs as
        a background task, but returning it keeps the function usable when
        called directly.
    """
    context = scrape(url, xpath)
    return extract(prompt, context, fields)
@app.post("/process")
def process_webpage(webpage: Body, bg_task: BackgroundTasks):
    """Schedule scraping/extraction for the posted page and reply at once.

    The work itself is handed to ``process_website`` through the background
    task queue, so the response below does not carry the extracted data.
    """
    # Positional order must match process_website(url, prompt, xpath, fields).
    job_args = (webpage.url, webpage.prompt, webpage.xpath, webpage.fields)
    bg_task.add_task(process_website, *job_args)

    acknowledgement = {"message": "Processing URL"}
    return acknowledgement
import requests
from lxml import html
from pydantic import BaseModel, create_model
import instructor
from pydantic import BaseModel
from openai import OpenAI
from typing import List, Optional
from dotenv import load_dotenv
# Runs at import time, before the OpenAI client further down is constructed.
# NOTE(review): presumably this loads a local .env file so that OpenAI() can
# find its API key in the environment — confirm against deployment setup.
load_dotenv()
# Type names accepted in a request's "fields" mapping, and the Python type
# each one stands for.
type_map = {
    "str": str,
    "int": int,
    "float": float,
    "bool": bool,
}


def string_to_type(type_name: str):
    """Return the Python type registered in ``type_map`` for *type_name*.

    Args:
        type_name: One of the keys of ``type_map`` (e.g. ``"str"``).

    Returns:
        The corresponding Python type object.

    Raises:
        ValueError: If *type_name* is not a key of ``type_map``. Failing here
            avoids handing ``None`` to the caller, which would otherwise end
            up as the annotation of a generated model field.
    """
    try:
        return type_map[type_name]
    except KeyError:
        supported = ", ".join(sorted(type_map))
        raise ValueError(
            f"Unsupported field type {type_name!r}; expected one of: {supported}"
        ) from None
def scrape(url: str, xpath_exp: Optional[str] = None, timeout: float = 30.0):
    """Fetch *url* and return its text content.

    Args:
        url: Page to download.
        xpath_exp: Optional XPath expression. When given, only the matches
            are used and each match's text is followed by a newline. When
            omitted (or empty), the text of the whole document is returned.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        The extracted text as a single string (empty if the XPath matches
        nothing).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never mistaken for real content.
        requests.RequestException: On connection problems or timeout.
    """
    # The context manager releases the session's connections when done.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        page_source = response.text

    tree = html.fromstring(page_source)
    if not xpath_exp:
        return tree.text_content()

    matches = tree.xpath(xpath_exp)
    # XPath functions such as string() or count() yield a scalar, not a list.
    if not isinstance(matches, list):
        matches = [matches]

    # Elements expose text_content(); attribute/text() results are strings.
    return "".join(
        (node.text_content() if hasattr(node, "text_content") else str(node)) + "\n"
        for node in matches
    )
# Module-level OpenAI client wrapped by instructor; extract() relies on the
# wrapped client accepting a `response_model` argument.
client = instructor.from_openai(OpenAI())
def extract(
    sys_msg: str, context: str, fields: dict
):
    """Ask the model to pull structured records out of *context*.

    A pydantic model named ``DataModel`` is built on the fly from *fields*,
    and the model's answer is requested as a list of such records.

    Args:
        sys_msg: Instruction sent as the system message.
        context: Text to extract from, sent as the user message.
        fields: Mapping of attribute name to type name understood by
            ``string_to_type``.

    Returns:
        The ``MultiOutput`` instance produced by the client; its ``data``
        attribute holds the list of ``DataModel`` records.

    Raises:
        ValueError: If a type name in *fields* is not recognised. Checked
            before any request is made so a bad schema costs no API call.
    """
    attributes = {}
    for field_name, type_name in fields.items():
        field_type = string_to_type(type_name)
        if field_type is None:
            raise ValueError(
                f"Unsupported type {type_name!r} for field {field_name!r}"
            )
        # The Ellipsis default marks the field as required.
        attributes[field_name] = (field_type, ...)

    datamodel = create_model("DataModel", **attributes)

    class MultiOutput(BaseModel):
        data: List[datamodel]

    website_info = client.chat.completions.create(
        model="gpt-4o",
        response_model=MultiOutput,
        messages=[
            {"role": "system", "content": sys_msg},
            {"role": "user", "content": context},
        ],
    )
    # Printed so the result is visible even when the caller does not consume
    # the return value (e.g. when run as a background task).
    print(website_info)
    return website_info
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment