@feliche93
Last active August 28, 2023 08:38
FastAPI App for Web Scraping on Modal.com
"""
This module defines the FastAPI application and its endpoints.
It includes the endpoint for scraping a website and potentially an endpoint for finding contacts.
The application is wrapped with a stub function for deployment.
"""
from typing import Any
from common import ENV, image, secret, stub
from fastapi import FastAPI
from modal import asgi_app
from models import ScrapeWebsiteRequest, WebsiteContentOutputSchema
from scraper import get_website_content
web_app = FastAPI()
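The common module isn't included in the gist, so ENV, image, secret, and stub have to be inferred. A minimal sketch of what it might contain, assuming Modal's Stub-era API (current around this gist's last-active date) and a Playwright-capable image; the app name, package list, and secret name are all guesses:

import os

from modal import Image, Secret, Stub

# Deployment environment, e.g. "dev" or "prod" (assumed convention)
ENV = os.environ.get("ENV", "dev")

stub = Stub(f"backlinkgpt-{ENV}")

# Debian-based image with Chromium for Playwright plus the app's Python deps
image = (
    Image.debian_slim()
    .pip_install("fastapi", "beautifulsoup4", "html2text", "langchain", "playwright")
    .run_commands(
        "playwright install-deps chromium",
        "playwright install chromium",
    )
)

# Placeholder secret name; create it in the Modal dashboard
secret = Secret.from_name("backlinkgpt-secrets")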
@web_app.post("/scrape-website", response_model=WebsiteContentOutputSchema)
async def scrape_website(request: ScrapeWebsiteRequest) -> Any:
    """
    Scrape the website content at the URL provided in the request.

    Args:
        request (ScrapeWebsiteRequest): The request object containing
            the URL of the website to be scraped and an optional keyword.

    Returns:
        WebsiteContentOutputSchema: The response object containing the scraped website content.
    """
    content = await get_website_content(request.url)
    # Attach the optional keyword to the response (pydantic v1 copy-with-update)
    if request.keyword:
        content = content.copy(update={"keyword": request.keyword})
    return content
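Once deployed, the endpoint takes a JSON body matching ScrapeWebsiteRequest. A hedged example call; the hostname is a placeholder for whatever URL Modal prints at deploy time:

import httpx

resp = httpx.post(
    "https://<workspace>--backlinkgpt-fast-api-dev.modal.run/scrape-website",
    json={"url": "https://example.com", "keyword": "example"},
    timeout=60.0,
)
resp.raise_for_status()
print(resp.json()["pageTitle"])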
@stub.function(image=image, secret=secret)
@asgi_app(label=f"backlinkgpt-fast-api-{ENV}")
def fastapi_app():
    """
    Return the FastAPI application instance.

    Returns:
        FastAPI: The FastAPI application instance.
    """
    return web_app
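The app is served through Modal's CLI. Assuming the file is saved as app.py (the gist's filenames aren't preserved here):

# Live-reloading dev server
modal serve app.py

# Persistent deployment; prints the public URL for the labeled ASGI app
modal deploy app.py

The scraper module imported above follows.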
"""
This module contains functions for web scraping,
including converting HTML content to BeautifulSoup objects and markdown,
and getting website content using playwright.
"""
from bs4 import BeautifulSoup
from html2text import HTML2Text
from langchain.tools import tool
from models import GetWebsiteContentSchema, WebsiteContentOutputSchema
from playwright.async_api import async_playwright
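The models module is also missing from the gist. A plausible reconstruction inferred from how the schemas are used, assuming pydantic v1 (the endpoint's .copy(update=...) is v1 API); every field below is inferred rather than confirmed:

from typing import Optional

from pydantic import BaseModel, Field


class ScrapeWebsiteRequest(BaseModel):
    url: str
    keyword: Optional[str] = None


class GetWebsiteContentSchema(BaseModel):
    """Argument schema for the get_website_content tool."""

    url: str = Field(description="URL of the website to scrape")


class WebsiteContentOutputSchema(BaseModel):
    bodyText: str
    pageTitle: str
    metaTitle: Optional[str] = None
    metaDescription: Optional[str] = None
    metaImageUrl: Optional[str] = None
    faviconImageUrl: Optional[str] = None
    url: str
    keyword: Optional[str] = None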
def convert_content_to_soup(content: str) -> BeautifulSoup:
    """Convert html content to soup

    Args:
        content (str): html content

    Returns:
        BeautifulSoup: soup
    """
    soup = BeautifulSoup(content, "html.parser")
    return soup
def convert_content_to_markdown(content: str) -> str:
    """Convert html content to markdown

    Args:
        content (str): html content

    Returns:
        str: markdown
    """
    text_maker = HTML2Text()
    markdown = text_maker.handle(content)
    return markdown
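A quick illustration of what the two converters return for a trivial document:

html = "<html><head><title>Hi</title></head><body><h1>Hello</h1></body></html>"

soup = convert_content_to_soup(html)
print(soup.find("title").text)  # Hi

print(convert_content_to_markdown(html))  # "# Hello" (html2text renders <h1> as a heading)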
@tool(return_direct=False, args_schema=GetWebsiteContentSchema)
async def get_website_content(url: str) -> WebsiteContentOutputSchema:
    """Use this to get the text content of a website."""
    async with async_playwright() as p:  # pylint: disable=invalid-name
        # can be used for local debugging in jupyter notebook
        # p = await async_playwright().start()
        # browser = await p.chromium.launch(headless=False)
        browser = await p.chromium.launch()
        page = await browser.new_page()
        print(f"Goto {url}")
        await page.goto(url)

        # get the fully rendered page content, then close the browser
        content = await page.content()
        await browser.close()

        # parse with BeautifulSoup
        soup = convert_content_to_soup(content)

        # body_text
        body_text = convert_content_to_markdown(content=content)

        # page_title
        page_title = soup.find("title").text

        # meta_title
        meta_title = soup.find("meta", property="og:title")
        meta_title = meta_title["content"] if meta_title else None

        # meta_description
        meta_description = soup.find("meta", property="og:description")
        meta_description = meta_description["content"] if meta_description else None

        # meta_image_url
        meta_image_url = soup.find("meta", property="og:image")
        meta_image_url = meta_image_url["content"] if meta_image_url else None

        # favicon_image_url: urljoin resolves both relative and absolute hrefs
        # against the page URL (naive string concatenation breaks on either)
        favicon_image_url = soup.find("link", rel="icon")
        favicon_image_url = urljoin(url, favicon_image_url["href"]) if favicon_image_url else None

        print(f"Crawled {url}")

        return WebsiteContentOutputSchema(
            bodyText=body_text,
            pageTitle=page_title,
            metaTitle=meta_title,
            metaDescription=meta_description,
            metaImageUrl=meta_image_url,
            faviconImageUrl=favicon_image_url,
            url=url,
        )
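The tool can also be exercised outside the FastAPI app. A minimal sketch; .arun() is langchain's async entry point for tools, which sidesteps version-specific differences in whether the decorated function is directly awaitable:

import asyncio


async def main():
    content = await get_website_content.arun("https://example.com")
    print(content)


asyncio.run(main())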