Skip to content

Instantly share code, notes, and snippets.

@feliche93
Created August 28, 2023 08:15
Show Gist options
  • Save feliche93/8dc78e1c0a2bd97408f2288ce46b564e to your computer and use it in GitHub Desktop.
Browserless.io Next.js Edge Route, scraping Website Content
import { SScrapingResult, SWebsiteInfoInput, SWebsiteInfoOutput } from '@lib/zod-models';
import { NextResponse } from 'next/server';
import { z } from 'zod';
export const runtime = 'edge'
/**
 * POST handler: scrapes a website through the Browserless.io `/scrape` API and
 * returns its body text, page title, Open Graph metadata and favicon URL.
 *
 * The request body is validated against `SWebsiteInfoInput` (expects `url` and
 * an optional `keyword`); the response payload is validated against
 * `SWebsiteInfoOutput` before being returned.
 *
 * @throws Error when `BROWSERLESS_API_TOKEN` is unset or the upstream fetch
 *         fails (Next.js surfaces uncaught errors as a 500 response).
 */
export async function POST(request: Request) {
  const startTime = Date.now();

  // Validate the incoming payload before doing any network work.
  const data = await request.json();
  const parsedData = SWebsiteInfoInput.parse(data);

  // Fixed misspelling: was `apiTOken`.
  const apiToken = process.env.BROWSERLESS_API_TOKEN;
  if (!apiToken) throw new Error("No BROWSERLESS_API_TOKEN environment variable set");
  const url = `https://chrome.browserless.io/scrape?token=${apiToken}`;

  const scrapingUrl = parsedData.url;
  const keyword = parsedData.keyword;

  // Ask Browserless to extract the page body, <title>, OG meta tags and favicon link.
  const body = {
    "url": scrapingUrl,
    "elements": [
      { "selector": "body", "timeout": 0 },
      { "selector": "title", "timeout": 0 },
      { "selector": "meta[property='og:title']", "timeout": 0 },
      { "selector": "meta[property='og:description']", "timeout": 0 },
      { "selector": "meta[property='og:image']", "timeout": 0 },
      { "selector": "link[rel='icon']", "timeout": 0 }
    ]
  };

  const response = await fetch(url, {
    method: 'POST',
    body: JSON.stringify(body),
    headers: { 'Content-Type': 'application/json' }
  });
  console.log(`Fetch completed in ${(Date.now() - startTime) / 1000} seconds`);
  // Include the upstream status so failures are diagnosable from logs.
  if (!response.ok) throw new Error(`Error in fetch request: ${response.status} ${response.statusText}`);
  const result = await response.json();

  // Maps the raw Browserless scrape result onto the flat SWebsiteInfoOutput shape.
  // Closes over `scrapingUrl` and `keyword` from the handler scope.
  function transformToWebsiteInfoOutput(parsedResult: z.infer<typeof SScrapingResult>) {
    const output: Partial<z.infer<typeof SWebsiteInfoOutput>> = {};

    for (const item of parsedResult.data) {
      if (item.selector === "body") {
        output.bodyText = item.results[0]?.text;
      } else if (item.selector === "title") {
        output.pageTitle = item.results[0]?.text;
      } else {
        // BUG FIX: <link> elements carry their URL in `href`, not `content`;
        // the previous code only looked for `content`, so the favicon was
        // never extracted. Meta tags still use `content`.
        const attrName = item.selector === "link[rel='icon']" ? "href" : "content";
        const attr = item.results[0]?.attributes?.find(a => a.name === attrName);
        if (attr) {
          if (item.selector === "meta[property='og:title']") {
            output.metaTitle = attr.value;
          } else if (item.selector === "meta[property='og:description']") {
            output.metaDescription = attr.value;
          } else if (item.selector === "meta[property='og:image']") {
            output.metaImageUrl = attr.value;
          } else if (item.selector === "link[rel='icon']") {
            output.faviconImageUrl = attr.value;
          }
        }
      }
    }

    output.url = scrapingUrl;
    if (keyword) output.keyword = keyword;
    return output;
  }

  // Parse the raw upstream result into our SScrapingResult schema.
  const parsedResult = SScrapingResult.parse(result);
  const transformedResult = transformToWebsiteInfoOutput(parsedResult);
  // Re-validate so the handler never returns a malformed payload.
  const websiteInfoOutput = SWebsiteInfoOutput.parse(transformedResult);
  return NextResponse.json(websiteInfoOutput);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment