Created
December 28, 2024 14:19
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { chromium } from 'playwright'; | |
import { fileURLToPath } from 'url'; | |
import dotenv from 'dotenv'; | |
import { searchParser } from './convert-search.mjs'; | |
dotenv.config({ path: '.env.local' }); | |
/**
 * Scrapes a page through a remote browser reached over CDP and returns its
 * content in the requested format.
 *
 * The browser endpoint is read from the BRIGHT_PLAYWRIGHT_URL environment
 * variable. The browser connection is closed before the function settles,
 * whether it succeeds or fails.
 *
 * @param {string} url - The URL to scrape; must not be empty or blank
 * @param {string} [format='html'] - 'text' returns the body's text content;
 *   any other value returns the page's HTML
 * @returns {Promise<Object>} `{ text }` when format is 'text', otherwise `{ html }`
 * @throws {Error} If BRIGHT_PLAYWRIGHT_URL is not set
 * @throws {TypeError} If url is missing, empty, or only whitespace
 */
async function scrapeUrl(url, format = 'html') {
  const endpoint = process.env.BRIGHT_PLAYWRIGHT_URL;
  if (!endpoint) {
    throw new Error('BRIGHT_PLAYWRIGHT_URL environment variable is not set');
  }

  // Fail fast: reject an unusable URL before a remote browser session is
  // opened, instead of letting navigation fail after connecting.
  if (url == null || String(url).trim() === '') {
    throw new TypeError('scrapeUrl requires a non-empty URL');
  }

  const browser = await chromium.connectOverCDP(endpoint);
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(url, { timeout: 60000 });
    // Wait for network activity to settle so late-loaded content is captured.
    await page.waitForLoadState('networkidle');

    if (format === 'text') {
      // NOTE(review): textContent presumably also includes the text of inline
      // <script>/<style> nodes under <body>; confirm this is the desired output.
      const text = await page.textContent('body');
      return { text };
    }

    const html = await page.content();
    return { html };
  } catch (error) {
    console.error('Error during scraping:', error);
    throw error;
  } finally {
    // Always release the remote browser, even when scraping failed.
    await browser.close();
  }
}
/**
 * Loads a Google search results page through a remote browser reached over
 * CDP and returns the body markup with script, style, and noscript elements
 * removed.
 *
 * The browser endpoint is read from the BRIGHT_PLAYWRIGHT_URL environment
 * variable. The browser connection is closed before the function settles,
 * whether it succeeds or fails.
 *
 * @param {string} [query=''] - Search query; URI-encoded and appended to baseUrl
 * @param {string} [baseUrl='https://www.google.com/search?q='] - URL prefix
 *   the encoded query is appended to
 * @returns {Promise<string>} The cleaned innerHTML of the page body, or '' when
 *   the document has no body
 * @throws {Error} If BRIGHT_PLAYWRIGHT_URL is not set
 */
async function getGoogleHtml(query = '', baseUrl = 'https://www.google.com/search?q=') {
  const endpoint = process.env.BRIGHT_PLAYWRIGHT_URL;
  if (!endpoint) {
    throw new Error('BRIGHT_PLAYWRIGHT_URL environment variable is not set');
  }

  // Build the URL before connecting: encodeURIComponent can throw on malformed
  // input, and failing here avoids opening a browser that is never closed.
  const searchUrl = `${baseUrl}${encodeURIComponent(query)}`;

  const browser = await chromium.connectOverCDP(endpoint);
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    await page.goto(searchUrl);

    // Runs inside the page: strip non-content elements, then return the markup.
    const bodyContent = await page.evaluate(() => {
      const { body } = document;
      if (!body) return '';

      for (const node of body.querySelectorAll('script, style, noscript')) {
        node.remove();
      }
      return body.innerHTML;
    });

    return bodyContent;
  } catch (error) {
    console.error('Error fetching page:', error);
    throw error;
  } finally {
    // Always release the remote browser, even when the fetch failed.
    await browser.close();
  }
}
/**
 * Scrapes a URL and prints the requested representation to stdout.
 *
 * The value printed is the property of the scrape result named by `format`.
 * Failures are logged to stderr and then rethrown to the caller.
 *
 * @param {string} format - The output format ('html' or 'text')
 * @param {string} url - The URL whose content should be printed
 * @returns {Promise<void>}
 */
async function showContent(format, url) {
  try {
    const scraped = await scrapeUrl(url, format);
    const output = scraped[format];
    console.log(output);
  } catch (failure) {
    console.error('Error showing content:', failure);
    throw failure;
  }
}
/**
 * Runs a Google search, hands the results markup to the search parser, and
 * prints the parser's answer to stdout.
 *
 * The printed value is `choices[0].message.content` of the parser's result.
 * Progress messages go to stdout; failures are logged to stderr and rethrown.
 *
 * @param {string} query - Search query
 * @returns {Promise<void>}
 */
async function search(query) {
  try {
    console.log('Getting google html');
    const resultsHtml = await getGoogleHtml(query);

    console.log('parsing html');
    const completion = await searchParser(resultsHtml);

    const answer = completion.choices[0].message.content;
    console.log(answer);
  } catch (failure) {
    console.error('Search error:', failure);
    throw failure;
  }
}
// Prints the usage summary for both supported commands to stderr.
function printUsage() {
  console.error('Usage:');
  console.error(' pnpm run tool scrape <html|text> <url>');
  console.error(' pnpm run tool search <query>');
}

/**
 * Command line entry point.
 *
 * Reads the command and its arguments from process.argv and dispatches to
 * `scrape <html|text> <url>` or `search <query>`. The command name and the
 * scrape format are matched case-insensitively. On a missing or invalid
 * command, missing arguments, an invalid format, or a failed operation, a
 * message is written to stderr and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [command, ...args] = process.argv.slice(2);

  if (!command) {
    console.error('Please provide a command.');
    printUsage();
    process.exit(1);
  }

  try {
    switch (command.toLowerCase()) {
      // Each case body is braced so its `const` declarations are scoped to
      // that case instead of leaking across the whole switch.
      case 'scrape': {
        if (args.length < 2) {
          console.error('Please provide the format and URL to scrape.');
          console.error('Usage: pnpm run tool scrape <html|text> <url>');
          process.exit(1);
        }
        const format = args[0].toLowerCase();
        if (format !== 'html' && format !== 'text') {
          console.error('Invalid format. Use either "html" or "text".');
          process.exit(1);
        }
        await showContent(format, args[1]);
        break;
      }
      case 'search': {
        if (args.length < 1) {
          console.error('Please provide a search query.');
          console.error('Usage: pnpm run tool search <query>');
          process.exit(1);
        }
        // All remaining arguments form the query, so it need not be quoted.
        await search(args.join(' '));
        break;
      }
      default: {
        console.error('Invalid command. Use either "scrape" or "search".');
        printUsage();
        process.exit(1);
      }
    }
  } catch (error) {
    console.error('Operation failed:', error);
    process.exit(1);
  }
}
// Start the CLI only when this module is the script Node was launched with
// (process.argv[1]), not when it is imported by another module.
const isRunDirectly = fileURLToPath(import.meta.url) === process.argv[1];
if (isRunDirectly) {
  main();
}
export { scrapeUrl, getGoogleHtml }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment