import { chromium } from 'playwright';
import { fileURLToPath } from 'url';
import dotenv from 'dotenv';
import { searchParser } from './convert-search.mjs';

// Populate process.env from .env.local before anything reads BRIGHT_PLAYWRIGHT_URL.
dotenv.config({ path: '.env.local' });

/**
 * Opens a CDP connection to the browser endpoint named by the
 * BRIGHT_PLAYWRIGHT_URL environment variable.
 * Shared by every scraping entry point so that a missing endpoint fails
 * with the same explicit error everywhere.
 * @returns {Promise<import('playwright').Browser>} - The connected browser
 * @throws {Error} If BRIGHT_PLAYWRIGHT_URL is not set
 */
async function connectBrowser() {
  const endpoint = process.env.BRIGHT_PLAYWRIGHT_URL;
  if (!endpoint) {
    throw new Error('BRIGHT_PLAYWRIGHT_URL environment variable is not set');
  }
  return chromium.connectOverCDP(endpoint);
}

/**
 * Prints the list of supported CLI commands to stderr.
 */
function printUsage() {
  console.error('Usage:');
  console.error(' pnpm run tool scrape <html|text> <url>');
  console.error(' pnpm run tool search <query>');
}

/**
 * Scrapes content from a given URL using Playwright with CDP connection
 * @param {string} url - The URL to scrape
 * @param {string} format - The output format ('html' or 'text'); any value
 *   other than 'text' yields HTML
 * @returns {Promise<Object>} - `{ text }` for the 'text' format, otherwise `{ html }`
 * @throws {Error} If BRIGHT_PLAYWRIGHT_URL is not set, or if navigation/scraping fails
 */
async function scrapeUrl(url, format = 'html') {
  const browser = await connectBrowser();

  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(url, { timeout: 60000 });
    await page.waitForLoadState('networkidle');

    if (format === 'text') {
      const text = await page.textContent('body');
      return { text };
    }

    const html = await page.content();
    return { html };
  } catch (error) {
    console.error('Error during scraping:', error);
    throw error;
  } finally {
    // Always release the remote browser, even when scraping failed.
    await browser.close();
  }
}

/**
 * Gets Google search results HTML
 * @param {string} query - Search query (URI-encoded before being appended)
 * @param {string} baseUrl - Base URL for Google search; the encoded query is
 *   appended directly to it
 * @returns {Promise<string>} - The page body's inner HTML with script, style
 *   and noscript elements removed ('' when the document has no body)
 * @throws {Error} If BRIGHT_PLAYWRIGHT_URL is not set, or if fetching the page fails
 */
async function getGoogleHtml(query = '', baseUrl = 'https://www.google.com/search?q=') {
  const browser = await connectBrowser();
  const searchUrl = `${baseUrl}${encodeURIComponent(query)}`;

  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(searchUrl);

    // Runs inside the page: strip non-content elements, then serialize the body.
    const bodyContent = await page.evaluate(() => {
      const body = document.body;
      if (!body) return '';

      body
        .querySelectorAll('script, style, noscript')
        .forEach((element) => element.remove());

      return body.innerHTML;
    });

    return bodyContent;
  } catch (error) {
    console.error('Error fetching page:', error);
    throw error;
  } finally {
    // Always release the remote browser, even when fetching failed.
    await browser.close();
  }
}

/**
 * Shows the content of a given URL in specified format
 * @param {string} format - The output format ('html' or 'text')
 * @param {string} url - The URL to show content from
 * @returns {Promise<void>}
 */
async function showContent(format, url) {
  try {
    const result = await scrapeUrl(url, format);
    // scrapeUrl keys its result by format name ('html' or 'text').
    console.log(result[format]);
  } catch (error) {
    console.error('Error showing content:', error);
    throw error;
  }
}

/**
 * Performs a Google search and shows the results
 * @param {string} query - Search query
 * @returns {Promise<void>}
 */
async function search(query) {
  try {
    console.log('Getting google html');
    const html = await getGoogleHtml(query);

    console.log('parsing html');
    const parsed = await searchParser(html);

    // NOTE(review): assumes searchParser resolves to a chat-completion-like
    // object with choices[0].message.content — confirm against convert-search.mjs.
    console.log(parsed.choices[0].message.content);
  } catch (error) {
    console.error('Search error:', error);
    throw error;
  }
}

// Main function to handle command line usage
async function main() {
  const [command, ...args] = process.argv.slice(2);

  if (!command) {
    console.error('Please provide a command.');
    printUsage();
    process.exit(1);
  }

  try {
    switch (command.toLowerCase()) {
      case 'scrape': {
        if (args.length < 2) {
          console.error('Please provide the format and URL to scrape.');
          console.error('Usage: pnpm run tool scrape <html|text> <url>');
          process.exit(1);
        }

        const format = args[0].toLowerCase();
        if (format !== 'html' && format !== 'text') {
          console.error('Invalid format. Use either "html" or "text".');
          process.exit(1);
        }

        await showContent(format, args[1]);
        break;
      }

      case 'search': {
        if (args.length < 1) {
          console.error('Please provide a search query.');
          console.error('Usage: pnpm run tool search <query>');
          process.exit(1);
        }

        // Re-join so an unquoted multi-word query is treated as one search.
        await search(args.join(' '));
        break;
      }

      default:
        console.error('Invalid command. Use either "scrape" or "search".');
        printUsage();
        process.exit(1);
    }
  } catch (error) {
    console.error('Operation failed:', error);
    process.exit(1);
  }
}

// Run main function if this file is run directly
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  // main() handles its own failures (logs and exits), so the promise is not awaited.
  main();
}

export { scrapeUrl, getGoogleHtml };