Skip to content

Instantly share code, notes, and snippets.

@uratmangun
Created December 28, 2024 14:19
import { chromium } from 'playwright';
import { fileURLToPath } from 'url';
import dotenv from 'dotenv';
import { searchParser } from './convert-search.mjs';
dotenv.config({ path: '.env.local' });
/**
* Scrapes content from a given URL using Playwright with CDP connection
* @param {string} url - The URL to scrape
* @param {string} format - The output format ('html' or 'text')
* @returns {Promise<Object>} - The scraped content
*/
async function scrapeUrl(url, format = 'html') {
if (!process.env.BRIGHT_PLAYWRIGHT_URL) {
throw new Error('BRIGHT_PLAYWRIGHT_URL environment variable is not set');
}
const browser = await chromium.connectOverCDP(process.env.BRIGHT_PLAYWRIGHT_URL);
try {
const context = await browser.newContext();
const page = await context.newPage();
await page.goto(url, { timeout: 60000 });
await page.waitForLoadState('networkidle');
if (format === 'text') {
const text = await page.textContent('body');
return { text };
} else {
const html = await page.content();
return { html };
}
} catch (error) {
console.error('Error during scraping:', error);
throw error;
} finally {
await browser.close();
}
}
/**
* Gets Google search results HTML
* @param {string} query - Search query
* @param {string} baseUrl - Base URL for Google search
* @returns {Promise<string>} - HTML content
*/
async function getGoogleHtml(query = '', baseUrl = 'https://www.google.com/search?q=') {
const browser = await chromium.connectOverCDP(process.env.BRIGHT_PLAYWRIGHT_URL);
const searchUrl = baseUrl + encodeURIComponent(query);
try {
const context = await browser.newContext();
const page = await context.newPage();
await page.goto(searchUrl);
const bodyContent = await page.evaluate(() => {
const body = document.body;
if (!body) return '';
// Remove all script tags
const scripts = body.getElementsByTagName('script');
while (scripts.length > 0) {
scripts[0].parentNode.removeChild(scripts[0]);
}
// Remove all style tags
const styles = body.getElementsByTagName('style');
while (styles.length > 0) {
styles[0].parentNode.removeChild(styles[0]);
}
// Remove all noscript tags
const noscripts = body.getElementsByTagName('noscript');
while (noscripts.length > 0) {
noscripts[0].parentNode.removeChild(noscripts[0]);
}
return body.innerHTML;
});
return bodyContent;
} catch (error) {
console.error('Error fetching page:', error);
throw error;
} finally {
await browser.close();
}
}
/**
* Shows the content of a given URL in specified format
* @param {string} format - The output format ('html' or 'text')
* @param {string} url - The URL to show content from
* @returns {Promise<void>}
*/
async function showContent(format, url) {
try {
const result = await scrapeUrl(url, format);
console.log(result[format]);
} catch (error) {
console.error('Error showing content:', error);
throw error;
}
}
/**
* Performs a Google search and shows the results
* @param {string} query - Search query
* @returns {Promise<void>}
*/
async function search(query) {
try {
console.log('Getting google html');
const html = await getGoogleHtml(query);
console.log("parsing html");
const parsed = await searchParser(html);
console.log(parsed.choices[0].message.content);
} catch (error) {
console.error('Search error:', error);
throw error;
}
}
// Main function to handle command line usage
async function main() {
const [command, ...args] = process.argv.slice(2);
if (!command) {
console.error('Please provide a command.');
console.error('Usage:');
console.error(' pnpm run tool scrape <html|text> <url>');
console.error(' pnpm run tool search <query>');
process.exit(1);
}
try {
switch (command.toLowerCase()) {
case 'scrape':
if (args.length < 2) {
console.error('Please provide the format and URL to scrape.');
console.error('Usage: pnpm run tool scrape <html|text> <url>');
process.exit(1);
}
const format = args[0].toLowerCase();
if (format !== 'html' && format !== 'text') {
console.error('Invalid format. Use either "html" or "text".');
process.exit(1);
}
await showContent(format, args[1]);
break;
case 'search':
if (args.length < 1) {
console.error('Please provide a search query.');
console.error('Usage: pnpm run tool search <query>');
process.exit(1);
}
await search(args.join(' '));
break;
default:
console.error('Invalid command. Use either "scrape" or "search".');
console.error('Usage:');
console.error(' pnpm run tool scrape <html|text> <url>');
console.error(' pnpm run tool search <query>');
process.exit(1);
}
} catch (error) {
console.error('Operation failed:', error);
process.exit(1);
}
}
// Run main function if this file is run directly
if (process.argv[1] === fileURLToPath(import.meta.url)) {
main();
}
export { scrapeUrl, getGoogleHtml };
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment