Skip to content

Instantly share code, notes, and snippets.

@domderen
Last active September 11, 2023 19:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save domderen/a12ba5804b5cc5b41f2dd69c0cd11c2b to your computer and use it in GitHub Desktop.
Save domderen/a12ba5804b5cc5b41f2dd69c0cd11c2b to your computer and use it in GitHub Desktop.
import * as playwright from 'playwright';
/**
 * Describes how a single value is pulled out of an element on the page.
 * Both fields are optional.
 */
type CompositeSelector = {
  /**
   * Name of the attribute whose value should be read.
   * The extraction helpers in this file read the element's trimmed text content when it is not set.
   */
  attributeName?: string;
  /** Selector that locates the element the value is read from. */
  selector?: string;
};
/**
 * Reads a single value from the first element matching a selector.
 *
 * The selector is first passed through `parseXpathSelector`. When that yields an
 * attribute name, the attribute's value is returned (`getAttribute` gives `null`
 * if the element lacks the attribute); otherwise the element's trimmed text
 * content is returned.
 *
 * Eg. "div.something" -> extracts the text content of a div with class "something",
 * "//div[contains(@class, 'something')]/@data-id" -> extracts the value of the "data-id" attribute of that same element.
 *
 * @param page Playwright Page to query.
 * @param sel Playwright selector, optionally an xpath ending in "/@attribute".
 * @returns The attribute value or the trimmed text content of the matched element.
 */
async function getValue(page: playwright.Page, sel: string): Promise<string> {
  const parsed = parseXpathSelector(sel);
  // The callback runs inside the page, so it only uses its own arguments.
  const extracted = await page.$eval(
    parsed.selector,
    (node, attr) => (attr ? node.getAttribute(attr) : node.textContent.trim()),
    parsed.attributeName,
  );
  return extracted;
}
/**
 * Reads one value from every element matching a selector.
 *
 * The selector is first passed through `parseXpathSelector`. When that yields an
 * attribute name, each element contributes that attribute's value (`getAttribute`
 * gives `null` for elements lacking the attribute); otherwise each element
 * contributes its trimmed text content.
 *
 * Eg. "div.something" -> extracts the text content of all divs with class "something",
 * "//div[contains(@class, 'something')]/@data-id" -> extracts the "data-id" attribute values of those same elements.
 *
 * @param page Playwright Page to query.
 * @param sel Playwright selector, optionally an xpath ending in "/@attribute".
 * @returns One extracted value per matched element, in match order.
 */
async function getValues(page: playwright.Page, sel: string): Promise<string[]> {
  const parsed = parseXpathSelector(sel);
  // The callback runs inside the page, so it only uses its own arguments.
  const extracted = await page.$$eval(
    parsed.selector,
    (nodes, attr) =>
      nodes.map(node => (attr ? node.getAttribute(attr) : node.textContent.trim())),
    parsed.attributeName,
  );
  return extracted;
}
/**
 * Extracts an object of values from the page. The returned object has the same keys as
 * "propertySelectors", and each value is what was extracted for that key's selector.
 *
 * String property selectors are normalised with `parseXpathSelector`; composite selectors
 * are used as given. Inside the page every property selector is resolved with
 * `querySelector` relative to the parent element, so it has to be a CSS selector.
 * A property with no selector reads from the parent element itself.
 *
 * For each property the value is the named attribute when `attributeName` is set,
 * otherwise the trimmed text content. A property whose selector matches nothing is
 * set to `null`.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return object and the selectors for extracting values.
 * @param objectSelector Playwright selector of the parent element containing the properties.
 *   When omitted (or empty) the document's root `html` element is used.
 * @returns Object mapping every key of "propertySelectors" to its extracted value.
 */
async function getObject(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}> {
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, propertySelector] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] =
      typeof propertySelector === 'string' ? parseXpathSelector(propertySelector) : propertySelector;
  }

  // Fall back to 'html', not 'document': as a selector, 'document' is a CSS tag
  // selector for <document> elements, which never match, so the lookup would fail.
  const parentSelector = objectSelector || 'html';

  // The callback runs inside the page, so it only uses its own arguments and
  // sticks to plain constructs.
  return page.$eval(parentSelector, (el, selectors) => {
    const result: {[key: string]: string} = {};
    Object.keys(selectors).forEach(key => {
      const value = selectors[key];
      const node = value.selector ? el.querySelector(value.selector) : el;
      if (!node) {
        result[key] = null;
        return;
      }
      result[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
    });
    return result;
  }, compositePropertySelectors);
}
/**
 * Extracts an array of objects from the page, one per element matching "objectSelector".
 * Each returned object has the same keys as "propertySelectors", and each value is what
 * was extracted for that key's selector within that element.
 *
 * String property selectors are normalised with `parseXpathSelector`; composite selectors
 * are used as given. Inside the page every property selector is resolved with
 * `querySelector` relative to the parent element, so it has to be a CSS selector.
 * A property with no selector reads from the parent element itself.
 *
 * For each property the value is the named attribute when `attributeName` is set,
 * otherwise the trimmed text content. A property whose selector matches nothing is
 * set to `null`.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return objects and the selectors for extracting values.
 * @param objectSelector Playwright selector of the parent elements containing the properties.
 *   When omitted (or empty) the document's root `html` element is used, giving a single object.
 * @returns One object per matched parent element, in match order.
 */
async function getObjects(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}[]> {
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, propertySelector] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] =
      typeof propertySelector === 'string' ? parseXpathSelector(propertySelector) : propertySelector;
  }

  // Fall back to 'html', not 'document': as a selector, 'document' is a CSS tag
  // selector for <document> elements, which never match, so the result would
  // always be an empty array.
  const parentSelector = objectSelector || 'html';

  // The callback runs inside the page, so it only uses its own arguments and
  // sticks to plain constructs.
  return page.$$eval(parentSelector, (els, selectors) => {
    return els.map(el => {
      const result: {[key: string]: string} = {};
      Object.keys(selectors).forEach(key => {
        const value = selectors[key];
        const node = value.selector ? el.querySelector(value.selector) : el;
        if (!node) {
          result[key] = null;
          return;
        }
        result[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
      });
      return result;
    });
  }, compositePropertySelectors);
}
/**
 * Splits an optional trailing "/@attribute" step off an xpath selector.
 *
 * A selector is treated as xpath when it starts with "//", ".." or "xpath=".
 * Any other selector is returned unchanged with no attribute name.
 *
 * Only the final step of the path is considered, so "/@" inside a predicate or a
 * quoted string (e.g. "//a[contains(@href, '/@foo')]/@href") is left in place.
 *
 * @param selector a Playwright Selector.
 * @returns The selector without its trailing attribute step, plus the attribute
 *   name (`undefined` when there is none).
 */
function parseXpathSelector(selector: string): CompositeSelector {
  const looksLikeXpath =
    selector.startsWith('//') || selector.startsWith('..') || selector.startsWith('xpath=');
  if (!looksLikeXpath) {
    return {attributeName: undefined, selector};
  }

  // Match "/@name" only at the very end; the name may not contain path,
  // predicate or quote characters, which keeps the match out of predicates.
  const trailingAttribute = /\/@([^\/@\[\]'"]+)$/.exec(selector);
  if (!trailingAttribute) {
    return {attributeName: undefined, selector};
  }

  return {
    attributeName: trailingAttribute[1],
    // Cut by position instead of string replacement, so an identical "/@name"
    // occurring earlier in the selector is never the part that gets removed.
    selector: selector.slice(0, trailingAttribute.index),
  };
}
/**
 * Launches a Chromium browser, opens https://news.ycombinator.com/ and extracts
 * values from the page, first as plain values and then as structured objects,
 * logging each result.
 *
 * Any error is logged and turned into exit code 1. The browser is closed on both
 * the success and the failure path so that no browser process is left behind.
 */
async function main () {
  let browser: playwright.Browser | undefined;
  try {
    browser = await playwright.chromium.launch();
    const page = await browser.newPage();
    await page.goto('https://news.ycombinator.com/');

    const headline = await getValue(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]");
    console.log(headline);
    // "Some headline"

    const articleLinks = await getValues(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]/@href");
    console.log(articleLinks);
    // ["https://github.com/whatever", "https://github.com/whatever2", ...]

    const selectors = {
      source: {selector: "span.sitestr"},
      headline: "a.storylink",
      link: {selector: "a.storylink", attributeName: "href"}
    }

    const article = await getObject(page, selectors, "//tbody/tr[contains(@class, 'athing')]");
    console.log(article);
    // {source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}

    const articles = await getObjects(page, selectors, "//tbody/tr[contains(@class, 'athing')]");
    console.log(articles);
    // [{source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}, ...]
  } catch (error) {
    console.log('UNEXPECTED ERROR', error);
    // Set the exit code instead of calling process.exit(), which would end the
    // process before the cleanup in `finally` gets a chance to run.
    process.exitCode = 1;
  } finally {
    // `browser` is still undefined when the launch itself failed.
    if (browser) {
      await browser.close();
    }
  }
}
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment