Skip to content

Instantly share code, notes, and snippets.

@nocodesupplyco
Last active February 14, 2024 18:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nocodesupplyco/4afd4808f1d2f963cd782e940c072ba9 to your computer and use it in GitHub Desktop.
Save nocodesupplyco/4afd4808f1d2f963cd782e940c072ba9 to your computer and use it in GitHub Desktop.
Scrape Website Data w/ Airtable Script
// Declare the script settings. Each item is registered under a key (its first
// argument) and the value chosen for it is read back later as `config.<key>`.
const config = input.config({
  title: "Scrape Website Data",
  description:
    "Scrape a website URL in each record within a specific view to capture some specific content types using [Rasterwise.com](https://www.rasterwise.com/). Script created by [Base Scripts](https://nocodesupply.co/basescripts) — use at your own discretion.",
  items: [
    // Credential that is sent along with every scrape request
    input.config.text("apiKey", {
      label: "Rasterwise API Key",
      description:
        "Sign up for a Rasterwise account, and retrieve your API key from your account page.",
    }),

    // Where the URLs live: table, then a view and a field belonging to that table
    input.config.table("tableSelect", {
      label: "Table",
      description: "Select the table where the website URLs are stored to scraped.",
    }),
    input.config.view("viewSelect", {
      label: "View",
      description: "Select the view where the website URLs are stored to scraped.",
      parentTable: "tableSelect",
    }),
    input.config.field("linkSelect", {
      label: "Link Field",
      description: "Select the URL field of the website to scrape.",
      parentTable: "tableSelect",
    }),

    // What to extract; the selected value is passed as the request's `extract` parameter
    input.config.select("contentType", {
      label: "Content Type to Scrape",
      description:
        "Select the type of content you wish to extract from the website. See the docs.rasterwise.com for more ways to modify the script for scrape specificity.",
      // Written as [label, value] pairs and expanded into { label, value } option objects
      options: [
        ["Meta Tags", "meta"],
        ["Emails", "emails"],
        ["Phones", "phones"],
        ["Dates", "dates"],
        ["Lists", "lists"],
        ["Headings", "headings"],
        ["Text", "text"],
        ["Images", "images"],
        ["Links", "links"],
        ["Social", "social"],
        ["Rawdom", "rawdom"],
      ].map(([label, value]) => ({ label, value })),
    }),

    // Which part of the response to keep, and the field that receives it
    input.config.text("outputTarget", {
      label: "Response JSON Target",
      description:
        "Specify the JSON object to target from the response that is output as a console log in your first test. For example, if using the 'Meta Tags' content type to get a meta title you would enter 'results.metadata.title'",
    }),
    input.config.field("outputField", {
      label: "Output Field",
      description: "Select the field where the extracted content should be added.",
      parentTable: "tableSelect",
    }),

    // Field that gets set to true when a request comes back unsuccessful
    input.config.field("errorSelect", {
      label: "Scrape Error Field",
      description: "Select a checkbox field that can get checked if the API returns an error.",
      parentTable: "tableSelect",
    }),
  ],
});
// Unpack the settings in one step: plain values are taken as-is, while the
// table / view / field selections are reduced to their `name`.
const {
  apiKey: configAPIKey,
  contentType: configContentType,
  outputField: { name: configOutputField },
  outputTarget: configOutputTarget,
  tableSelect: { name: configTable },
  viewSelect: { name: configView },
  linkSelect: { name: configLink },
  errorSelect: { name: configError },
} = config;

// Resolve the table by name, then the view inside that table
const table = base.getTable(configTable);
const view = table.getView(configView);

// Load the view's records, requesting only the link field
let link = await view.selectRecordsAsync({
  fields: [configLink],
});
/**
 * Safely read a nested value out of a parsed JSON structure by following a path.
 *
 * The path may use dot notation ("results.metadata.title") and bracketed array
 * indexes ("results.links[0].href"). Whitespace around the path or around a
 * segment is ignored, so a stray space in a hand-typed path does not turn into
 * a silent miss. Empty segments (such as the one produced by a leading "[0]")
 * are dropped.
 *
 * @param {*} json - Parsed value to read from.
 * @param {string} path - Property names / array indexes to follow, in order.
 * @returns {*} The value found at the end of the path, or undefined when the
 *   path is not a string, contains no segments, or a step along it is missing.
 */
function getNestedJsonValue(json, path) {
  // A missing or non-string path cannot be followed; report "not found" instead of throwing.
  if (typeof path !== "string") {
    return undefined;
  }

  const pathParts = path
    .replace(/\[(\d+)\]/g, ".$1") // "links[0]" -> "links.0"
    .split(".")
    .map((pathPart) => pathPart.trim())
    .filter((pathPart) => pathPart !== "");

  if (pathParts.length === 0) {
    return undefined;
  }

  // Stop descending only at null/undefined. A truthiness test would also stop at
  // legitimate falsy values such as "" or 0 that appear part-way along the path.
  return pathParts.reduce(
    (currentObject, pathPart) => (currentObject == null ? undefined : currentObject[pathPart]),
    json
  );
}
// Scrape the URL stored on each record of the view, one request at a time.
// A successful scrape writes the targeted value into the output field; a failed
// request sets the error field to true and moves on to the next record.
for (const record of link.records) {
  // Get the URL from the link field
  const linkValue = record.getCellValue(configLink);

  // An empty cell has nothing to scrape; skip it instead of sending "url=null" to the API
  if (linkValue == null || String(linkValue).trim() === "") {
    console.warn("Skipping record with an empty link field:", record.id);
    continue;
  }

  // Percent-encode every query value. Unencoded, a target URL that carries its own
  // "?a=1&b=2" query string would have those parameters read as API parameters.
  const fetchURL =
    "https://api.rasterwise.com/v1/get-scrape" +
    `?apikey=${encodeURIComponent(configAPIKey)}` +
    `&url=${encodeURIComponent(linkValue)}` +
    `&extract=${encodeURIComponent(configContentType)}`;

  // Make the API request. `json` stays undefined if the request throws, the
  // response is not ok, or the body cannot be parsed as JSON.
  let json;
  try {
    const response = await remoteFetchAsync(fetchURL);
    if (response.ok) {
      json = await response.json();
    } else {
      console.error(response);
    }
  } catch (error) {
    // One unreachable site or malformed body should not abort the remaining records
    console.error(`Request failed for ${linkValue}`, error);
  }

  if (json === undefined) {
    // Flag the record so failures can be found in the table afterwards
    await table.updateRecordAsync(record, {
      [configError]: true,
    });
    continue;
  }

  // Log the full response so the JSON target path can be worked out from it
  console.log(json);

  // Extract the value using the JSON path
  const outputValue = getNestedJsonValue(json, configOutputTarget);
  if (outputValue === undefined) {
    // Nothing at that path for this site: leave the record unchanged
    console.warn("No value found at path:", configOutputTarget);
    continue;
  }

  // Update the record with the extracted value
  await table.updateRecordAsync(record, {
    [configOutputField]: outputValue,
  });
  console.log(`Scraped:${linkValue}`);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment