Last active
February 14, 2024 18:15
-
-
Save nocodesupplyco/4afd4808f1d2f963cd782e940c072ba9 to your computer and use it in GitHub Desktop.
Scrape Website Data w/ Airtable Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Content types offered in the "Content Type to Scrape" dropdown, kept as
// [label, value] pairs and expanded into the { label, value } option objects.
const contentTypeChoices = [
  ["Meta Tags", "meta"],
  ["Emails", "emails"],
  ["Phones", "phones"],
  ["Dates", "dates"],
  ["Lists", "lists"],
  ["Headings", "headings"],
  ["Text", "text"],
  ["Images", "images"],
  ["Links", "links"],
  ["Social", "social"],
  ["Rawdom", "rawdom"],
].map(([label, value]) => ({ label, value }));

// View and field pickers are all tied to the table chosen in "tableSelect";
// this appends that shared parentTable key to a setting's options.
const withinSelectedTable = (options) => ({ ...options, parentTable: "tableSelect" });

// Script settings: each item below becomes one entry in the settings panel and
// is read back from `config` under the key given as its first argument.
const config = input.config({
  title: "Scrape Website Data",
  description:
    "Scrape a website URL in each record within a specific view to capture some specific content types using [Rasterwise.com](https://www.rasterwise.com/). Script created by [Base Scripts](https://nocodesupply.co/basescripts) — use at your own discretion.",
  items: [
    input.config.text("apiKey", {
      label: "Rasterwise API Key",
      description: "Sign up for a Rasterwise account, and retrieve your API key from your account page.",
    }),
    input.config.table("tableSelect", {
      label: "Table",
      description: "Select the table where the website URLs are stored to scraped.",
    }),
    input.config.view(
      "viewSelect",
      withinSelectedTable({
        label: "View",
        description: "Select the view where the website URLs are stored to scraped.",
      })
    ),
    input.config.field(
      "linkSelect",
      withinSelectedTable({
        label: "Link Field",
        description: "Select the URL field of the website to scrape.",
      })
    ),
    input.config.select("contentType", {
      label: "Content Type to Scrape",
      description:
        "Select the type of content you wish to extract from the website. See the docs.rasterwise.com for more ways to modify the script for scrape specificity.",
      options: contentTypeChoices,
    }),
    input.config.text("outputTarget", {
      label: "Response JSON Target",
      description:
        "Specify the JSON object to target from the response that is output as a console log in your first test. For example, if using the 'Meta Tags' content type to get a meta title you would enter 'results.metadata.title'",
    }),
    input.config.field(
      "outputField",
      withinSelectedTable({
        label: "Output Field",
        description: "Select the field where the extracted content should be added.",
      })
    ),
    input.config.field(
      "errorSelect",
      withinSelectedTable({
        label: "Scrape Error Field",
        description: "Select a checkbox field that can get checked if the API returns an error.",
      })
    ),
  ],
});
// Plain-value settings are unpacked straight from the config object
const {
  apiKey: configAPIKey,
  contentType: configContentType,
  outputTarget: configOutputTarget,
} = config;
// Field settings are only ever used by name (as record keys), so keep just the names
const [configOutputField, configLink, configError] = ["outputField", "linkSelect", "errorSelect"].map(
  (settingKey) => config[settingKey].name
);
// Names of the selected table and view
const configTable = config.tableSelect.name;
const configView = config.viewSelect.name;
// Resolve the table and view to work against
const table = base.getTable(configTable);
const view = table.getView(configView);
// Query the view's records, loading only the link field
const link = await view.selectRecordsAsync({ fields: [configLink] });
/**
 * Safely read a nested value out of parsed JSON using a path string.
 *
 * The path may use dot notation ("results.metadata.title") and bracket
 * notation for array indices or quoted keys ("results.links[0].href",
 * 'results["metadata"].title'). Whitespace around the path and around each
 * segment is ignored, since the path is typed by hand into a settings field.
 *
 * Only own properties are followed, so a path such as "results.toString"
 * yields undefined instead of an inherited function.
 *
 * @param {*} json - Parsed JSON value to read from.
 * @param {string} path - Path to the target value.
 * @returns {*} The value found at the path (which may itself be null), or
 *   undefined when the path is empty, is not a string, or any segment is missing.
 */
function getNestedJsonValue(json, path) {
  if (typeof path !== "string") {
    return undefined;
  }

  // Normalize bracket segments to dot segments: a[0]["b"] -> a.0.b
  const segments = path
    .replace(/\[\s*["']?([^\]"']*)["']?\s*\]/g, ".$1")
    .split(".")
    .map((segment) => segment.trim())
    .filter((segment) => segment !== "");

  // An empty path targets nothing; returning the whole document would be misleading
  if (segments.length === 0) {
    return undefined;
  }

  let current = json;
  for (const segment of segments) {
    // null/undefined cannot be indexed; stop instead of throwing
    if (current === null || current === undefined) {
      return undefined;
    }
    if (!Object.prototype.hasOwnProperty.call(current, segment)) {
      return undefined;
    }
    current = current[segment];
  }
  return current;
}
// Scrape the URL stored in each record and write the targeted part of the
// response into the output field. Records whose request fails get the error
// checkbox ticked; the run then carries on with the next record.
for (const record of link.records) {
  // Get the URL from the link field
  const linkValue = record.getCellValue(configLink);
  // An empty cell has nothing to scrape; skip it rather than spend an API call on "null"
  if (!linkValue) {
    console.warn("Skipping record with empty link field:", record.id);
    continue;
  }

  // Build the API URL. Every value is percent-encoded so that characters such
  // as "&", "#" or "?" inside the target URL cannot corrupt the query string.
  const query = [
    ["apikey", configAPIKey],
    ["url", linkValue],
    ["extract", configContentType],
  ]
    .map(([key, value]) => key + "=" + encodeURIComponent(value))
    .join("&");
  const fetchURL = "https://api.rasterwise.com/v1/get-scrape?" + query;

  // Make the API request and parse the body. A thrown network or parse error
  // is treated like a non-OK response so one bad record cannot abort the run.
  let json;
  let requestFailed = false;
  try {
    const response = await remoteFetchAsync(fetchURL);
    if (response.ok) {
      json = await response.json();
    } else {
      console.error(response);
      requestFailed = true;
    }
  } catch (err) {
    console.error("Request failed for " + linkValue, err);
    requestFailed = true;
  }

  if (requestFailed) {
    // Flag the record and continue to the next loop item
    await table.updateRecordAsync(record, {
      [configError]: true,
    });
    continue;
  }

  console.log(json);

  // Extract the value using the JSON path
  const outputValue = getNestedJsonValue(json, configOutputTarget);
  // Nothing usable at the path (missing or null): leave the record untouched
  // instead of clearing whatever the output field currently holds.
  if (outputValue === undefined || outputValue === null) {
    console.warn("No value found at path:", configOutputTarget);
    continue;
  }

  // Update the record with the response data
  await table.updateRecordAsync(record, {
    [configOutputField]: outputValue,
  });
  console.log("Scraped:" + linkValue);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment