Skip to content

Instantly share code, notes, and snippets.

@idhamsy
Last active December 26, 2024 03:36
Show Gist options
  • Save idhamsy/a6c312a0dfcf8d8338bb5b25e0a29bc6 to your computer and use it in GitHub Desktop.
Save idhamsy/a6c312a0dfcf8d8338bb5b25e0a29bc6 to your computer and use it in GitHub Desktop.
/**
 * Downloads a web page and returns text extracted from it, for use as a
 * Google Sheets custom function.
 *
 * Usage Examples:
 *   =GETURL("https://www.example.com")
 *     Returns every component in the built-in default list (title, h1-h6, p),
 *     each section prefixed with a "[name]" label.
 *   =GETURL("https://www.example.com", "title")
 *     Returns only the requested component.
 *
 * The page is parsed first; when parsing fails (even after entity cleanup)
 * extraction falls back to regular expressions on the raw HTML.
 *
 * @param {string} url The URL of the webpage you want to scrape (e.g. "https://example.com").
 * @param {string} [component] (Optional) The element or metadata to extract,
 *     e.g. "title", "h1", "meta:description". When omitted, the default list is used.
 * @return {string} The extracted content, or a human-readable error / "no content" message.
 * @customfunction
 */
function GETURL(url, component) {
  // ========== CUSTOMIZABLE SETTINGS ==========
  // Upper bound on the length of the returned text (before the "..." marker).
  const max_character_returned = 10000;
  // Whether HTTP redirects are followed automatically (true or false).
  const follow_redirect = true;
  // How many fetch attempts are made before giving up.
  const retry_max = 3;
  // User-Agent header sent with the request.
  const user_agent = "Mozilla/5.0";
  // Components extracted when no 'component' argument is supplied.
  const defaultExtractionList = ["title", "h1", "h2", "h3", "h4", "h5", "h6", "p"];

  // ========== FUNCTION LOGIC ==========
  // Cuts overly long output down to the configured limit and marks the cut.
  const clip = function (text) {
    if (text.length <= max_character_returned) {
      return text;
    }
    Logger.log("Result exceeded max character limit. Truncating output.");
    return text.substring(0, max_character_returned) + "...";
  };

  // Guard: nothing to do without a URL.
  if (!url) {
    Logger.log("Error: Missing URL parameter.");
    return "Error: Please provide a URL.";
  }

  try {
    const requestSuffix = component
      ? ", component: " + component
      : ", no component provided";
    Logger.log("GETURL called with URL: " + url + requestSuffix);

    // Download the page (with retries).
    const rawHtml = fetchWithRetries(url, follow_redirect, retry_max, user_agent);
    if (!rawHtml) {
      Logger.log("Error: Unable to fetch the URL after retries: " + url);
      return "Error: Unable to fetch the provided URL.";
    }

    // Parse as-is first; on failure, clean the markup and try once more.
    let root = parseHtmlRobust(rawHtml);
    if (!root) {
      Logger.log("First parse attempt failed. Trying to clean the HTML and re-parse...");
      root = parseHtmlRobust(cleanHtmlContent(rawHtml));
    }

    // --- A specific component was requested ---
    if (component) {
      let single;
      if (root) {
        single = extractComponent(root, component);
      } else {
        Logger.log("Unable to parse even after cleaning. Using regex fallback for component: " + component);
        single = regexFallback(rawHtml, component);
      }
      if (!single || single.trim() === "") {
        Logger.log("No content found for component: " + component + " in URL: " + url);
        single = "No content found for component: " + component;
      }
      return clip(single);
    }

    // --- No component: walk the default list and label each non-empty section ---
    const sections = [];
    defaultExtractionList.forEach(function (name) {
      let piece;
      if (root) {
        piece = extractComponent(root, name);
      } else {
        Logger.log("Unable to parse. Using regex fallback for comp: " + name);
        piece = regexFallback(rawHtml, name);
      }
      if (piece && piece.trim() !== "") {
        sections.push("[" + name + "]\n" + piece.trim());
      }
    });

    let combined = sections.join("\n\n").trim();
    if (!combined) {
      Logger.log("No content found for default extraction list in URL: " + url);
      combined = "No content found (default extraction).";
    }
    return clip(combined);
  } catch (e) {
    // Any failure is reported in the cell instead of being thrown.
    Logger.log("Unexpected error in GETURL function: " + e.toString());
    return "Error: An unexpected issue occurred. " + e.toString();
  }
}
/**
 * Fetches a URL, retrying on thrown errors and on non-2xx responses.
 *
 * HTTP error statuses do not throw (muteHttpExceptions is set), so they are
 * detected from the response code and retried like any other failure.
 * A short pause is inserted between attempts, but not after the last one,
 * so a permanently failing URL does not waste time before returning.
 *
 * @param {string} url The URL to fetch.
 * @param {boolean} follow_redirect Whether to follow redirects.
 * @param {number} retry_max Maximum number of fetch attempts.
 * @param {string} user_agent The User-Agent header to send.
 * @return {string|null} The response body text on a 2xx response, or null if
 *     every attempt failed (or retry_max is less than 1).
 */
function fetchWithRetries(url, follow_redirect, retry_max, user_agent) {
  const retryDelayMs = 500; // brief delay between attempts
  const fetchOptions = {
    followRedirects: follow_redirect,
    muteHttpExceptions: true,
    headers: {
      "User-Agent": user_agent
    }
  };

  for (let attempt = 1; attempt <= retry_max; attempt++) {
    const hasMoreAttempts = attempt < retry_max;
    try {
      Logger.log("Attempting fetch #" + attempt + " for URL: " + url);
      const response = UrlFetchApp.fetch(url, fetchOptions);
      const code = response.getResponseCode();
      Logger.log("HTTP response code: " + code + " for URL: " + url);
      if (code >= 200 && code < 300) {
        return response.getContentText();
      }
      Logger.log("Non-2xx response code." + (hasMoreAttempts ? " Retrying..." : ""));
    } catch (e) {
      Logger.log("Error fetching URL: " + url + " on attempt " + attempt + ": " + e.toString());
    }
    // Only wait when another attempt will actually follow.
    if (hasMoreAttempts) {
      Utilities.sleep(retryDelayMs);
    }
  }

  Logger.log("Failed to fetch URL after " + retry_max + " attempts: " + url);
  return null;
}
/**
 * Tries to turn an HTML string into a parsed XML tree.
 *
 * The markup is first passed through HtmlService.createHtmlOutput(...).getContent()
 * and the resulting string is handed to XmlService.parse. Any exception is
 * logged and reported to the caller as null rather than thrown.
 *
 * @param {string} htmlContent Raw HTML string.
 * @return {XmlService.Element|null} Root element of the parsed document, or null if parsing fails.
 */
function parseHtmlRobust(htmlContent) {
  let rootElement = null;
  try {
    const markup = HtmlService.createHtmlOutput(htmlContent).getContent();
    rootElement = XmlService.parse(markup).getRootElement();
  } catch (e) {
    Logger.log("Error parsing HTML using XmlService: " + e.toString());
    rootElement = null;
  }
  return rootElement;
}
/**
 * Escapes ampersands that would otherwise trip up XML parsing.
 *
 * Every '&' is rewritten to '&amp;' unless it already begins one of:
 * &amp; &lt; &gt; &quot; &apos;, a decimal reference (&#123;) or a
 * hexadecimal reference (&#x1F;). Other named entities such as &nbsp;
 * are therefore escaped too (they become '&amp;nbsp;').
 *
 * @param {string} htmlContent Raw HTML string.
 * @return {string} The HTML with stray ampersands escaped.
 */
function cleanHtmlContent(htmlContent) {
  Logger.log("Cleaning HTML content to fix invalid entities.");
  // '&' not followed by one of the entity forms listed above.
  const strayAmpersand = /&(?!(amp;|lt;|gt;|quot;|apos;|#[0-9]+;|#[xX][0-9A-Fa-f]+;))/g;
  const escapedHtml = htmlContent.replace(strayAmpersand, '&amp;');
  return escapedHtml;
}
/**
 * Extracts content from a parsed document based on the requested component.
 *
 * Dispatch rules (component is matched case-insensitively):
 *   - "meta:<name>"  -> content of the meta tag with that name. Everything
 *                       after the "meta:" prefix is the name, so names that
 *                       themselves contain a colon (e.g. "meta:twitter:card")
 *                       are passed through intact.
 *   - "title"        -> text of the first <title> element.
 *   - "img"          -> src attributes of all <img> elements.
 *   - anything else  -> text of all elements with that tag name; tags outside
 *                       the known list are logged as unknown but still tried.
 *
 * Any exception (including a non-string component) is logged and results in
 * an empty string.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} component Which element or meta to extract.
 * @return {string} Extracted content or an empty string if not found.
 */
function extractComponent(document, component) {
  const metaPrefix = "meta:";
  const knownTextTags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li", "ul", "table"];
  try {
    const lc = component.toLowerCase();
    if (lc.startsWith(metaPrefix)) {
      // Slice instead of split(":")[1] so colons inside the name survive.
      return getMetaContent(document, lc.slice(metaPrefix.length));
    }
    if (lc === "title") {
      return getTagText(document, "title");
    }
    if (lc === "img") {
      return getAllImageSrc(document);
    }
    if (knownTextTags.indexOf(lc) === -1) {
      Logger.log("Unknown component: " + component + ". Trying as a tag.");
    }
    return getAllTagText(document, lc);
  } catch (e) {
    Logger.log("Error extracting component (" + component + "): " + e.toString());
    return "";
  }
}
/**
 * Extracts the 'content' attribute of the first meta tag whose 'name'
 * attribute matches metaName (case-insensitively) and which has a
 * 'content' attribute.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} metaName The meta name to look for (e.g. "description").
 * @return {string} The content attribute if found, otherwise an empty string.
 */
function getMetaContent(document, metaName) {
  const wantedName = String(metaName).toLowerCase();
  const metas = document.getDescendants()
    // getType() yields a ContentType enum value, so compare against
    // XmlService.ContentTypes.ELEMENT to keep element nodes only.
    .filter(function (node) {
      return node.getType() === XmlService.ContentTypes.ELEMENT;
    })
    .map(function (node) {
      return node.asElement();
    })
    // Tag names are compared case-insensitively, like the other helpers do.
    .filter(function (elem) {
      return elem.getName().toLowerCase() === "meta";
    });

  for (let i = 0; i < metas.length; i++) {
    const nameAttr = metas[i].getAttribute("name");
    if (!nameAttr || nameAttr.getValue().toLowerCase() !== wantedName) {
      continue;
    }
    const contentAttr = metas[i].getAttribute("content");
    if (contentAttr) {
      return contentAttr.getValue();
    }
  }
  return "";
}
/**
 * Returns the text content of the first occurrence of a specific tag.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} tagName Lower-case tag name, e.g. "title", "p".
 * @return {string} The text of the first matching element, or an empty string if none is found.
 */
function getTagText(document, tagName) {
  const nodes = document.getDescendants();
  for (let i = 0; i < nodes.length; i++) {
    // getType() yields a ContentType enum value; skip anything that is not
    // an element (text, comments, ...) before touching element-only methods.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const elem = nodes[i].asElement();
    if (elem.getName().toLowerCase() === tagName) {
      return elem.getText();
    }
  }
  return "";
}
/**
 * Returns the text of every element with the given tag name, one element per
 * line, with leading/trailing whitespace of the combined result removed.
 *
 * NOTE(review): getText() appears to return only an element's own text, not
 * text inside nested child elements (e.g. <p>a <b>b</b></p>) — confirm
 * whether getValue() would be the better choice here.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} tagName Lower-case tag name, e.g. "p", "h1", "div".
 * @return {string} Newline-joined text of all matching elements ("" if none).
 */
function getAllTagText(document, tagName) {
  const texts = document.getDescendants()
    // getType() yields a ContentType enum value, so compare against
    // XmlService.ContentTypes.ELEMENT to keep element nodes only.
    .filter(function (node) {
      return node.getType() === XmlService.ContentTypes.ELEMENT;
    })
    .map(function (node) {
      return node.asElement();
    })
    .filter(function (elem) {
      return elem.getName().toLowerCase() === tagName;
    })
    .map(function (elem) {
      return elem.getText();
    });
  return texts.join("\n").trim();
}
/**
 * Returns the src attribute of every <img> element in the parsed document.
 * Images without a src attribute are skipped.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @return {string} All <img src="..."> URLs, one per line ("" if none).
 */
function getAllImageSrc(document) {
  const srcs = [];
  const nodes = document.getDescendants();
  for (let i = 0; i < nodes.length; i++) {
    // getType() yields a ContentType enum value; skip non-element nodes
    // before calling element-only methods.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const elem = nodes[i].asElement();
    if (elem.getName().toLowerCase() !== "img") {
      continue;
    }
    const srcAttr = elem.getAttribute("src");
    if (srcAttr) {
      srcs.push(srcAttr.getValue());
    }
  }
  return srcs.join("\n");
}
/**
 * Regex fallback for when XmlService parsing fails.
 * Works directly on the raw HTML string and supports:
 *   - "title"        -> trimmed inner text of the first <title>.
 *   - "meta:<name>"  -> trimmed content of the first <meta> whose name
 *                       attribute equals <name> (case-insensitive). The name
 *                       may contain colons (e.g. "meta:twitter:card") and the
 *                       name/content attributes may appear in either order.
 *   - h1-h6, p, div, li, ul, table -> inner text of every such element, with
 *                       nested tags stripped, one element per line.
 *   - "img"          -> src value of every <img>, one per line.
 * Any other component yields an empty string.
 *
 * @param {string} html Raw HTML string.
 * @param {string} component The component name to extract.
 * @return {string} Extracted text or an empty string if not found.
 */
function regexFallback(html, component) {
  Logger.log("Using regex fallback for component: " + component);
  const lc = component.toLowerCase();

  // Fallback for <title>
  if (lc === "title") {
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    return titleMatch && titleMatch[1] ? titleMatch[1].trim() : "";
  }

  // Fallback for meta:xxx
  const metaPrefix = "meta:";
  if (lc.startsWith(metaPrefix)) {
    // Keep everything after the prefix so names containing ':' are not cut.
    const metaName = lc.slice(metaPrefix.length);

    // Reads a quoted attribute value out of a single tag's source text.
    // attrName is always a fixed literal here, so it is safe in a RegExp.
    const readAttribute = function (tagSource, attrName) {
      const attrRegex = new RegExp("\\s" + attrName + "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')", "i");
      const attrMatch = tagSource.match(attrRegex);
      if (!attrMatch) {
        return null;
      }
      return attrMatch[1] !== undefined ? attrMatch[1] : attrMatch[2];
    };

    // The requested name is compared as a plain string (never compiled into
    // a pattern), so special characters in it cannot break or alter the regex.
    const metaTagRegex = /<meta\s[^>]*>/gi;
    let tagMatch;
    while ((tagMatch = metaTagRegex.exec(html)) !== null) {
      const nameValue = readAttribute(tagMatch[0], "name");
      if (nameValue === null || nameValue.toLowerCase() !== metaName) {
        continue;
      }
      const contentValue = readAttribute(tagMatch[0], "content");
      if (contentValue !== null) {
        return contentValue.trim();
      }
    }
    return "";
  }

  // Headings, paragraphs, divs, lists, tables
  if (["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li", "ul", "table"].indexOf(lc) !== -1) {
    // The opening tag must be exactly <tag> or <tag ...attributes>, so that
    // e.g. "p" does not match <pre> and "li" does not match <link>.
    const tagRegex = new RegExp("<" + lc + "(?:\\s[^>]*)?>([\\s\\S]*?)<\\/" + lc + ">", "gi");
    const matches = [];
    let m;
    while ((m = tagRegex.exec(html)) !== null) {
      const textContent = m[1].replace(/<[^>]+>/g, '').trim();
      if (textContent) {
        matches.push(textContent);
      }
    }
    return matches.join("\n");
  }

  // Fallback for img
  if (lc === "img") {
    const imgRegex = /<img[^>]+src=["']([^"']+)["']/gi;
    const imgMatches = [];
    let im;
    while ((im = imgRegex.exec(html)) !== null) {
      imgMatches.push(im[1]);
    }
    return imgMatches.join("\n");
  }

  // If unknown component
  return "";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment