-
-
Save idhamsy/a6c312a0dfcf8d8338bb5b25e0a29bc6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Fetches a webpage and extracts specific or default components (title, headings, paragraphs, etc.).
 *
 * Usage Examples:
 *   1) Single Parameter:
 *      =GETURL("https://www.example.com")
 *      - Extracts the default set of components (title, h1-h6, p) as configured in the script.
 *
 *   2) Two Parameters:
 *      =GETURL("https://www.example.com", "title")
 *      - Extracts only the <title> of the page.
 *
 * The page is fetched with fetchWithRetries() and parsed with parseHtmlRobust().
 * If parsing fails, the HTML is passed through cleanHtmlContent() and parsed
 * again; if that also fails, extraction uses regexFallback() on the raw HTML.
 * Problems are reported as "Error: ..." strings instead of being thrown.
 *
 * @param {string} url The URL of the webpage you want to scrape (e.g. "https://example.com").
 * @param {string} [component] (Optional) The element or metadata to extract, e.g. "title", "h1", "meta:description".
 *   If omitted, the script returns multiple default components, each prefixed with a "[name]" label.
 * @return {string} The extracted content, at most max_character_returned characters long
 *   (the trailing "..." added on truncation is counted against that limit).
 * @customfunction
 */
function GETURL(url, component) {
  // ========== CUSTOMIZABLE SETTINGS ==========
  /** Maximum number of characters returned in the final output. */
  const max_character_returned = 10000;
  /** Whether to follow HTTP redirects automatically (true or false). */
  const follow_redirect = true;
  /** Maximum number of times to try fetching the URL before giving up. */
  const retry_max = 3;
  /** The User-Agent string to send with the request. */
  const user_agent = "Mozilla/5.0";
  /** Components extracted when the 'component' parameter is not provided. */
  const defaultExtractionList = ["title", "h1", "h2", "h3", "h4", "h5", "h6", "p"];

  // ========== HELPERS ==========
  // Extracts one component from the parsed tree, or from the raw HTML when no tree is available.
  function extractOne(root, html, name, fallbackLogPrefix) {
    if (root) {
      return extractComponent(root, name);
    }
    Logger.log(fallbackLogPrefix + name);
    return regexFallback(html, name);
  }

  // Caps text at max_character_returned characters, ellipsis included.
  function capLength(text) {
    if (text.length <= max_character_returned) {
      return text;
    }
    Logger.log("Result exceeded max character limit. Truncating output.");
    const ellipsis = "...";
    const keep = Math.max(0, max_character_returned - ellipsis.length);
    // The outer substring only matters if the limit is set below the ellipsis length.
    return (text.substring(0, keep) + ellipsis).substring(0, max_character_returned);
  }

  // ========== FUNCTION LOGIC ==========
  if (!url) {
    Logger.log("Error: Missing URL parameter.");
    return "Error: Please provide a URL.";
  }

  try {
    Logger.log("GETURL called with URL: " + url +
      (component ? (", component: " + component) : ", no component provided"));

    const htmlContent = fetchWithRetries(url, follow_redirect, retry_max, user_agent);
    if (!htmlContent) {
      Logger.log("Error: Unable to fetch the URL after retries: " + url);
      return "Error: Unable to fetch the provided URL.";
    }

    // Parse as-is first; on failure, repair entities and try once more.
    let root = parseHtmlRobust(htmlContent);
    if (!root) {
      Logger.log("First parse attempt failed. Trying to clean the HTML and re-parse...");
      root = parseHtmlRobust(cleanHtmlContent(htmlContent));
    }

    // No component requested: gather every default component under a "[name]" label.
    if (!component) {
      const sections = [];
      for (let i = 0; i < defaultExtractionList.length; i++) {
        const name = defaultExtractionList[i];
        const extracted = extractOne(root, htmlContent, name,
          "Unable to parse. Using regex fallback for comp: ");
        if (extracted && extracted.trim() !== "") {
          sections.push("[" + name + "]\n" + extracted.trim());
        }
      }
      let combined = sections.join("\n\n").trim();
      if (!combined) {
        Logger.log("No content found for default extraction list in URL: " + url);
        combined = "No content found (default extraction).";
      }
      return capLength(combined);
    }

    // A specific component was requested.
    let result = extractOne(root, htmlContent, component,
      "Unable to parse even after cleaning. Using regex fallback for component: ");
    if (!result || result.trim() === "") {
      Logger.log("No content found for component: " + component + " in URL: " + url);
      result = "No content found for component: " + component;
    }
    return capLength(result);
  } catch (e) {
    Logger.log("Unexpected error in GETURL function: " + e.toString());
    return "Error: An unexpected issue occurred. " + e.toString();
  }
}
/**
 * Attempts to fetch a URL, trying up to retry_max times.
 *
 * An attempt counts as failed when UrlFetchApp.fetch throws or when the
 * response code is outside 200-299 (HTTP exceptions are muted so the code can
 * be inspected). A 500 ms pause is inserted between attempts, but not after
 * the last one, since nothing follows it.
 *
 * @param {string} url The URL to fetch.
 * @param {boolean} follow_redirect Whether to follow redirects.
 * @param {number} retry_max Maximum number of fetch attempts.
 * @param {string} user_agent The User-Agent header to send.
 * @return {string|null} The response body text of the first 2xx response, or null if every attempt failed.
 */
function fetchWithRetries(url, follow_redirect, retry_max, user_agent) {
  const RETRY_DELAY_MS = 500;
  const fetchOptions = {
    followRedirects: follow_redirect,
    muteHttpExceptions: true,
    headers: {
      "User-Agent": user_agent
    }
  };

  for (let attempt = 1; attempt <= retry_max; attempt++) {
    try {
      Logger.log("Attempting fetch #" + attempt + " for URL: " + url);
      const response = UrlFetchApp.fetch(url, fetchOptions);
      const code = response.getResponseCode();
      Logger.log("HTTP response code: " + code + " for URL: " + url);
      if (code >= 200 && code < 300) {
        return response.getContentText();
      }
      Logger.log("Non-2xx response code. Retrying...");
    } catch (e) {
      Logger.log("Error fetching URL: " + url + " on attempt " + attempt + ": " + e.toString());
    }
    // Pause only when another attempt will actually follow.
    if (attempt < retry_max) {
      Utilities.sleep(RETRY_DELAY_MS);
    }
  }

  Logger.log("Failed to fetch URL after " + retry_max + " attempts: " + url);
  return null;
}
/**
 * Tries to turn an HTML string into an XmlService element tree.
 *
 * The markup is first passed through HtmlService.createHtmlOutput(...).getContent()
 * and the resulting string is handed to XmlService.parse(). Any exception raised
 * along the way is logged and reported to the caller as null.
 *
 * @param {string} htmlContent Raw HTML string.
 * @return {XmlService.Element|null} Root element of the parsed document, or null if parsing fails.
 */
function parseHtmlRobust(htmlContent) {
  let root = null;
  try {
    const markup = HtmlService.createHtmlOutput(htmlContent).getContent();
    root = XmlService.parse(markup).getRootElement();
  } catch (err) {
    Logger.log("Error parsing HTML using XmlService: " + err.toString());
    root = null;
  }
  return root;
}
/**
 * Cleans HTML content to fix bare ampersands and unknown entities that can break XmlService parsing.
 *
 * Every '&' that does not start one of the five XML predefined entities
 * (&amp; &lt; &gt; &quot; &apos;) or a numeric character reference
 * (&#123; / &#x1F;) is rewritten to '&amp;'. HTML-only named entities such as
 * '&nbsp;' therefore become '&amp;nbsp;'.
 *
 * @param {string} htmlContent Raw HTML string.
 * @return {string} Cleaned HTML string.
 */
function cleanHtmlContent(htmlContent) {
  Logger.log("Cleaning HTML content to fix invalid entities.");
  // Negative lookahead: leave the '&' alone only when a recognised entity follows it.
  return htmlContent.replace(
    /&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9A-Fa-f]+);)/g,
    "&amp;"
  );
}
/**
 * Extracts content from a parsed document based on the requested component.
 *
 * Dispatch (after trimming and lower-casing the component):
 *   - "meta:<name>" -> getMetaContent(document, "<name>"); everything after the
 *     first colon is kept, so "meta:og:title" looks up "og:title".
 *   - "title"       -> getTagText(document, "title")
 *   - "img"         -> getAllImageSrc(document)
 *   - anything else -> getAllTagText(document, <component>), with a log line
 *     when the tag is not one of the commonly used ones.
 * Any exception raised by the helpers is logged and turned into "".
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} component Which element or meta to extract.
 * @return {string} Extracted content or an empty string if not found.
 */
function extractComponent(document, component) {
  const META_PREFIX = "meta:";
  const KNOWN_TEXT_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li", "ul", "table"];
  try {
    const lc = String(component).trim().toLowerCase();

    if (lc.startsWith(META_PREFIX)) {
      // slice (not split(":")[1]) so names that contain a colon survive intact.
      return getMetaContent(document, lc.slice(META_PREFIX.length));
    }
    if (lc === "title") {
      return getTagText(document, "title");
    }
    if (lc === "img") {
      return getAllImageSrc(document);
    }
    if (KNOWN_TEXT_TAGS.indexOf(lc) === -1) {
      Logger.log("Unknown component: " + component + ". Trying as a tag.");
    }
    return getAllTagText(document, lc);
  } catch (e) {
    Logger.log("Error extracting component (" + component + "): " + e.toString());
    return "";
  }
}
/**
 * Extracts the 'content' of a <meta> tag identified by its 'name' attribute,
 * or by its 'property' attribute when 'name' is absent (e.g. Open Graph tags).
 *
 * Tag names and the attribute value are compared case-insensitively. A matching
 * <meta> without a 'content' attribute is skipped and the search continues.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} metaName The meta name to look for (e.g. "description", "og:title").
 * @return {string} The content attribute if found, otherwise an empty string.
 */
function getMetaContent(document, metaName) {
  const wanted = String(metaName).toLowerCase();
  const nodes = document.getDescendants();

  for (let i = 0; i < nodes.length; i++) {
    // Node kinds are identified by the XmlService.ContentTypes enum; comparing
    // against `XmlService.Element` never matches, so nothing was ever found.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const element = nodes[i].asElement();
    if (element.getName().toLowerCase() !== "meta") {
      continue;
    }
    const keyAttr = element.getAttribute("name") || element.getAttribute("property");
    if (!keyAttr || keyAttr.getValue().toLowerCase() !== wanted) {
      continue;
    }
    const contentAttr = element.getAttribute("content");
    if (contentAttr) {
      return contentAttr.getValue();
    }
  }
  return "";
}
/**
 * Returns the text of the first element with the given tag name.
 *
 * Descendants are scanned in the order getDescendants() yields them and tag
 * names are compared case-insensitively.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} tagName e.g. "title", "p".
 * @return {string} The getText() value of the first matching element, or empty if not found.
 */
function getTagText(document, tagName) {
  const wanted = String(tagName).toLowerCase();
  const nodes = document.getDescendants();

  for (let i = 0; i < nodes.length; i++) {
    // Node kinds are identified by the XmlService.ContentTypes enum; comparing
    // against `XmlService.Element` never matches, so nothing was ever found.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const element = nodes[i].asElement();
    if (element.getName().toLowerCase() === wanted) {
      // NOTE(review): getText() presumably covers only the element's own text,
      // not text inside nested child elements — confirm that is what is wanted.
      return element.getText();
    }
  }
  return "";
}
/**
 * Returns the text of every element with the given tag name, one per line.
 *
 * Descendants are scanned in the order getDescendants() yields them and tag
 * names are compared case-insensitively. The joined result is trimmed.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @param {string} tagName e.g. "p", "h1", "div".
 * @return {string} The getText() values of all matching elements joined with "\n".
 */
function getAllTagText(document, tagName) {
  const wanted = String(tagName).toLowerCase();
  const texts = [];
  const nodes = document.getDescendants();

  for (let i = 0; i < nodes.length; i++) {
    // Node kinds are identified by the XmlService.ContentTypes enum; comparing
    // against `XmlService.Element` never matches, so nothing was ever found.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const element = nodes[i].asElement();
    if (element.getName().toLowerCase() === wanted) {
      // NOTE(review): getText() presumably covers only the element's own text,
      // not text inside nested child elements — confirm that is what is wanted.
      texts.push(element.getText());
    }
  }
  return texts.join("\n").trim();
}
/**
 * Returns all image src attributes from the parsed document.
 *
 * <img> elements without a src attribute are skipped. Tag names are compared
 * case-insensitively.
 *
 * @param {XmlService.Element} document Parsed HTML document root.
 * @return {string} All <img src="..."> URLs, one per line.
 */
function getAllImageSrc(document) {
  const sources = [];
  const nodes = document.getDescendants();

  for (let i = 0; i < nodes.length; i++) {
    // Node kinds are identified by the XmlService.ContentTypes enum; comparing
    // against `XmlService.Element` never matches, so nothing was ever found.
    if (nodes[i].getType() !== XmlService.ContentTypes.ELEMENT) {
      continue;
    }
    const element = nodes[i].asElement();
    if (element.getName().toLowerCase() !== "img") {
      continue;
    }
    const srcAttr = element.getAttribute("src");
    if (srcAttr) {
      sources.push(srcAttr.getValue());
    }
  }
  return sources.join("\n");
}
/**
 * Regex fallback for when XmlService parsing fails.
 * Useful for simpler elements like <title>, headings, <p>, etc.
 *
 * Supported components (case-insensitive, surrounding whitespace ignored):
 *   - "title": inner text of the first <title>.
 *   - "meta:<name>": 'content' of the first <meta> whose 'name' or 'property'
 *     equals <name>. Everything after the first colon is the name, attributes
 *     may appear in any order, and the name is compared as plain text (it is
 *     never compiled into a regular expression).
 *   - h1-h6, p, div, li, ul, table: text of each matching element with inner
 *     tags stripped, one per line; elements with no text are dropped.
 *   - "img": every <img> src value, one per line.
 * Anything else yields "".
 *
 * @param {string} html Raw HTML string.
 * @param {string} component The component name to extract.
 * @return {string} Extracted text or an empty string if not found.
 */
function regexFallback(html, component) {
  Logger.log("Using regex fallback for component: " + component);
  const lc = String(component).trim().toLowerCase();

  // Fallback for <title>
  if (lc === "title") {
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    return titleMatch && titleMatch[1] ? titleMatch[1].trim() : "";
  }

  // Fallback for meta:xxx
  const META_PREFIX = "meta:";
  if (lc.startsWith(META_PREFIX)) {
    const metaName = lc.slice(META_PREFIX.length);
    const metaTagRegex = /<meta\b[^>]*>/gi;
    let tag;
    while ((tag = metaTagRegex.exec(html)) !== null) {
      // The backreference \1 makes the value end at the same quote that opened it,
      // so an apostrophe inside a double-quoted value does not cut it short.
      const keyMatch = tag[0].match(/\s(?:name|property)\s*=\s*(["'])(.*?)\1/i);
      if (!keyMatch || keyMatch[2].toLowerCase() !== metaName) {
        continue;
      }
      const contentMatch = tag[0].match(/\scontent\s*=\s*(["'])([\s\S]*?)\1/i);
      if (contentMatch) {
        return contentMatch[2].trim();
      }
    }
    return "";
  }

  // Headings, paragraphs, divs, lists, tables
  if (["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li", "ul", "table"].indexOf(lc) !== -1) {
    // Attributes must be separated from the tag name by whitespace, so "<p" does
    // not also match "<pre>" and "<li" does not match "<link>".
    const tagRegex = new RegExp("<" + lc + "(?:\\s[^>]*)?>([\\s\\S]*?)<\\/" + lc + ">", "gi");
    const texts = [];
    let m;
    while ((m = tagRegex.exec(html)) !== null) {
      const textContent = m[1].replace(/<[^>]+>/g, "").trim();
      if (textContent) {
        texts.push(textContent);
      }
    }
    return texts.join("\n");
  }

  // Fallback for img
  if (lc === "img") {
    // "\ssrc" keeps attributes such as data-src from being mistaken for src.
    const imgRegex = /<img\b[^>]*?\ssrc\s*=\s*(["'])(.*?)\1/gi;
    const sources = [];
    let im;
    while ((im = imgRegex.exec(html)) !== null) {
      if (im[2]) {
        sources.push(im[2]);
      }
    }
    return sources.join("\n");
  }

  // If unknown component
  return "";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment