Skip to content

Instantly share code, notes, and snippets.

@Jerska
Last active April 30, 2020 15:25
Show Gist options
  • Save Jerska/f3aa5a7509625bb0c1671b687d861964 to your computer and use it in GitHub Desktop.
Save Jerska/f3aa5a7509625bb0c1671b687d861964 to your computer and use it in GitHub Desktop.
new Crawler({
appId: "FIXME",
apiKey: "FIXME",
indexPrefix: "crawler_FIXME_",
rateLimit: 4,
maxUrls: 100,
startUrls: ["FIXME"],
ignoreQueryParams: ["utm_medium", "utm_source", "utm_campaign", "utm_term"],
actions: [
{
indexName: "all",
pathsToMatch: ["https://FIXME/**"],
recordExtractor: ({ url, $, contentLength, fileType, helpers }) => {
// Configuration documentation: https://www.algolia.com/doc/api-reference/crawler/
// Extracting data documentation: https://www.algolia.com/doc/tools/crawler/guides/extracting-data/
/* 1. Helpers */
/**
 * Promotion level copied into every record created by `addRecord`.
 * Reassign this variable before calling `addRecord` to boost the ranking of
 * the records added afterwards (the index settings rank on `desc(promote)`).
 */
let promote = 0;
/**
 * Current hierarchy path, from the broadest level to the most specific one.
 * It starts as `[url.host, url.pathname]` and is stored on each record both as
 * `hierarchy` and, through `buildHierarchyObj`, as `hierarchyObj`.
 * Reassign this variable to change the hierarchy of the records added afterwards.
 */
let hierarchy = [url.host, url.pathname];
/**
 * Transform a hierarchy array into a hierarchy object.
 * Key `lvlN` holds the first N + 1 levels joined with " > ".
 *
 * @param {string[]} [levels=hierarchy] - Levels to convert, broadest first.
 *   Defaults to the current value of the enclosing `hierarchy` variable, so
 *   existing zero-argument calls keep working.
 * @returns {Object<string, string>} One `lvlN` entry per level; `{}` for an
 *   empty array.
 * @example
 * buildHierarchyObj(['www.example.org', 'Questions'])
 * // => { lvl0: 'www.example.org', lvl1: 'www.example.org > Questions' }
 */
function buildHierarchyObj(levels = hierarchy) {
  const res = {};
  levels.forEach((_level, i) => {
    // Each level repeats its ancestors so a facet value is unambiguous.
    res[`lvl${i}`] = levels.slice(0, i + 1).join(" > ");
  });
  return res;
}
/**
 * Return the first truthy candidate, or `null` when there is none.
 * A candidate that is a function is called lazily, only once every earlier
 * candidate has been rejected, and its return value is used in its place.
 * A function that throws is treated as having no value, which lets callers
 * chain lookups that may fail on a missing node.
 * @example
 * pickWithFallback(
 *   $('title').text(),
 *   () => $('meta[name="og:title"]').attr('content').trim()
 * )
 */
function pickWithFallback(...values) {
  // Resolve one candidate to a plain value; a throwing producer yields undefined.
  const resolve = (candidate) => {
    if (typeof candidate !== "function") return candidate;
    try {
      return candidate();
    } catch (_e) {
      return undefined;
    }
  };

  for (let i = 0; i < values.length; i += 1) {
    const picked = resolve(values[i]);
    if (picked) return picked;
  }
  return null;
}
/**
 * Clean whitespaces in a chunk of text: every run of whitespace (including
 * newlines and tabs) collapses to a single space, and both ends are trimmed.
 *
 * @param {*} str - Value to clean; non-string values are converted with `String()`.
 * @returns {string|null|undefined} The cleaned text, or `str` itself when it
 *   is `null` or `undefined`.
 */
function cleanWhitespaces(str) {
  // `== null` also catches undefined (e.g. a missing attribute), which would
  // otherwise be stringified into the literal text "undefined".
  if (str == null) return str;
  return String(str).replace(/\s+/g, " ").trim();
}
/**
 * Strip nodes that are useless for crawling out of the page.
 * For each selector, in order, this calls `$(selector).remove()` on the
 * enclosing `$` (presumably the page's Cheerio instance; confirm against the
 * crawler documentation). Typically used to clean up `content` before
 * reading the body text.
 *
 * @param {...string} selectors - Selectors whose matches are removed.
 */
function removeSelectors(...selectors) {
  selectors.forEach((selector) => {
    $(selector).remove();
  });
}
// Final records array; filled by `addRecord` and returned by the extractor.
const records = [];
// 1-based position of the next record within the page.
let position = 1;
/**
 * Add a record to the records list.
 * The record combines page-level data (`url`, the current `hierarchy` and
 * `promote` values, the URL depth and the record's position in the page)
 * with the caller's attributes.
 *
 * @param {Object} attributes - Extra fields for the record. They are spread
 *   last, so they override any generated field with the same name.
 */
function addRecord(attributes) {
  records.push({
    objectID: `${url.href} ${position}`,
    url: url.href,
    // Snapshot the array: otherwise every record would share one instance and
    // a later in-place change to `hierarchy` would rewrite earlier records.
    hierarchy: [...hierarchy],
    hierarchyObj: buildHierarchyObj(),
    promote,
    // Count only non-empty path segments so that "/docs" and "/docs/" get the
    // same depth instead of the trailing slash counting as an extra level.
    urlDepth: url.pathname.split("/").filter(Boolean).length,
    position,
    ...attributes,
  });
  position += 1;
}
/* 2. Extraction */
console.log(`Crawling "${url.href}"`);
// Remove useless DOM nodes so they do not pollute the body text used below.
removeSelectors("header", "footer", "nav");
// Emit a single record describing the whole page.
addRecord({
  // Title: Open Graph title, then <title>, then the first <h1>, then a placeholder.
  title: cleanWhitespaces(
    pickWithFallback(
      $('meta[property="og:title"]').attr("content"),
      $("head > title").text(),
      () => $("h1").first().text(),
      "No title"
    )
  ),
  // Keywords: comma-separated <meta name="keywords">. A missing tag makes the
  // producer throw, which pickWithFallback turns into the [] fallback.
  keywords: pickWithFallback(
    () =>
      $("meta[name=keywords]")
        .attr("content")
        .split(",")
        .map(cleanWhitespaces)
        // Drop blanks left by trailing or doubled commas, or an empty attribute.
        .filter(Boolean),
    []
  ),
  // Content: meta description when present, otherwise the page's body text.
  // pickWithFallback returns null when every candidate is falsy (the "" fallback
  // included), so default to "" before slicing instead of calling .slice on null.
  content: (
    cleanWhitespaces(
      pickWithFallback(
        $("meta[name=description]").attr("content"),
        $("body").text(),
        ""
      )
    ) || ""
  ).slice(0, 10000),
});
return records;
},
},
],
// Index settings keyed by index name; "all" matches the `indexName` used by
// the action above. NOTE(review): presumably applied only when the index is
// first created — confirm against the crawler documentation.
initialIndexSettings: {
all: {
// Record attributes to search in; every one except `url` is wrapped in
// the `unordered(...)` modifier.
searchableAttributes: [
"unordered(keywords)",
"unordered(title)",
"unordered(hierarchy)",
"unordered(content)",
"url",
],
// Ranks on the `promote`, `urlDepth` and `position` fields that `addRecord`
// writes on every record: highest promote first, then shallowest URL, then
// earliest position in the page.
customRanking: ["desc(promote)", "asc(urlDepth)", "asc(position)"],
// `hierarchyObj` is the `lvlN` object produced by `buildHierarchyObj`.
attributesForFaceting: ["hierarchyObj"],
attributesToHighlight: ["url", "title", "keywords", "hierarchy"],
// `content` can be up to 10000 characters long, so it is snippeted rather
// than highlighted.
attributesToSnippet: ["content"],
},
},
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment