Skip to content

Instantly share code, notes, and snippets.

@mbaersch
Last active August 21, 2023 10:33
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mbaersch/117faa0513ebbcbeeeb8d2dbe83168f0 to your computer and use it in GitHub Desktop.
Save mbaersch/117faa0513ebbcbeeeb8d2dbe83168f0 to your computer and use it in GitHub Desktop.
JavaScript Variable für Google Tag Manager zur Ermittlung eines Bot Markers (zur Verwendung als Benutzerdefinierte Dimension)
function(){
//----------------------------------------------------------------
// Erkennung von Crawlern anhand User-Agent oder Feature-Detection
//----------------------------------------------------------------
//Soll ein einmal bestandender Test innerhalb der Session reichen? Dann hier einschalten
var cache2Session = true;
// --- Ende Setup ---
//Bei mehreren Aufrufen auf einer Seite das letzte Ergebnis wiederverwenden
if ((typeof(window.gtmCachedBotScore) !== 'undefined')) return window.gtmCachedBotScore;
//Bei aktivem Session-Caching vorhandenen Wert prüfen und zurückgeben
cache2Session = cache2Session && (window.sessionStorage != undefined);
if (cache2Session == true) {
var sessionCachedBotScore = sessionStorage.getItem("sessionCachedBotScore");
if (sessionCachedBotScore == "OK") return "OK";
}
//-----------------------------------------------------
// Prüfung auf User-Agents bekannter rendernder Crawler
//-----------------------------------------------------
var agentString = window.navigator.userAgent;
var bots_ua = new Array(
//user-agents for known rendering crawlers from search engines and other services
"Googlebot",
"AdsBot-Google",
"Mediapartners-Google",
"Google Search Console",
"Chrome-Lighthouse",
"DuckDuckBot",
"JobboerseBot",
"woobot",
"PingdomPageSpeed",
"PagePeeker",
"Refindbot",
"HubSpot",
"Yandex",
"Investment Crawler",
"BingPreview",
"Bingbot",
"bingbot",
"AdIdxBot",
"MicrosoftPreview",
"Baiduspider",
"Sogou",
"SISTRIX",
"facebookexternalhit",
"Site-Shot",
"wkhtmltoimage",
"SMTBot",
"PetalBot",
"AhrefsBot",
"avalex",
"RyteBot",
"SemrushBot",
"Cookiebot",
"Seekport Crawler",
"Cocolyzebot",
"Veoozbot",
"YisouSpider",
"Elisabot",
"ev-crawler",
"screeenly-bot",
"Cincraw",
"Applebot",
"headline.com",
"SeekportBot",
"BitSightBot",
"BrightEdge",
"Google-InspectionTool",
"Pumoxbot",
"INTL-UI-BOT",
"GPTBot"
);
var rs = "";
for (i=0;i<bots_ua.length;i++) {
var bt = bots_ua[i];
if (RegExp(bt).test(agentString)) {rs = bt; break;}
}
if (rs) {
rs = rs.split("|")[0].toUpperCase()+"_UA";
} else {
//Crawler, Spider, Bot im Agent String suchen
if (/crawler/.test(agentString.toLowerCase())) rs = "POTENTIAL_CRAWLER_UA";
else if (/spider/.test(agentString.toLowerCase())) rs = "POTENTIAL_SPIDER_UA";
else if (/bot/.test(agentString.toLowerCase()) &&
!/cubot/.test(agentString.toLowerCase())) rs = "POTENTIAL_BOT_UA";
}
//------------------------------------------------------------
// Headless Browser anhand User-Agent oder Merkmalen erkennen
//------------------------------------------------------------
//Genutzte Quellen:
// - https://github.com/antoinevastel/fpscanner/blob/master/src/fpScanner.js
// - https://github.com/infosimples/detect-headless
//Zu Wirksamkeit siehe: https://de.slideshare.net/SergeyShekyan/shekyan-zhang-owasp
function iFrameChrome() {
//Test kann nur durchgefuehrt werden, wenn schon ein body existiert, sonst per Rueckgabewert uebergehen
if (!document.body) return 'too_early';
var iframe = document.createElement('iframe');
iframe.srcdoc = 'blank page';
document.body.appendChild(iframe);
var result = typeof iframe.contentWindow.chrome;
iframe.remove();
return result;
}
if (rs == "") {
var canvas = document.createElement('canvas');
var gl = canvas.getContext('webgl');
var vendor = '';
var renderer = '';
try {
var canvas = document.createElement('canvas');
var gl = canvas.getContext('webgl');
var debugInfo = gl.getExtension('WEBGL_debug_renderer_info');
vendor = gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL);
renderer = gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL);
} catch (e) { }
var iScreenAvailWidth = window.screen.availWidth;
var iScreenAvailHeight = window.screen.availHeight;
var iScreenWidth = window.screen.width;
var iScreenHeight = window.screen.height;
// Querformat Test
if((iScreenAvailWidth > iScreenAvailHeight && iScreenWidth < iScreenHeight) || (iScreenAvailWidth < iScreenAvailHeight && iScreenWidth > iScreenHeight)) {
var iScreenAvailWidth = window.screen.availHeight;
var iScreenAvailHeight = window.screen.availWidth;
}
if (/HeadlessChrome/.test(agentString)) rs = "HEADCHR_UA";
else if (/MSIE 5.0/.test(agentString)) rs = "IE5_UA";
else if (/PhantomJS/.test(agentString)) rs = "PHANTOM_UA";
else if ((/Chrome/.test(agentString)) && (navigator.webdriver)) rs = "WEBDRIVER";
//Der folgende "iframe"-Test generiert "In App Browser-Treffer", die offenbar von FB und Instagram etc. stammen.
//User oder Preview? Unklar. Siehe https://firmhouse.com/blog/filtering-facebook-search-spiders-bots-and-other-automated-requests-fb_iab/
//Hier klingt es nach Bot, aber mir fehlt der Glaube an die Herleitung und Tests haben reale Besuche als false Positive enthalten,
//daher die Zusatzprüfung auf FB_IAB im User Agent
else if ((/Chrome/.test(agentString)) && (!/FB_IAB/.test(agentString)) && (iFrameChrome() === 'undefined')) rs = "HEADCHR_IFRAME";
else if (!/Trident|MSIE|Edge/.test(agentString) && (navigator.languages === "")) rs = "PHANTOM_LANGUAGE";
else if (window.external && window.external.toString && window.external.toString().indexOf('Sequentum') > -1) rs = "SEQUENTUM";
else if (iScreenAvailWidth > iScreenWidth && iScreenAvailHeight > iScreenHeight) rs = "PHANTOM_WINDOW_HEIGHT";
//Hier sind Fehltreffer nicht ausgeschlossen, da dieses Ergebnis sich innerhalb einer Sitzung auch mit "OK" abwechseln kann.
//Daher ist oben das Caching auf Sessionbasis vorgesehen, sobald alle Tests einmal mit "OK" bestanden wurden
else if ((window.outerHeight === 0) && (window.outerWidth === 0)) rs = "NO_OUTER_DIMENSION";
else if ((window.screen.height <= 100) && (window.screen.width <= 100)) rs = "DEVICE_TOO_SMALL";
else if (('_Selenium_IDE_Recorder' in window) || ('callSelenium' in window) || ('_selenium' in window) ||
('__webdriver_script_fn' in document) || ('__driver_evaluate' in document) ||
('__webdriver_evaluate' in document) || ('__selenium_evaluate' in document) || ('__fxdriver_evaluate' in document) ||
('__driver_unwrapped' in document) || ('__webdriver_unwrapped' in document) ||
('__selenium_unwrapped' in document) || ('__fxdriver_unwrapped' in document) ||
('__webdriver_script_func' in document) || (document.documentElement.getAttribute("selenium") !== null) ||
(document.documentElement.getAttribute("webdriver") !== null) ||
(document.documentElement.getAttribute("driver") !== null)) rs = "SELENIUM_DRIVER";
else if (('callPhantom' in window) || ('_phantom' in window) || ('phantom' in window)) rs = "PHANTOM_PROPERTIES";
else if ('__stopAllTimers' in window) rs = "JSDOM_PROPERTIES";
else if ((/Firefox/.test(agentString)) && (!/Seamonkey/.test(agentString)) && (window.mozPaintCount === 'undefined')) rs = "FF_NO_PAINTCOUNT";
else if(vendor == "Brian Paul" && renderer == "Mesa OffScreen") rs = "HEADCHR_WEBGL";
else rs = "OK";
}
window.gtmCachedBotScore = rs;
if ((cache2Session == true) && (rs == "OK")) sessionStorage.setItem("sessionCachedBotScore", "OK");
return rs;
}
@Pazekal90
Copy link

Wir haben die Zeile bei unseren Kunden jetzt entfernt, und es funktioniert gut.

@mbaersch
Copy link
Author

Bot List Update, jetzt wieder synchron zu https://github.com/mbaersch/simple-bot-detector-web

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment