@nickali · Last active June 28, 2024 06:07
Block AI and SEO bots, scrapers, spiders, and crawlers with Cloudflare Pages Functions
/*
Cloudflare provides the ability to block some AI crawlers and
scrapers. They also maintain a list of bots approved to scan
sites: https://radar.cloudflare.com/traffic/verified-bots.
There are plenty of bots on the verified list I would like to
block, plus other SEO crawlers.
If you are hosting a site on Cloudflare Pages, here is a function
that will block a wider range of services. It checks the user
agent against a list and, if there is a match, returns a 404.
Otherwise, it delivers whatever file was requested.
Note: this will run on every single request. Cloudflare's free plan
allows 100,000 function and worker requests (total) every day.
Create a directory called functions at the root of your repository,
then create a file called _middleware.js in that directory and put
this code in it. The layout sketch below shows the result.
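A minimal sketch of the resulting repository layout (file names
other than functions/_middleware.js are placeholders for your own
site files):

  your-repo/
  ├── functions/
  │   └── _middleware.js   <- this file
  ├── index.html
  └── ...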
blockedUserAgents is the array of strings the user agent is compared
against. The list was gathered from a whole bunch of different
places. Feel free to modify it to suit your needs.
I'm also open to suggestions on how to make this faster (one idea is
sketched after the function below), but anecdotally I haven't noticed
any issues with speed.
*/
// Keeping the list at module scope means it is built once per isolate
// rather than on every request.
const blockedUserAgents = ["01h4x.com", "360Spider", "404checker", "404enemy", "80legs", "ADmantX", "AIBOT", "ALittle Client", "ASPSeek", "Abonti", "Aboundex", "Aboundexbot", "Acunetix", "AdsBot-Google", "AdsBot-Google-Mobile", "AdsTxtCrawlerTP", "AfD-Verbotsverfahren", "AhrefsBot", "AiHitBot", "Aipbot", "Alexibot", "AllSubmitter", "Alligator", "AlphaBot", "Amazonbot", "Anarchie", "Anarchy", "Anarchy99", "Ankit", "Anthill", "anthropic-ai", "Apexoo", "Aspiegel", "Asterias", "Attach", "AwarioBot", "AwarioRssBot", "AwarioSmartBot", "BBBike", "BDCbot", "BDFetch", "BLEXBot", "BackDoorBot", "BackStreet", "BackWeb", "Backlink-Ceck", "BacklinkCrawler", "Badass", "Bandit", "Barkrowler", "BatchFTP", "Battleztar Bazinga", "BetaBot", "Bigfoot", "Bitacle", "Black Hole", "BlackWidow", "Blackboard", "Blow", "BlowFish", "Boardreader", "Bolt", "BotALot", "Botify", "Brandprotect", "Brandwatch", "Buck", "Buddy", "BuiltBotTough", "BuiltWith", "Bullseye", "BunnySlippers", "BuzzSumo", "Bytespider", "CATExplorador", "CCBot", "CODE87", "CSHttp", "Calculon", "CazoodleBot", "Cegbfeieh", "CensysInspect", "ChatGPT-User", "CheTeam", "CheeseBot", "CherryPicker", "ChinaClaw", "Chlooe", "Citoid", "Claritybot", "Claude-Web", "ClaudeBot", "Cliqzbot", "Cloud mapping", "Cocolyzebot", "Cogentbot", "Collector", "Copier", "CopyRightCheck", "Copyscape", "Cosmos", "Craftbot", "Crawling at Home Project", "CrazyWebCrawler", "Crescent", "CriteoBot", "CrunchBot", "Curious", "Custo", "CyotekWebCopy", "DBLBot", "DIIbot", "DSearch", "DTS Agent", "DataCha0s", "DataForSeoBot", "DatabaseDriverMysqli", "Demon", "Deusu", "Devil", "Diffbot", "Digincore", "DigitalPebble", "Dirbuster", "Disco", "Discobot", "Discoverybot", "Dispatch", "DittoSpyder", "DnBCrawler-Analytics", "DnyzBot", "DomCopBot", "DomainAppender", "DomainCrawler", "DomainSigmaCrawler", "DomainStatsBot", "Domains Project", "Dotbot", "Download Wonder", "Dragonfly", "Drip", "ECCP/1.0", "EMail Siphon", "EMail Wolf", "EasyDL", "Ebingbong", "Ecxi", "EirGrabber", "EroCrawler", "Evil", "Exabot", "Express WebPictures", "ExtLinksBot", "Extractor", "ExtractorPro", "Extreme Picture Finder", "EyeNetIE", "Ezooms", "FDM", "FHscan", "FacebookBot", "FemtosearchBot", "Fimap", "Firefox/7.0", "FlashGet", "Flunky", "Foobot", "Freeuploader", "FrontPage", "Fuzz", "FyberSpider", "Fyrebot", "G-i-g-a-b-o-t", "GPTBot", "GT::WWW", "GalaxyBot", "Genieo", "GermCrawler", "GetRight", "GetWeb", "Getintent", "Gigabot", "Go!Zilla", "Go-Ahead-Got-It", "GoZilla", "Google-Extended", "GoogleOther", "Googlebot-Image", "Gotit", "GrabNet", "Grabber", "Grafula", "GrapeFX", "GrapeshotCrawler", "GridBot", "HEADMasterSEO", "HMView", "HTMLparser", "HTTP::Lite", "HTTrack", "Haansoft", "HaosouSpider", "Harvest", "Havij", "Heritrix", "Hloader", "HonoluluBot", "Humanlinks", "HybridBot", "IDBTE4M", "IDBot", "IRLbot", "Iblog", "Id-search", "IlseBot", "Image Fetch", "Image Sucker", "ImagesiftBot", "IndeedBot", "Indy Library", "InfoNaviRobot", "InfoTekies", "Information Security Team InfraSec Scanner", "InfraSec Scanner", "Intelliseek", "InterGET", "InternetMeasurement", "InternetSeer", "Internet Ninja", "Iria", "Iskanie", "IstellaBot", "JOC Web Spider", "JamesBOT", "Jbrofuzz", "JennyBot", "JetCar", "Jetty", "JikeSpider", "Joomla", "Jorgee", "JustView", "Jyxobot", "Kenjin Spider", "Keybot Translation-Search-Machine", "Keyword Density", "Kinza", "Kozmosbot", "LNSpiderguy", "LWP::Simple", "Lanshanbot", "Larbin", "Leap", "LeechFTP", "LeechGet", "LexiBot", "Lftp", "LibWeb", "Libwhisker", "LieBaoFast", "Lightspeedsystems", "Likse", 
"LinkScan", "LinkWalker", "Linkbot", "LinkextractorPro", "LinkpadBot", "LinksManager", "LinqiaMetadataDownloaderBot", "LinqiaRSSBot", "LinqiaScrapeBot", "Lipperhey", "Lipperhey Spider", "Litemage_walker", "Lmspider", "Ltx71", "MFC_Tear_Sample", "MIDown tool", "MIIxpc", "MJ12bot", "MQQBrowser", "MSFrontPage", "MSIECrawler", "MSNBot-Media", "AdIdxBot", "MTRobot", "Mag-Net", "Magnet", "Mail.RU_Bot", "Majestic SEO", "Majestic-SEO", "Majestic12", "MarkMonitor", "MarkWatch", "Mass Downloader", "Masscan", "Mata Hari", "MauiBot", "Mb2345Browser", "MeanPath Bot", "Meanpathbot", "Mediatoolkitbot", "Mediapartners-Google", "MegaIndex.ru", "Metauri", "MicroMessenger", "Microsoft Data Access", "Microsoft URL Control", "Minefield", "Mister PiX", "Moblie Safari", "Mojolicious", "MolokaiBot", "Morfeus Fucking Scanner", "Mozlila", "Mr.4x3", "Msrabot", "Musobot", "NICErsPRO", "NPbot", "Name Intelligence", "Nameprotect", "Navroad", "NearSite", "Needle", "Nessus", "NetAnts", "NetLyzer", "NetMechanic", "NetSpider", "NetZIP", "Net Vampire", "Netcraft", "Nettrack", "Netvibes", "NextGenSearchBot", "Nibbler", "Niki-bot", "Nikto", "NimbleCrawler", "Nimbostratus", "Ninja", "Nmap", "Nuclei", "Nutch", "Octopus", "Offline Explorer", "Offline Navigator", "Omgilibot", "OnCrawl", "OpenLinkProfiler", "OpenVAS", "Openfind", "Openvas", "OrangeBot", "OrangeSpider", "OutclicksBot", "OutfoxBot", "PECL::HTTP", "PHPCrawl", "POE-Component-Client-HTTP", "Page Analyzer", "PageGrabber", "PageScorer", "PageThing.com", "PageAnalyzer", "Pandalytics", "Panscient", "Papa Foto", "Pavuk", "PeoplePal", "PerplexityBot", "PiplBot", "Pi-Monster", "Picscout", "Picsearch", "PictureFinder", "Piepmatz", "Pimonster", "Pixray", "PleaseCrawl", "Pockey", "ProPowerBot", "ProWebWalker", "Probethenet", "Proximic", "Psbot", "Pu_iN", "Pump", "PxBroker", "PyCurl", "QueryN Metasearch", "Quick-Crawler", "RSSingBot", "Rainbot", "RankActive", "RankActiveLinkBot", "RankFlex", "RankingBot", "RankingBot2", "Rankivabot", "RankurBot", "Re-re", "ReGet", "RealDownload", "Reaper", "RebelMouse", "Recorder", "RedesScrapy", "RepoMonkey", "Ripper", "RocketCrawler", "Rogerbot", "SBIder", "SEOkicks", "SEOkicks-Robot", "SEOlyticsCrawler", "SEOprofiler", "SEOstats", "SISTRIX", "SMTBot", "SalesIntelligent", "ScanAlert", "Scanbot", "ScoutJet", "Scrapy", "Screaming", "Screaming Frog SEO Spider", "ScreenerBot", "ScrepyBot", "Searchestate", "SearchmetricsBot", "Seekport", "SeekportBot", "SemanticJuice", "Semrush", "SiteAuditBot", "SemrushBot-BA", "SemrushBot-SI", "SemrushBot-SWA", "SemrushBot-CT", "SplitSignalBot", "SemrushBot-COUB", "SemrushBot", "SentiBot", "SenutoBot", "SeoSiteCheckup", "SeobilityBot", "Seomoz", "Shodan", "Siphon", "SiteCheckerBotCrawler", "SiteExplorer", "SiteLockSpider", "SiteSnagger", "SiteSucker", "Site Sucker", "Sitebeam", "Siteimprove", "Sitevigil", "SlySearch", "SmartDownload", "Snake", "Snapbot", "Snoopy", "SocialRankIOBot", "Sociscraper", "Sogou web spider", "Sosospider", "Sottopop", "SpaceBison", "Spammen", "SpankBot", "Spanner", "Spbot", "Spinn3r", "SputnikBot", "Sqlmap", "Sqlworm", "Sqworm", "Steeler", "Storebot-Google", "Stripper", "Sucker", "Sucuri", "SuperBot", "SuperHTTP", "Surfbot", "SurveyBot", "Suzuran", "Swiftbot", "Szukacz", "T0PHackTeam", "T8Abot", "TechnicalSEOdotCom", "Teleport", "TeleportPro", "Telesoft", "Telesphoreo", "Telesphorep", "TheNomad", "The Intraformant", "Thumbor", "TightTwatBot", "TinyTestBot", "Titan", "Toata", "Toweyabot", "Tracemyfile", "Trendiction", "Trendictionbot", "True_Robot", "Turingos", "Turnitin", "TurnitinBot", 
"TwengaBot", "Twice", "Typhoeus", "URLy.Warning", "URLy Warning", "UnisterBot", "Upflow", "V-BOT", "VB Project", "VCI", "Vacuum", "Vagabondo", "VelenPublicWebCrawler", "VeriCiteCrawler", "VidibleScraper", "Virusdie", "VoidEYE", "Voil", "Voltron", "WASALive-Bot", "WBSearchBot", "WEBDAV", "WISENutbot", "WPScan", "WWW-Collector-E", "WWW-Mechanize", "WWW::Mechanize", "WWWOFFLE", "Wallpapers", "Wallpapers/3.0", "WallpapersHD", "WeSEE", "WebAuto", "WebBandit", "WebCollage", "WebCopier", "WebEnhancer", "WebFetch", "WebFuck", "WebGo IS", "WebImageCollector", "WebLeacher", "WebPix", "WebReaper", "WebSauger", "WebStripper", "WebSucker", "WebWhacker", "WebZIP", "Web Auto", "Web Collage", "Web Enhancer", "Web Fetch", "Web Fuck", "Web Pix", "Web Sauger", "Web Sucker", "Webalta", "WebmasterWorldForumBot", "Webshag", "WebsiteExtractor", "WebsiteQuester", "Website Quester", "Webster", "Whack", "Whacker", "Whatweb", "Who.is Bot", "Widow", "WinHTTrack", "WiseGuys Robot", "Wonderbot", "Woobot", "Wotbox", "Wprecon", "Xaldon WebSpider", "Xaldon_WebSpider", "Xenu", "YaK", "YouBot", "YoudaoBot", "Zade", "Zauba", "Zermelo", "Zeus", "Zitebot", "ZmEu", "ZoomBot", "ZoominfoBot", "ZumBot", "ZyBorg", "adscanner", "anthropic-ai", "arquivo-web-crawler", "arquivo.pt", "autoemailspider", "awario.com", "backlink-check", "cah.io.community", "check1.exe", "clark-crawler", "coccocbot", "cognitiveseo", "cohere-ai", "com.plumanalytics", "crawl.sogou.com", "crawler.feedback", "crawler4j", "dataforseo.com", "dataforseobot", "demandbase-bot", "domainsproject.org", "eCatch", "evc-batch", "facebookscraper", "gopher", "heritrix", "ia_archiver", "imagesift.com", "instabid", "internetVista monitor", "ips-agent", "isitwp.com", "iubenda-radar", "linkdexbot", "linkfluence", "lwp-request", "lwp-trivial", "magpie-crawler", "meanpathbot", "mediawords", "muhstik-scan", "netEstate NE Crawler", "oBot", "omgili", "openai", "openai.com", "page scorer", "pcBrowser", "peer39_crawler", "peer39_crawler/1.0", "plumanalytics", "polaris version", "probe-image-size", "ripz", "s1z.ru", "satoristudio.net", "scalaj-http", "scan.lol", "seobility", "seocompany.store", "seoscanners", "seostar", "serpstatbot", "sexsearcher", "sitechecker.pro", "siteripz", "sogouspider", "sp_auditbot", "spyfu", "sysscan", "tAkeOut", "trendiction.com", "trendiction.de", "ubermetrics-technologies.com", "voyagerx.com", "webgains-bot", "webmeup-crawler", "webpros.com", "webprosbot", "x09Mozilla", "x22Mozilla", "xpymep1.exe", "zauba.io", "zgrab"];
const blockRequest = async (context) => {
  // Default to an empty string so a request without a User-Agent header
  // can't throw a TypeError on .includes() below.
  const user_agent = context.request.headers.get("user-agent") || "";

  if (blockedUserAgents.some((v) => user_agent.includes(v))) {
    // Matched a blocked bot: respond as if the page doesn't exist.
    return new Response("Not found", { status: 404 });
  }

  // Otherwise serve the requested static asset as usual.
  const url = new URL(context.request.url);
  const asset = await context.env.ASSETS.fetch(url);
  return new Response(asset.body, asset);
};

export const onRequest = [blockRequest];
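/*
One possible speed-up, sketched here but not benchmarked: compile the list
into a single regular expression once at module scope, so each request does
one regex scan instead of up to a few hundred substring checks. To try it,
swap the blockedUserAgents.some(...) test inside blockRequest for
blockedPattern.test(user_agent). Matching stays case-sensitive, like the
original .includes() check. You can sanity-check blocking with e.g.
`curl -A "GPTBot" https://<your-site>.pages.dev/` (the hostname is a
placeholder); a blocked user agent should get a 404.
*/
const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // escape regex metacharacters
const blockedPattern = new RegExp(blockedUserAgents.map(escapeRegExp).join("|"));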