Skip to content

Instantly share code, notes, and snippets.

@jiaming0708
Created March 27, 2019 03:20
Show Gist options
  • Save jiaming0708/bce07678163eaa0fc2bf51e880b24c03 to your computer and use it in GitHub Desktop.
Save jiaming0708/bce07678163eaa0fc2bf51e880b24c03 to your computer and use it in GitHub Desktop.
Detect whether a user agent is a crawler
[
".*Java.*outbrain",
"008\\/",
"192\\.comAgent",
"2ip\\.ru",
"404checker",
"^bluefish ",
"^Calypso v\\/",
"^COMODO DCV",
"^DangDang",
"^DavClnt",
"^FDM ",
"^Grabber",
"^git\\/",
"^Goose\\/",
"^HTTPClient\\/",
"^Java\\/",
"^Jeode\\/",
"^Jetty\\/",
"^Mget",
"^Microsoft URL Control",
"^NG\\/[0-9\\.]",
"^NING\\/",
"^PHP\\/[0-9]",
"^RMA\\/",
"^Ruby|Ruby\\/[0-9]",
"^scrutiny\\/",
"^VSE\\/[0-9]",
"^WordPress\\.com",
"^XRL\\/[0-9]",
"^ZmEu",
"a3logics\\.in",
"A6-Indexer",
"a\\.pr-cy\\.ru",
"Abonti\\/",
"Aboundex",
"aboutthedomain",
"Accoona-AI-Agent",
"acoon",
"acrylicapps\\.com\\/pulp",
"adbeat",
"AddThis",
"ADmantX",
"adressendeutschland",
"adscanner\\/",
"Advanced Email Extractor v",
"agentslug",
"AHC",
"aihit",
"aiohttp\\/",
"Airmail",
"akka-http\\/",
"akula\\/",
"alertra",
"alexa site audit",
"Alibaba\\.Security\\.Heimdall",
"allloadin\\.com",
"alyze\\.info",
"amagit",
"AndroidDownloadManager",
"Anemone",
"AngleSharp\\/",
"Ant\\.com",
"Anturis Agent",
"AnyEvent-HTTP\\/",
"Apache Droid",
"ApacheBench\\/",
"Apache-HttpAsyncClient\\/",
"Apache-HttpClient\\/",
"APIs-Google",
"AportWorm\\/[0-9]",
"AppBeat\\/[0-9]",
"AppEngine-Google",
"Arachmo",
"arachnode",
"Arachnophilia",
"aria2",
"asafaweb.com",
"AskQuickly",
"Astute",
"asynchttp",
"autocite",
"Autonomy",
"axios\\/",
"B-l-i-t-z-B-O-T",
"^b0t$",
"Backlink-Ceck\\.de",
"Bad-Neighborhood",
"baidu\\.com",
"baypup\\/[0-9]",
"baypup\\/colbert",
"BazQux",
"BCKLINKS",
"BDFetch",
"BegunAdvertising\\/",
"BigBozz",
"biglotron",
"BingLocalSearch",
"BingPreview",
"binlar",
"biNu image cacher",
"biz_Directory",
"Blackboard Safeassign",
"Bloglovin",
"BlogPulseLive",
"BlogSearch",
"Blogtrottr",
"boitho\\.com-dc",
"BPImageWalker",
"Braintree-Webhooks",
"Branch Metrics API",
"Branch-Passthrough",
"Brodie\\/",
"Browsershots",
"BUbiNG",
"Buck\\/",
"Burf Search",
"Butterfly\\/",
"BuzzSumo",
"CAAM\\/[0-9]",
"CakePHP",
"CapsuleChecker",
"CaretNail",
"catexplorador",
"cb crawl",
"CC Metadata Scaper",
"Cerberian Drtrs",
"CERT\\.at-Statistics-Survey",
"cg-eye",
"changedetection",
"ChangesMeter\\/",
"Charlotte",
"CheckHost",
"checkprivacy",
"chkme\\.com",
"Chirp\\/[0-9]",
"CirrusExplorer\\/",
"CISPA Vulnerability Notification",
"Citoid",
"CJNetworkQuality",
"Clarsentia",
"clips\\.ua\\.ac\\.be",
"Cloud mapping experiment",
"CloudEndure",
"CloudFlare-AlwaysOnline",
"Cloudinary\\/[0-9]",
"cmcm\\.com",
"coccoc",
"colly -",
"CommaFeed",
"Commons-HttpClient",
"Comodo SSL Checker",
"contactbigdatafr",
"convera",
"copyright sheriff",
"Covario-IDS",
"CrawlForMe\\/[0-9]",
"cron-job\\.org",
"Crowsnest",
"curb",
"Curious George",
"curl",
"cuwhois\\/[0-9]",
"cybo\\.com",
"DareBoost",
"help@dataminr\\.com",
"DataparkSearch",
"dataprovider",
"Daum(oa)?[ \\/][0-9]",
"DeuSu",
"developers\\.google\\.com\\/\\+\\/web\\/snippet\\/",
"Digg",
"Dispatch\\/",
"dlvr",
"DMBrowser",
"DNS-Tools Header-Analyzer",
"DNSPod-reporting",
"docoloc",
"Dolphin http client\\/",
"DomainAppender",
"Donuts Content Explorer",
"dotMailer content retrieval",
"dotSemantic",
"downforeveryoneorjustme",
"downnotifier\\.com",
"DowntimeDetector",
"Dragonfly File Reader",
"drupact",
"Drupal \\(\\+http:\\/\\/drupal\\.org\\/\\)",
"dubaiindex",
"EARTHCOM",
"Easy-Thumb",
"ec2linkfinder",
"eCairn-Grabber",
"ECCP",
"echocrawl",
"eContext\\/",
"ElectricMonk",
"elefent",
"EMail Exractor",
"Email%20Extractor%20Lite",
"EmailWolf",
"Embed PHP Library",
"Embedly",
"europarchive\\.org",
"evc-batch\\/[0-9]",
"EventMachine HttpClient",
"Evidon",
"Evrinid",
"ExactSearch",
"ExaleadCloudview",
"Excel\\/",
"Exif Viewer",
"ExperianCrawlUK",
"Exploratodo",
"Express WebPictures",
"ezooms",
"facebookexternalhit",
"facebookplatform",
"fairshare",
"Faraday v",
"fasthttp",
"Faveeo",
"Favicon downloader",
"FavOrg",
"Feed Wrangler",
"Feedbin",
"FeedBooster",
"FeedBucket",
"FeedBunch\\/[0-9]",
"FeedBurner",
"FeedChecker",
"Feedly",
"Feedspot",
"Feedwind\\/[0-9]",
"feeltiptop",
"Fetch API",
"Fetch\\/[0-9]",
"Fever\\/[0-9]",
"findlink",
"findthatfile",
"FlipboardBrowserProxy",
"FlipboardProxy",
"FlipboardRSS",
"fluffy",
"flynxapp",
"forensiq",
"FoundSeoTool\\/[0-9]",
"free thumbnails",
"FreeWebMonitoring SiteChecker",
"Funnelback",
"G-i-g-a-b-o-t",
"g00g1e\\.net",
"GAChecker",
"ganarvisitas\\/[0-9]",
"geek-tools",
"Genderanalyzer",
"Genieo",
"GentleSource",
"GetLinkInfo",
"getprismatic\\.com",
"GetURLInfo\\/[0-9]",
"Ghost Inspector",
"GigablastOpenSource",
"GIS-LABS",
"github-camo",
"github\\.com\\/",
"Go [\\d\\.]* package http",
"Go-http-client",
"Go http package",
"gobyus",
"gofetch",
"GomezAgent",
"gooblog",
"Goodzer\\/[0-9]",
"Google favicon",
"Google Keyword Suggestion",
"Google Keyword Tool",
"Google Page Speed Insights",
"Google PP Default",
"Google Search Console",
"Google Web Preview",
"Google-Adwords",
"Google-Apps-Script",
"Google-Calendar-Importer",
"Google-HotelAdsVerifier",
"Google-HTTP-Java-Client",
"Google-Publisher-Plugin",
"Google-SearchByImage",
"Google-Site-Verification",
"Google-Structured-Data-Testing-Tool",
"Google-Youtube-Links",
"google_partner_monitoring",
"GoogleDocs",
"GoogleHC\\/",
"GoogleProducer",
"Gookey",
"GoScraper",
"GoSpotCheck",
"GoSquared-Status-Checker",
"gosquared-thumbnailer",
"grabify",
"Grammarly",
"grouphigh",
"grokkit",
"grub-client",
"gSOAP\\/",
"GTmetrix",
"GuzzleHttp",
"gvfs\\/",
"HAA(A)?RTLAND http client",
"hackney\\/",
"Hatena",
"hawkReader",
"HEADMasterSEO",
"HeartRails_Capture",
"heritrix",
"historious\\/",
"hledejLevne\\.cz\\/[0-9]",
"Holmes",
"HonesoSearchEngine\\/",
"HootSuite Image proxy",
"Hootsuite-WebFeed\\/[0-9]",
"HostTracker",
"ht:\\/\\/check",
"htdig",
"HTMLParser\\/",
"http-get",
"HTTP-Header-Abfrage",
"http-kit",
"http-request\\/",
"HTTP-Tiny",
"HTTP_Compression_Test",
"http_request2",
"http_requester",
"HttpComponents",
"httphr",
"HTTPMon",
"PEAR HTTPRequest",
"http\\.rb\\/",
"httpscheck",
"httpssites_power",
"httpunit",
"HttpUrlConnection",
"httrack",
"hosterstats",
"huaweisymantec",
"HubPages.*crawlingpolicy",
"HubSpot ",
"HyperZbozi.cz Feeder",
"i2kconnect\\/",
"ichiro",
"IdeelaborPlagiaat",
"IDG Twitter Links Resolver",
"IDwhois\\/[0-9]",
"Iframely",
"igdeSpyder",
"IlTrovatore",
"ImageEngine\\/",
"Imagga",
"imgsizer",
"InAGist",
"inbound\\.li parser",
"InDesign%20CC",
"infegy",
"infohelfer",
"InfoWizards Reciprocal Link System PRO",
"Instapaper",
"inpwrd\\.com",
"Integrity",
"integromedb",
"internet_archive",
"InternetSeer",
"internetVista monitor",
"intraVnews",
"IODC",
"IOI",
"iplabel",
"IPS\\/[0-9]",
"ips-agent",
"IPWorks HTTP\\/S Component",
"iqdb\\/",
"Irokez",
"isitup\\.org",
"iskanie",
"iZSearch",
"janforman",
"Jaunt\\/",
"Jersey\\/",
"Jigsaw",
"Jobboerse",
"JobFeed discovery",
"Jobg8 URL Monitor",
"jobo",
"Jobrapido",
"Jobsearch1\\.5",
"JoinVision Generic",
"JS-Kit",
"Kaspersky Lab CFR link resolver",
"KeepRight OpenStreetMap Checker",
"Kelny\\/",
"Kerrigan\\/",
"KeyCDN",
"Keyword Extractor",
"Keywords Research",
"KickFire",
"KimonoLabs\\/",
"Kml-Google",
"knows\\.is",
"KOCMOHABT",
"kouio",
"kulturarw3",
"KumKie",
"L\\.webis",
"Larbin",
"Lavf\\/",
"LayeredExtractor",
"letsencrypt",
"LibVLC",
"libwww",
"Licorne Image Snapshot",
"Liferea\\/",
"link checker",
"Link Valet",
"link_thumbnailer",
"LinkAlarm\\/",
"linkCheck",
"linkdex",
"LinkExaminer",
"linkfluence",
"linkpeek",
"LinkPreviewGenerator",
"LinkTiger",
"LinkWalker",
"Lipperhey",
"livedoor ScreenShot",
"LoadImpactPageAnalyzer",
"LoadImpactRload",
"LongURL API",
"looksystems\\.net",
"ltx71",
"lua-resty-http",
"lwp-trivial",
"lycos",
"LYT\\.SR",
"mabontland",
"MagpieRSS",
"Mail.Ru",
"MailChimp",
"makecontact\\/",
"Mandrill",
"MapperCmd",
"marketinggrader",
"masscan\\/[0-9]",
"Mediapartners-Google",
"MegaIndex\\.ru",
"Melvil Rawi\\/",
"MergeFlow-PageReader",
"Metaspinner",
"MetaURI",
"Microsearch",
"Microsoft-WebDAV-MiniRedir",
"Microsoft Data Access Internet Publishing Provider Protocol",
"Microsoft Office ",
"Microsoft Windows Network Diagnostics",
"Miniature.io\\/",
"Mindjet",
"Miniflux",
"mixdata dot com",
"mixed-content-scan",
"mixnode",
"Mnogosearch",
"mogimogi",
"Mojolicious \\(Perl\\)",
"monitis",
"Monitority\\/[0-9]",
"montastic",
"MonTools",
"Moreover",
"Morning Paper",
"mowser",
"MovableType",
"Mrcgiguy",
"mShots",
"MxToolbox\\/",
"MuckRack\\/",
"MVAClient",
"nagios",
"Najdi\\.si\\/",
"Needle\\/",
"NETCRAFT",
"NetLyzer FastProbe",
"Netpursual",
"netresearch",
"NetShelter ContentScan",
"Netsparker",
"NetTrack",
"Netvibes",
"Neustar WPM",
"NeutrinoAPI",
"NewRelicPinger\\/1.0 \\(\\d+\\)",
"NewsBlur .*Finder",
"NewsGator",
"newsme",
"newspaper\\/",
"Nexgate Ruby Client",
"NG-Search",
"nineconnections\\.com",
"NLNZ_IAHarvester",
"Nmap Scripting Engine",
"node-superagent",
"node-urllib\\/",
"node\\.io",
"nominet\\.org\\.uk",
"Norton-Safeweb",
"Notifixious",
"notifyninja",
"nuhk",
"nutch",
"Nuzzel",
"nWormFeedFinder",
"Nymesis",
"Ocelli\\/[0-9]",
"oegp",
"Offline Explorer",
"okhttp",
"Omea Reader",
"omgili",
"OMSC",
"Online Domain Tools",
"OpenCalaisSemanticProxy",
"Openstat\\/",
"OpenVAS",
"Optimizer",
"Orbiter",
"OrgProbe\\/[0-9]",
"orion-semantics",
"Owler",
"ow\\.ly",
"ownCloud News",
"OxfordCloudService\\/[0-9]",
"Page Analyzer",
"Page Valet",
"page2rss",
"page_verifier",
"PagePeeker",
"Pagespeed\\/[0-9]",
"Panopta",
"panscient",
"parsijoo",
"PayPal IPN",
"Pcore-HTTP",
"Pearltrees",
"peerindex",
"Peew",
"Perlu -",
"PhantomJS\\/",
"PhantomJS Screenshoter",
"Photon\\/",
"phpcrawl",
"phpservermon",
"Pi-Monster",
"ping\\.blo\\.gs\\/",
"Pingability",
"Pingdom",
"Pingoscope",
"PingSpot",
"pinterest\\.com",
"Pizilla",
"Ploetz \\+ Zeller",
"Plukkie",
"PocketParser",
"POE-Component-Client-HTTP\\/",
"Pompos",
"Porkbun",
"Port Monitor",
"postano",
"PostmanRuntime\\/",
"PostPost",
"postrank",
"PowerPoint\\/",
"Priceonomics Analysis Engine",
"PritTorrent\\/[0-9]",
"PrintFriendly\\.com",
"Prlog",
"probethenet",
"Project 25499",
"Promotion_Tools_www.searchenginepromotionhelp.com",
"prospectb2b",
"Protopage",
"proximic",
"PRTG Network Monitor",
"pshtt, https scanning",
"PTST ",
"PTST\\/[0-9]+",
"Pulsepoint XT3 web scraper",
"Python-httplib2",
"python-requests",
"Python-urllib",
"Qirina Hurdler",
"QQDownload",
"Qseero",
"QrafterPro",
"Qualidator.com SiteAnalyzer",
"Quora Link Preview",
"Qwantify",
"Radian6",
"RankSonicSiteAuditor",
"Readability",
"RealPlayer%20Downloader",
"RebelMouse",
"RecurPost\\/",
"redback\\/",
"Redirect Checker Tool",
"ReederForMac",
"request\\.js",
"ResponseCodeTest\\/[0-9]",
"RestSharp",
"RetrevoPageAnalyzer",
"Riddler",
"Rival IQ",
"Robosourcer",
"Robozilla\\/[0-9]",
"ROI Hunter",
"RPT-HTTPClient",
"RSSOwl",
"safe-agent-scanner",
"SalesIntelligent",
"Saleslift",
"SauceNAO",
"SBIder",
"scalaj-http",
"Scoop",
"scooter",
"ScoutJet",
"ScoutURLMonitor",
"Scrapy",
"ScreenShotService\\/[0-9]",
"Scrubby",
"search\\.thunderstone",
"Search37\\/",
"SearchSight",
"Seeker",
"semanticdiscovery",
"semanticjuice",
"Semiocast HTTP client",
"sentry\\/",
"SEO Browser",
"Seo Servis",
"seo-nastroj.cz",
"Seobility",
"SEOCentro",
"SeoCheck",
"SeopultContentAnalyzer",
"Server Density Service Monitoring",
"servernfo\\.com",
"SetCronJob\\/",
"Seznam screenshot-generator",
"Shelob",
"Shoppimon Analyzer",
"ShoppimonAgent\\/[0-9]",
"ShopWiki",
"ShortLinkTranslate",
"shrinktheweb",
"SilverReader",
"Sideqik",
"SimplePie",
"SimplyFast",
"Sitebulb\\/",
"SiteIndexed",
"Site-Shot\\/",
"Site24x7",
"SiteBar",
"SiteCondor",
"siteexplorer\\.info",
"SiteGuardian",
"Siteimprove\\.com",
"Sitemap(s)? Generator",
"SiteMonitor",
"Siteshooter B0t",
"SiteSucker",
"SiteTruth",
"sitexy\\.com",
"SkypeUriPreview",
"Slack\\/",
"slider\\.com",
"slurp",
"SMRF URL Expander",
"SMUrlExpander",
"Snappy",
"SniffRSS",
"sniptracker",
"Snoopy",
"SnowHaze Search",
"sogou web",
"SortSite",
"sovereign\\.ai",
"spaziodati",
"Specificfeeds",
"speedy",
"SPEng",
"Spinn3r",
"spray-can",
"Sprinklr ",
"sqlmap",
"spyonweb",
"Sqworm",
"SSL Labs",
"ssl-tools",
"StackRambler",
"Statastico\\/",
"StatusCake",
"Stratagems Kumo",
"Stroke.cz",
"StudioFACA",
"suchen",
"summify",
"Super Monitoring",
"Surphace Scout",
"SwiteScraper",
"Symfony2 BrowserKit",
"SynHttpClient-Built",
"Sysomos",
"Symfony BrowserKit",
"T0PHackTeam",
"Tarantula\\/",
"Taringa UGC",
"Tenon\\.io",
"teoma",
"terrainformatica\\.com",
"Test Certificate Info",
"Tetrahedron\\/[0-9]",
"Thinklab",
"The Drop Reaper",
"The Expert HTML Source Viewer",
"theinternetrules",
"theoldreader\\.com",
"Thumbshots",
"ThumbSniper",
"TinEye",
"Tiny Tiny RSS",
"TLSProbe\\/",
"topster",
"touche.com",
"Traackr.com",
"TrapitAgent",
"trendspottr\\.com",
"Trendsmap Resolver",
"truwoGPS",
"TulipChain",
"Twisted PageGetter",
"tweetedtimes\\.com",
"Tweetminster",
"Tweezler\\/",
"Twikle",
"Twingly",
"ubermetrics-technologies",
"uclassify",
"uCrawlr\\/",
"UdmSearch",
"UniversalFeedParser",
"Unshorten\\.It\\!\\/[0-9]",
"Untiny",
"UnwindFetchor",
"updated",
"updown\\.io daemon",
"Upflow",
"Uptimia",
"URL Verifier",
"URLChecker",
"URLitor.com",
"urlresolver",
"Urlstat",
"UrlTrends Ranking Updater",
"Vagabondo",
"vBSEO",
"via ggpht\\.com GoogleImageProxy",
"VidibleScraper\\/",
"visionutils",
"vkShare",
"voltron",
"voyager\\/",
"VSAgent\\/[0-9]",
"VSB-TUO\\/[0-9]",
"Vulnbusters Meter",
"VYU2",
"w3af\\.org",
"W3C-checklink",
"W3C-mobileOK",
"W3C_I18n-Checker",
"W3C_Unicorn",
"wangling",
"Wappalyzer",
"WatchMouse",
"WbSrch\\/",
"web-capture\\.net",
"Web-Monitoring",
"Web-sniffer",
"Webauskunft",
"WebCapture",
"WebClient\\/",
"webcollage",
"WebCookies",
"WebCopier",
"WebCorp",
"WebDoc",
"WebFetch",
"WebImages",
"WebIndex",
"webkit2png",
"webmastercoffee",
"webmon ",
"webscreenie",
"Webshot",
"Website Analyzer\\/",
"websitepulse agent",
"websitepulse[+ ]checker",
"Websnapr\\/",
"Webthumb\\/[0-9]",
"WebThumbnail",
"WeCrawlForThePeace",
"WeLikeLinks",
"WEPA",
"WeSEE",
"wf84",
"wget",
"WhatsApp",
"WhatsMyIP",
"WhatWeb",
"WhereGoes\\?",
"Whibse",
"WhoRunsCoinHive",
"Whynder Magnet",
"Windows-RSS-Platform",
"WinHttpRequest",
"wkhtmlto",
"wmtips",
"Woko",
"Word\\/",
"WordPress\\/",
"wotbox",
"WP Engine Install Performance API",
"wpif",
"wprecon\\.com survey",
"WPScan",
"wscheck",
"Wtrace",
"WWW-Mechanize",
"www\\.monitor\\.us",
"XaxisSemanticsClassifier",
"Xenu Link Sleuth",
"XING-contenttabreceiver\\/[0-9]",
"XmlSitemapGenerator",
"xpymep([0-9]?)\\.exe",
"Y!J-(ASR|BSC)",
"Yaanb",
"yacy",
"Yahoo Ad monitoring",
"Yahoo Link Preview",
"YahooCacheSystem",
"YahooYSMcm",
"YandeG",
"Yandex(?!Search)",
"yanga",
"yeti",
" YLT",
"Yo-yo",
"Yoleo Consumer",
"yoogliFetchAgent",
"YottaaMonitor",
"yourls\\.org",
"Your-Website-Sucks\/[0-9]",
"Zao",
"Zemanta Aggregator",
"Zend\\\\Http\\\\Client",
"Zend_Http_Client",
"zgrab",
"ZnajdzFoto",
"ZyBorg",
"[a-z0-9\\-_]*(bot|crawler|archiver|transcoder|spider|uptime|validator|fetcher)"
]
[
"Safari.[\\d\\.]*",
"Firefox.[\\d\\.]*",
"Chrome.[\\d\\.]*",
"Chromium.[\\d\\.]*",
"MSIE.[\\d\\.]",
"Opera\\/[\\d\\.]*",
"Mozilla.[\\d\\.]*",
"AppleWebKit.[\\d\\.]*",
"Trident.[\\d\\.]*",
"Windows NT.[\\d\\.]*",
"Android [\\d\\.]*",
"Macintosh.",
"Ubuntu",
"Linux",
"[ ]Intel",
"Mac OS X [\\d_]*",
"(like )?Gecko(.[\\d\\.]*)?",
"KHTML,",
"CriOS.[\\d\\.]*",
"CPU iPhone OS ([0-9_])* like Mac OS X",
"CPU OS ([0-9_])* like Mac OS X",
"iPod",
"compatible",
"x86_..",
"i686",
"x64",
"X11",
"rv:[\\d\\.]*",
"Version.[\\d\\.]*",
"WOW64",
"Win64",
"Dalvik.[\\d\\.]*",
" \\.NET CLR [\\d\\.]*",
"Presto.[\\d\\.]*",
"Media Center PC",
"BlackBerry",
"Build",
"Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.",
"Opera",
" \\.NET[\\d\\.]*",
"cubot",
";"
]
const exclude = require('./exclude.json');
const crawler = require('./crawler.json');
// Compiled on first use and then reused. The pattern lists are loaded once and
// treated as static, so joining every alternative into a fresh RegExp on each
// call is wasted work. (If the lists were ever mutated at runtime, the cached
// expressions would not pick up the change.)
let cachedCrawlerReg = null;
let cachedExcludeReg = null;

/**
 * Decide whether a User-Agent string matches one of the crawler patterns.
 *
 * Every fragment matching an `exclude` pattern is first stripped from the
 * string; the remainder is then tested, case-insensitively, against the
 * alternation of all `crawler` patterns.
 *
 * @param {string} userAgent - Raw User-Agent value to classify.
 * @returns {boolean} `true` when the stripped string matches a crawler
 *   pattern; `false` otherwise, including when `userAgent` is not a string.
 */
export function isCrawler (userAgent) {
  // A missing header typically arrives as undefined/null; report "not a
  // crawler" instead of throwing a TypeError on `.replace`.
  if (typeof userAgent !== 'string') {
    return false;
  }

  if (cachedCrawlerReg === null) {
    cachedCrawlerReg = new RegExp(`(${crawler.join('|')})`, 'i');
    // The `g` flag is needed so that replace() removes every excluded
    // fragment, not just the first. replace() resets lastIndex itself, so
    // sharing this global regex across calls is safe.
    cachedExcludeReg = new RegExp(`(${exclude.join('|')})`, 'gi');
  }

  const excludedUA = userAgent.replace(cachedExcludeReg, '');
  // No `g`/`y` flag on the crawler regex, so test() carries no state between calls.
  return cachedCrawlerReg.test(excludedUA);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment