Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Bot Detection Script. Works with Apache and Nginx Log Files.
# Refresh the package index, then install the Python headers, pip and
# virtualenv from the distribution packages.
sudo apt-get update
sudo apt-get install \
python-dev \
python-pip \
python-virtualenv
# Create an isolated environment named "findbots" and activate it so the
# pip install below stays local to it.
virtualenv findbots
source findbots/bin/activate
# Fetch and unpack the GeoLite2 city database. The script opens
# 'GeoLite2-City.mmdb' by relative path, so run it from this directory.
curl -O http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz
gunzip GeoLite2-City.mmdb.gz
# Install the two parsers from their Git repositories (editable mode) plus
# the remaining packages the script imports.
pip install -e git+https://github.com/rory/apache-log-parser.git#egg=apache-log-parser \
-e git+https://github.com/selwin/python-user-agents.git#egg=python-user-agents \
colorama \
geoip2 \
netaddr
import sys
from urlparse import urlparse
import apache_log_parser
from colorama import Back, Style
import geoip2.database
from netaddr import IPNetwork, IPAddress
from user_agents import parse
# GeoIP database handle, opened once at import time. The path is relative,
# so the .mmdb file must be in the current working directory.
reader = geoip2.database.Reader('GeoLite2-City.mmdb')
# Log line layout handed to apache_log_parser; this directive string is the
# Apache "combined" log format (host, ident, user, time, request line,
# status, bytes, Referer header, User-Agent header).
_format = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
# Callable that turns one raw log line into a dict of parsed fields.
line_parser = apache_log_parser.make_parser(_format)
# Maps a short network label to a list of IPv4 CIDR ranges.
# bot_test() treats a request as a bot when its remote host falls inside any
# of these ranges, and the main loop prints the matching label(s) followed by
# ' IP' as the IP owner.
# Some ranges overlap (e.g. '52.192.0.0/11' and '52.192.0.0/15'); that is
# harmless because membership is an any-match test.
# NOTE(review): 'DO' and 'Dream' presumably stand for DigitalOcean and
# DreamHost -- confirm before renaming, since the labels appear in the output.
CIDRS = {
'Amazon': ['107.20.0.0/14', '122.248.192.0/19', '122.248.224.0/19',
'172.96.96.0/20', '174.129.0.0/16', '175.41.128.0/19',
'175.41.160.0/19', '175.41.192.0/19', '175.41.224.0/19',
'176.32.120.0/22', '176.32.72.0/21', '176.34.0.0/16',
'176.34.144.0/21', '176.34.224.0/21', '184.169.128.0/17',
'184.72.0.0/15', '185.48.120.0/26', '207.171.160.0/19',
'213.71.132.192/28', '216.182.224.0/20', '23.20.0.0/14',
'46.137.0.0/17', '46.137.128.0/18', '46.51.128.0/18',
'46.51.192.0/20', '50.112.0.0/16', '50.16.0.0/14', '52.0.0.0/11',
'52.192.0.0/11', '52.192.0.0/15', '52.196.0.0/14',
'52.208.0.0/13', '52.220.0.0/15', '52.28.0.0/16', '52.32.0.0/11',
'52.48.0.0/14', '52.64.0.0/12', '52.67.0.0/16', '52.68.0.0/15',
'52.79.0.0/16', '52.80.0.0/14', '52.84.0.0/14', '52.88.0.0/13',
'54.144.0.0/12', '54.160.0.0/12', '54.176.0.0/12',
'54.184.0.0/14', '54.188.0.0/14', '54.192.0.0/16',
'54.193.0.0/16', '54.194.0.0/15', '54.196.0.0/15',
'54.198.0.0/16', '54.199.0.0/16', '54.200.0.0/14',
'54.204.0.0/15', '54.206.0.0/16', '54.207.0.0/16',
'54.208.0.0/15', '54.210.0.0/15', '54.212.0.0/15',
'54.214.0.0/16', '54.215.0.0/16', '54.216.0.0/15',
'54.218.0.0/16', '54.219.0.0/16', '54.220.0.0/16',
'54.221.0.0/16', '54.224.0.0/12', '54.228.0.0/15',
'54.230.0.0/15', '54.232.0.0/16', '54.234.0.0/15',
'54.236.0.0/15', '54.238.0.0/16', '54.239.0.0/17',
'54.240.0.0/12', '54.242.0.0/15', '54.244.0.0/16',
'54.245.0.0/16', '54.247.0.0/16', '54.248.0.0/15',
'54.250.0.0/16', '54.251.0.0/16', '54.252.0.0/16',
'54.253.0.0/16', '54.254.0.0/16', '54.255.0.0/16',
'54.64.0.0/13', '54.72.0.0/13', '54.80.0.0/12', '54.72.0.0/15',
'54.79.0.0/16', '54.88.0.0/16', '54.93.0.0/16', '54.94.0.0/16',
'63.173.96.0/24', '72.21.192.0/19', '75.101.128.0/17',
'79.125.64.0/18', '96.127.0.0/17'],
'Baidu': ['180.76.0.0/16', '119.63.192.0/21', '106.12.0.0/15',
'182.61.0.0/16'],
'DO': ['104.131.0.0/16', '104.236.0.0/16', '107.170.0.0/16',
'128.199.0.0/16', '138.197.0.0/16', '138.68.0.0/16',
'139.59.0.0/16', '146.185.128.0/21', '159.203.0.0/16',
'162.243.0.0/16', '178.62.0.0/17', '178.62.128.0/17',
'188.166.0.0/16', '188.166.0.0/17', '188.226.128.0/18',
'188.226.192.0/18', '45.55.0.0/16', '46.101.0.0/17',
'46.101.128.0/17', '82.196.8.0/21', '95.85.0.0/21', '95.85.32.0/21'],
'Dream': ['173.236.128.0/17', '205.196.208.0/20', '208.113.128.0/17',
'208.97.128.0/18', '67.205.0.0/18'],
'Google': ['104.154.0.0/15', '104.196.0.0/14', '107.167.160.0/19',
'107.178.192.0/18', '108.170.192.0/20', '108.170.208.0/21',
'108.170.216.0/22', '108.170.220.0/23', '108.170.222.0/24',
'108.59.80.0/20', '130.211.128.0/17', '130.211.16.0/20',
'130.211.32.0/19', '130.211.4.0/22', '130.211.64.0/18',
'130.211.8.0/21', '146.148.16.0/20', '146.148.2.0/23',
'146.148.32.0/19', '146.148.4.0/22', '146.148.64.0/18',
'146.148.8.0/21', '162.216.148.0/22', '162.222.176.0/21',
'173.255.112.0/20', '192.158.28.0/22', '199.192.112.0/22',
'199.223.232.0/22', '199.223.236.0/23', '208.68.108.0/23',
'23.236.48.0/20', '23.251.128.0/19', '35.184.0.0/14',
'35.188.0.0/15', '35.190.0.0/17', '35.190.128.0/18',
'35.190.192.0/19', '35.190.224.0/20', '8.34.208.0/20',
'8.35.192.0/21', '8.35.200.0/23',],
'Hetzner': ['129.232.128.0/17', '129.232.156.128/28', '136.243.0.0/16',
'138.201.0.0/16', '144.76.0.0/16', '148.251.0.0/16',
'176.9.12.192/28', '176.9.168.0/29', '176.9.24.0/27',
'176.9.72.128/27', '178.63.0.0/16', '178.63.120.64/27',
'178.63.156.0/28', '178.63.216.0/29', '178.63.216.128/29',
'178.63.48.0/26', '188.40.0.0/16', '188.40.108.64/26',
'188.40.132.128/26', '188.40.144.0/24', '188.40.48.0/26',
'188.40.48.128/26', '188.40.72.0/26', '196.40.108.64/29',
'213.133.96.0/20', '213.239.192.0/18', '41.203.0.128/27',
'41.72.144.192/29', '46.4.0.128/28', '46.4.192.192/29',
'46.4.84.128/27', '46.4.84.64/27', '5.9.144.0/27',
'5.9.192.128/27', '5.9.240.192/27', '5.9.252.64/28',
'78.46.0.0/15', '78.46.24.192/29', '78.46.64.0/19',
'85.10.192.0/20', '85.10.228.128/29', '88.198.0.0/16',
'88.198.0.0/20'],
'Linode': ['104.200.16.0/20', '109.237.24.0/22', '139.162.0.0/16',
'172.104.0.0/15', '173.255.192.0/18', '178.79.128.0/21',
'198.58.96.0/19', '23.92.16.0/20', '45.33.0.0/17',
'45.56.64.0/18', '45.79.0.0/16', '50.116.0.0/18',
'80.85.84.0/23', '96.126.96.0/19'],
}
def in_block(ip, block):
    """Return True if *ip* lies inside at least one CIDR range in *block*.

    Args:
        ip: Address to test, in any form accepted by ``netaddr.IPAddress``.
        block: Iterable of CIDR strings (e.g. ``'10.0.0.0/8'``).

    Returns:
        bool: True on the first matching range, False if none match or
        *block* is empty.
    """
    _ip = IPAddress(ip)
    # A generator lets any() stop at the first hit instead of first building
    # an IPNetwork for every range in the block.
    return any(_ip in IPNetwork(cidr) for cidr in block)
def bot_test(req, agent):
    """Decide whether a parsed log entry looks like automated traffic.

    A request counts as a bot when any one of these holds:

    * the parsed user agent already reports ``is_bot``;
    * the lower-cased user-agent string contains one of the tokens below;
    * the remote host falls inside any network listed in ``CIDRS``.

    Args:
        req: Parsed log entry; only ``req['remote_host']`` is read.
        agent: Parsed user agent exposing ``is_bot`` and ``ua_string``.

    Returns:
        A truthy value for bots, a falsy one otherwise.
    """
    ua_tokens = ('daum/',  # Daum Communications Corp.
                 'gigablastopensource',
                 'go-http-client',
                 'http://',
                 'httpclient',
                 'https://',
                 'libwww-perl',
                 'phantomjs',
                 'proxy',
                 'python',
                 'sitesucker',
                 'wada.vn',
                 'webindex',
                 'wget')

    if agent.is_bot:
        return agent.is_bot

    # Lower-case once rather than once per token.
    ua = agent.ua_string.lower()

    # The substring scan is far cheaper than the CIDR scan (which builds an
    # IPNetwork per range), so it runs first; the combined result is the same.
    if any(token in ua for token in ua_tokens):
        return True

    remote_host = req['remote_host']
    return any(in_block(remote_host, cidrs) for cidrs in CIDRS.values())
if __name__ == '__main__':
    # Read access-log lines from stdin and print one summary line per
    # request: bot/human flag, country, city, request path, a compact
    # user-agent description and the label of any known network that owns
    # the remote IP. Bot lines are printed on a red background.
    while True:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break

        if not line:
            break  # readline() returns '' at end of input

        try:
            req = line_parser(line)
        except apache_log_parser.LineDoesntMatchException:
            # One malformed line should not abort the whole stream.
            sys.stderr.write('skipping unparseable line: %r\n' % line)
            continue

        agent = parse(req['request_header_user_agent'])
        uri = urlparse(req['request_url'])

        # Geo lookup is best-effort. Catch Exception rather than using a
        # bare except so Ctrl-C and SystemExit still propagate.
        try:
            response = reader.city(req['remote_host'])
            country, city = response.country.iso_code, response.city.name
        except Exception:
            country, city = None, None

        is_bot = bot_test(req, agent)

        # Concatenate the first three fields of browser, device and os,
        # dropping None, tuple-valued fields, blank strings and the
        # placeholder 'Other'.
        agent_parts = agent.browser[0:3] + agent.device[0:3] + agent.os[0:3]
        agent_str = ''.join(item
                            for item in agent_parts
                            if item is not None and
                            not isinstance(item, tuple) and
                            item.strip() and
                            item != 'Other')

        ip_owner_str = ', '.join(network + ' IP'
                                 for network, cidr in CIDRS.items()
                                 if in_block(req['remote_host'], cidr))

        # Python 2 print statement, kept as-is. The conditional binds looser
        # than '+', so only bot lines get the red-background prefix.
        print (Back.RED + 'b') if is_bot else 'h', \
            country, \
            city, \
            uri.path, \
            agent_str, \
            ip_owner_str, \
            Style.RESET_ALL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment