Last active
December 1, 2018 22:17
-
-
Save RevRagnarok/9aec1f2b3029b78165c2ac09230c4dc5 to your computer and use it in GitHub Desktop.
Block crazy IPs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
from __future__ import print_function | |
from collections import defaultdict | |
import os.path | |
import re | |
import sys | |
import urllib2 | |
from HTMLParser import HTMLParser | |
""" | |
This script will pull IP ranges out of the "pre" sections of websites and manipulate them | |
to generate iptables rules. The output is STDOUT; do with it what you need. | |
Released as GPL by RevRagnarok 2018 | |
""" | |
# The IP host pages | |
source_pages = ['https://www.wizcrafts.net/chinese-iptables-blocklist.html', | |
'https://www.wizcrafts.net/nigerian-iptables-blocklist.html', | |
'https://www.wizcrafts.net/russian-iptables-blocklist.html', | |
'https://www.wizcrafts.net/lacnic-iptables-blocklist.html', | |
] | |
# What you need before and after the IP (this example is OpenWRT): | |
pre_text = 'iptables -A input_rule --src' | |
post_text = '-j DROP' | |
# Comment style: | |
# 0 = none | |
# 1 = last comment found in line (looks nice but more memory taken) | |
# 2 = just filename | |
comment_style = 2 | |
# Comment prefix (if you want to easily be able to grep for statistics) | |
comment_prefix = 'XXX:' | |
#comment_prefix = '' | |
# Debugging | |
# If set to true, uses local copies of the files | |
local_debug = False | |
### End user-serviceable parts | |
comment_text = '-m comment --comment' if comment_style != 0 else '' | |
# Global regex for IP ranges | |
ip_re = re.compile("(\d+\.){3}\d(/\d+)?") | |
# Global regex for lines to skip | |
blank_re = re.compile('^\s*$') | |
# Global page being parsed (for comment style 2) | |
page_name = '' | |
def print_loop_start():
    """Emit the opening of a bash for-loop over IPs, with no trailing newline."""
    sys.stdout.write("for ip in ")
def print_loop_end(comment):
    """Close the bash loop opened by print_loop_start().

    Emits '; do', one iptables command using ${ip} (with the given
    comment flags appended), and the terminating 'done'.
    """
    rule = ' '.join((' ', pre_text, '${ip}', post_text, comment))
    print('; do')
    print(rule)
    print('done')
def print_ip_bash_loop(all_ips):
    """Write the collected IPs to stdout as bash for-loops.

    all_ips maps a comment string to a list of IP strings; each list is
    consumed (popped from the end) while its loop is being emitted. Long
    loops are split and long lines get bash continuations.
    """
    loop_limit = 100  # max IPs emitted per bash loop
    line_limit = 10   # max IPs per output line before a continuation
    for comment, ips in all_ips.items():
        # print("Found", len(ips), "IPs for '"+comment+"'", file=sys.stderr)
        print_loop_start()
        in_loop = 0
        on_line = 0
        while ips:
            on_line += 1
            in_loop += 1
            if in_loop > loop_limit:
                # Too many IPs for one loop: close it and open a fresh one.
                print_loop_end(comment)
                print_loop_start()
                in_loop = 0
                on_line = 0
            elif on_line > line_limit:
                # Bash line continuation, then indent the next line.
                print('\\')
                print(' ', end='')
                on_line = 1
            print(ips.pop(), '', end='')
        print_loop_end(comment)
def block_parser(data):
    """Parse one 'pre' block of text pulled from the HTML.

    Collects IP/CIDR lines into a dict keyed by their iptables comment
    flags, then hands the dict to print_ip_bash_loop(). Lines that are
    neither blank, comments, nor IP-like are reported on stderr.
    """
    grouped = defaultdict(list)
    # Style 2: derive the comment once from the page filename (module global).
    if comment_style == 2:
        comment = '"{0}{1}"'.format(comment_prefix,
                                    page_name.split('-')[0].capitalize())
    else:
        comment = ''
    # NOTE: one line in one file trips the parser because HTMLParser does not
    # like a bare '&'. Oh well.
    for line in data.splitlines():
        if blank_re.match(line):
            continue  # skip blank / whitespace-only lines
        if line[0] == '#':
            # Style 1: the most recent '#' comment labels the IPs below it.
            if comment_style == 1:
                cleaned = ''.join(c for c in line[1:] if c not in '#\'":').lstrip()
                cleaned = '"' + comment_prefix + cleaned
                comment = cleaned.replace("follow", '').replace("IP addresses", '').rstrip() + '"'
            continue
        if not ip_re.match(line):
            print('"'+line+'" does not look like an IP!', file=sys.stderr)
            continue
        # What remains is an IP/range we want; group it under its comment.
        key = comment_text + ' ' + comment if comment and comment_text else ''
        grouped[key].append(line)
    # Emit the IP/comment pairs as bash loops (original = 318719, final = 67181)
    print_ip_bash_loop(grouped)
class MyHTMLParser(HTMLParser):
    """HTML parser that forwards the text of every 'pre' element to block_parser()."""

    def __init__(self):
        # Old-style base-class call kept for Python 2 compatibility.
        HTMLParser.__init__(self)
        # True only while the most recent start tag was <pre>.
        self.in_pre = False

    def handle_starttag(self, tag, attrs):
        # Any start tag refreshes the flag; only <pre> turns it on.
        self.in_pre = (tag == 'pre')

    def handle_endtag(self, tag):
        # Any end tag leaves <pre> mode.
        self.in_pre = False

    def handle_data(self, data):
        if not self.in_pre:
            return
        block_parser(data)
# main: emit a shell script header, then process each source page.
print("#!/bin/sh")
parser = MyHTMLParser()
for page in source_pages:
    # block_parser() reads page_name (module global) for comment style 2.
    page_name = page.split('/')[-1]
    print("#", page_name, "follows:")
    if local_debug:
        # Debug mode: parse a previously downloaded local copy of the page.
        print("Faking", page, file=sys.stderr)
        with open(page_name) as f:
            parser.feed(f.read())
    else:
        print("Pulling", page, file=sys.stderr)
        # Close the HTTP response explicitly — urllib2 handles are not
        # context managers on Python 2, and the original leaked them.
        response = urllib2.urlopen(page)
        try:
            parser.feed(response.read())
        finally:
            response.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment