Skip to content

Instantly share code, notes, and snippets.

@knwng
Forked from gwjwin/gfwlist2regex.py
Last active July 12, 2020 16:35
Show Gist options
  • Save knwng/94715f80dcf0aaaadc025c5739cdb555 to your computer and use it in GitHub Desktop.
Save knwng/94715f80dcf0aaaadc025c5739cdb555 to your computer and use it in GitHub Desktop.
Download GFWList and convert it to URL regexes that are compatible with Squid. Compatible with Python 3.
#!/usr/bin/env python
#encoding: utf-8
import urllib.request
import re
from base64 import b64decode
# URL that main() downloads; its response body is base64-decoded there.
LIST_URL = 'https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt'
# File that receives the decoded list text as written by main().
DECODE_FILE = 'decode.txt'
# Output file for regexes built from rules that do not start with '@@'.
BLACK_FILE = 'gfw.url_regex.lst'
# Output file for regexes built from rules that start with '@@'.
WHITE_FILE = 'cn.url_regex.lst'
def _escape_rule(rule, wildcard='.*'):
    """Turn the literal part of a rule into a regex fragment.

    Every ``http://`` and ``https://`` occurrence is removed, regex
    metacharacters are escaped, and each ``*`` of the rule is then
    replaced by *wildcard*.
    """
    rule = rule.replace('http://', '').replace('https://', '')
    # re.escape() turns '*' into r'\*'; swap that back for the wildcard.
    return re.escape(rule).replace(r'\*', wildcard)


def convert_line(line):
    """Convert a single GFWList rule into a URL regex string.

    Trailing whitespace is stripped first. The rule is then handled by
    its markers:

    * ``/.../``  -- already a regex; the delimiters are removed and a
      leading ``^https?:\\/\\/[^\\/]+`` is rewritten to ``^[^\\/]+``.
    * ``||rule`` -- result is ``^[^\\/]*`` + escaped rule, with ``*``
      becoming ``(.*)``.
    * ``|rule``  -- result is ``^`` + escaped rule.
    * ``rule|``  -- result is escaped rule + ``$``.
    * anything else -- the escaped rule, unanchored.

    In all non-regex cases ``http://`` / ``https://`` are removed before
    escaping and ``*`` acts as a wildcard.

    Args:
        line: One rule line (without any leading ``@@`` marker).

    Returns:
        The regex as a string. An empty or whitespace-only *line* yields
        an empty string.
    """
    line = line.rstrip()
    # startswith()/endswith() are used instead of line[0]/line[-1] so an
    # empty line falls through to the plain case rather than raising
    # IndexError.
    if line.startswith('/') and line.endswith('/'):
        # Regex already: only drop the delimiters and the scheme prefix.
        return line[1:-1].replace(r'^https?:\/\/[^\/]+', r'^[^\/]+')
    if line.startswith('||'):
        return r'^[^\/]*' + _escape_rule(line[2:], '(.*)')
    if line.startswith('|'):
        return '^' + _escape_rule(line[1:])
    if line.endswith('|'):
        return _escape_rule(line[:-1]) + '$'
    return _escape_rule(line)
def convert(gfwlist):
    """Write the regex lists derived from decoded GFWList text.

    Each line of *gfwlist* is converted with ``convert_line``. Rules that
    start with ``@@`` are written (without that marker) to ``WHITE_FILE``;
    all other rules go to ``BLACK_FILE``. Lines that are empty or start
    with ``!`` or ``[`` are skipped. Both files are overwritten.

    Args:
        gfwlist: The decoded list as one string, rules separated by
            newlines.
    """
    # UTF-8 is set explicitly so the output does not depend on the
    # platform's default encoding.
    with open(BLACK_FILE, 'w', encoding='utf-8') as black, \
            open(WHITE_FILE, 'w', encoding='utf-8') as white:
        for raw_line in gfwlist.split('\n'):
            # Strip trailing whitespace (including the '\r' of CRLF input)
            # before testing for emptiness, so blank-looking lines are
            # skipped instead of being passed on to convert_line().
            rule = raw_line.rstrip()
            if not rule or rule.startswith(('!', '[')):
                continue
            if rule.startswith('@@'):
                target = white
                rule = rule[2:]
                if not rule:
                    # A bare '@@' carries no rule to convert.
                    continue
            else:
                target = black
            pattern = convert_line(rule)
            # An empty regex matches any string, so never emit one.
            if pattern:
                target.write(pattern + '\n')
def main():
    """Download GFWList, save the decoded text, and write the regex lists.

    The body fetched from ``LIST_URL`` is base64-decoded and interpreted
    as UTF-8. The decoded text is written to ``DECODE_FILE`` and then
    passed to ``convert``.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(LIST_URL) as response:
        encoded = response.read()
    src = b64decode(encoded).decode('utf-8')
    # The text was decoded as UTF-8, so write it back as UTF-8 rather
    # than relying on the platform's default encoding.
    with open(DECODE_FILE, 'w', encoding='utf-8') as decode:
        decode.write(src)
    convert(src)


# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment