@meoow
Last active August 1, 2016 16:20
After looking around at existing implementations for converting gfwlist to a PAC file, surprisingly few of them produced results that really satisfied me, so I eventually ended up writing an arguably better version for the task.
#!/usr/bin/env python2.7
# This script produces a working PAC file from gfwlist rules.
# Either pass -g to fetch and decode gfwlist.txt automatically, or download
# gfwlist.txt yourself, decode it from base64, and pass the decoded file as
# an argument; see the command-line options at the bottom of the script.
# Run the script in a terminal.
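# Example invocation (illustrative only; the script file name and the port are
# placeholders, the options themselves are defined in the argparse section below):
#   python2.7 gfwlist2pac.py -g -t SOCKS -a 127.0.0.1 -p 1080 -o autoconfig.pac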
import re
import urlparse
import argparse
from collections import defaultdict
import json
import string
import sys
import os
import urllib2
import base64
from itertools import chain
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# adjust this address if the gfwlist location changes
GFWLIST_URL = 'https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt'
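# Note: the file served at GFWLIST_URL is base64-encoded; main() below decodes
# it with base64.b64decode before parsing.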
#{{{ shExpMatch
SHEXPMATCH = '''
function shExpMatch(url, pat) {
    var pcharcode0;
    var ucharcode0;
    var pcharcode1;
    if (pat.length === 0) {
        if (url.length === 0) {
            return true;
        } else {
            return false;
        }
    }
    if (pat.length === url.length &&
            pat.indexOf('*') < 0 &&
            pat.indexOf('?') < 0) {
        return pat === url;
    }
    ucharcode0 = url.charCodeAt(0);
    pcharcode0 = pat.charCodeAt(0);
    if (pcharcode0 === 42) { // pat[0] === '*'
        pcharcode1 = pat.charCodeAt(1);
        if (isNaN(pcharcode1)) {
            return true;
        } else if (pcharcode1 === 42) { // pat[1] === '*' skip continuous '*'
            return shExpMatch(url, pat.substr(1));
        } else {
            if (url.length === 0) {
                return false;
            }
            if (pcharcode1 === ucharcode0 || pcharcode1 === 63) {
                if (shExpMatch(url.substr(1), pat)) {
                    return true;
                } else {
                    return shExpMatch(url.substr(1), pat.substr(2));
                }
            } else {
                return shExpMatch(url.substr(1), pat);
            }
        }
    } else if (pcharcode0 === 63) {
        if (url.length === 0) {
            return false;
        }
        return shExpMatch(url.substr(1), pat.substr(1));
    } else {
        if (url.length === 0) {
            return false;
        }
        if (ucharcode0 === pcharcode0) {
            return shExpMatch(url.substr(1), pat.substr(1));
        } else {
            return false;
        }
    }
    return false;
}
'''
#}}}
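# The shExpMatch above is a recursive shell-style wildcard matcher: '*' matches
# any run of characters and '?' matches exactly one. It is only written into the
# PAC file when -s is given, for hosts whose built-in shExpMatch is slow or
# unreliable. Roughly the same semantics in Python, for illustration only
# (not used by this script):
#   from fnmatch import fnmatchcase
#   fnmatchcase('www.example.com', '*.example.com')  # True
#   fnmatchcase('example.com', '*.example.com')      # False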
# top-level domains #{{{
SUFFIX_LIST = (
'.ac','.ad','.ae','.aero','.af','.ag','.ai','.al',
'.am','.an','.ao','.aq','.ar','.arpa','.as','.asia',
'.at','.au','.aw','.ax','.az','.ba','.bb','.be',
'.bf','.bg','.bh','.bi','.biz','.bj','.bm','.bo',
'.br','.bs','.bt','.bv','.bw','.by','.bz','.ca',
'.cat','.cc','.cd','.cf','.cg','.ch','.ci','.cl',
'.cm','.cn','.co','.com','.coop','.cr','.cu','.cv',
'.cw','.cx','.cz','.de','.dj','.dk','.dm','.do',
'.dz','.ec','.edu','.ee','.eg','.es','.eu','.fi',
'.fm','.fo','.fr','.ga','.gb','.gd','.ge','.gf',
'.gg','.gh','.gi','.gl','.gm','.gn','.gov','.gp',
'.gq','.gr','.gs','.gt','.gw','.gy','.hk','.hm',
'.hn','.hr','.ht','.hu','.id','.ie','.im','.in',
'.info','.int','.io','.iq','.ir','.is','.it','.je',
'.jo','.jobs','.jp','.kg','.ki','.km','.kn','.kp',
'.kr','.ky','.kz','.la','.lb','.lc','.li','.lk',
'.lr','.ls','.lt','.lu','.lv','.ly','.ma','.mc',
'.md','.me','.mg','.mh','.mil','.mk','.ml','.mn',
'.mo','.mobi','.mp','.mq','.mr','.ms','.mt','.mu',
'.museum','.mv','.mw','.mx','.my','.na','.name','.nc',
'.ne','.net','.nf','.ng','.nl','.no','.nr','.nu',
'.om','.org','.pa','.pe','.pf','.ph','.pk','.pl',
'.pm','.pn','.post','.pr','.pro','.ps','.pt','.pw',
'.py','.qa','.re','.ro','.rs','.ru','.rw','.sa',
'.sb','.sc','.sd','.se','.sg','.sh','.si','.sj',
'.sk','.sl','.sm','.sn','.so','.sr','.st','.su',
'.sv','.sx','.sy','.sz','.tc','.td','.tel','.tf',
'.tg','.th','.tj','.tk','.tl','.tm','.tn','.to',
'.tp','.travel','.tt','.tv','.tw','.tz','.ua','.ug',
'.us','.uy','.uz','.va','.vc','.ve','.vg','.vi',
'.vn','.vu','.wf','.ws','.yt','.xxx','.bzh','.rio',
'.camera','.clothing','.lighting','.singles','.ventures','.voyage',
'.guru','.holdings','.equipment','.bike','.estate','.tattoo',
'.land','.plumbing','.contractors','.sexy','.menu','.uno',
'.gallery','.technology','.reviews','.guide','.graphics','.construction',
'.onl','.diamonds','.kiwi','.enterprises','.today','.futbol',
'.photography','.tips','.directory','.kitchen','.kim','.monash',
'.wed','.pink','.ruhr','.buzz','.careers','.shoes',
'.career','.otsuka','.gift','.recipes','.coffee','.luxury',
'.domains','.photos','.limo','.viajes','.wang','.democrat',
'.mango','.cab','.support','.dance','.nagoya','.computer',
'.wien','.berlin','.codes','.email','.repair','.holiday',
'.center','.systems','.wiki','.ceo','.international','.solar',
'.company','.education','.training','.academy','.marketing','.florist',
'.solutions','.build','.institute','.builders','.red','.blue',
'.ninja','.business','.gal','.social','.house','.camp',
'.immobilien','.moda','.glass','.management','.kaufen','.farm',
'.club','.voting','.tokyo','.moe','.guitars','.bargains',
'.desi','.cool','.boutique','.pics','.cheap','.photo',
'.network','.zone','.link','.qpon','.agency','.tienda',
'.works','.london','.watch','.rocks','.shiksha','.budapest',
'.nrw','.vote','.fishing','.expert','.horse','.christmas',
'.cooking','.casa','.rich','.voto','.tools','.praxi',
'.events','.flights','.report','.partners','.neustar','.rentals',
'.catering','.community','.maison','.parts','.cleaning','.okinawa',
'.foundation','.properties','.vacations','.productions','.industries','.haus',
'.vision','.mormon','.cards','.ink','.villas','.consulting',
'.cruises','.krd','.xyz','.dating','.exposed','.condos',
'.eus','.caravan','.actor','.saarland','.yokohama','.pub',
'.ren','.fish','.bar','.dnp','.bid','.supply',
'.miami','.supplies','.quebec','.moscow','.globo','.axa',
'.vodka','.rest','.frogans','.wtc','.rodeo','.sohu',
'.best','.country','.kred','.feedback','.work','.luxe',
'.ryukyu','.autos','.homes','.jetzt','.yachts','.motorcycles',
'.mini','.ggee','.beer','.college','.ovh','.meet',
'.gop','.blackfriday','.lacaixa','.vegas','.black','.soy',
'.trade','.gent','.ing','.dad','.shriram','.bayern',
'.scot','.webcam','.foo','.eat','.nyc','.prod',
'.how','.day','.meme','.mov','.paris','.boo',
'.new','.ifm','.life','.archi','.spiegel','.brussels',
'.church','.here','.dabur','.vlaanderen','.cologne','.wme',
'.nhk','.suzuki','.whoswho','.scb','.hamburg','.services')
#}}}
# match list #{{{
PAC_VAR = '''
var PROXY = '${proxy}';
var DLIST = ${domains};
var SEARCH = ${search};
var WHITELIST_TLD = ['cn', 'lan', 'local', 'xn--fiqs8s'];'''
#}}}
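# PAC_VAR is filled in by string.Template in main(): ${proxy} becomes the
# "PROXY host:port" or "SOCKS host:port" string, ${domains} the JSON DLIST
# lookup table, and ${search} the JSON list of URL patterns. Hosts whose TLD
# is listed in WHITELIST_TLD always go DIRECT.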
# functions #{{{
PAC_FUNC = '''
function FindProxyForURL(url, host) {
    host = host.toLowerCase();
    if (/^\d+\.\d+\.\d+\.\d+$/g.test(host) ||
            host === 'localhost') {
        return "DIRECT";
    }
    var xs = host.split('.');
    if (xs.length === 1) {
        return 'DIRECT';
    }
    var suffix = xs[xs.length-1];
    if (WHITELIST_TLD.indexOf(suffix) != -1) {
        return "DIRECT";
    }
    if (suffix in DLIST) {
        var largeTLD = ['com', 'org', 'net'];
        if (largeTLD.indexOf(suffix) != -1) {
            var ssuffix = xs[xs.length-2];
            if (ssuffix.length === 0) {
                return 'DIRECT';
            }
            ssuffix = ssuffix[ssuffix.length-1];
            if (ssuffix in DLIST[suffix]) {
                for (var i = 0; i < DLIST[suffix][ssuffix].length; ++i) {
                    if (shExpMatch(host, DLIST[suffix][ssuffix][i])) {
                        return PROXY;
                    }
                }
            }
        } else {
            for (var i = 0; i < DLIST[suffix].length; ++i) {
                if (shExpMatch(host, DLIST[suffix][i])) {
                    return PROXY;
                }
            }
        }
    }
    for (var i = 0; i < DLIST['unknown'].length; ++i) {
        if (shExpMatch(host, DLIST['unknown'][i])) {
            return PROXY;
        }
    }
    for (var i = 0; i < SEARCH.length; ++i) {
        if (shExpMatch(url, SEARCH[i])) {
            return PROXY;
        }
    }
    return 'DIRECT';
}
'''
#}}}
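# FindProxyForURL buckets the blocked patterns by TLD so that only a small
# slice of the list is scanned per request; the large com/org/net zones are
# additionally bucketed by the last character of the second-level label.
# Illustrative shape of the generated DLIST (entries made up for the example):
#   {"unknown": ["*.google.*"], "com": {"k": ["*facebook.com"]}, "tw": ["*.gov.tw"]}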
suffixPtn = re.compile('(\\{0})$'.format('|\\'.join(SUFFIX_LIST)), flags=re.I)
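# suffixPtn matches one of the TLDs from SUFFIX_LIST at the end of a string;
# the compiled pattern has the form r'(\.ac|\.ad|...|\.services)$'.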
def parse_gfwlist_content(fobj):
    domains = set()
    searches = set()
    httpPtn = re.compile(r'^\|https?://')
    domainPtn = re.compile(r'^\|\|')
    ignorePtn = re.compile(r'^([[@!]|$)')
    slashPtn = re.compile(r'\\\/')
    restPtn = re.compile('^[.\w]')
    letterStartPtn = re.compile(r'^[a-z.]', flags=re.I)
    #suffixPtn = re.compile('(\\{0})$'.format('|\\'.join(SUFFIX_LIST)), flags=re.I)
    #print '(\\{0})$'.format('|\\'.join(SUFFIX_LIST))
    wcMatPtn = re.compile(r'^(.+)(\*[^*]+)+$')
    endwithdot = {True:'*', False:''}
    for line in fobj:
        domain = ''
        line = line.strip()
        # skip comments, section markers, whitelist rules and empty lines
        if ignorePtn.match(line):
            continue
        if line.startswith('/') and line.endswith('/'):
            # Regex rules are too complicated to parse, and only a couple of
            # addresses are actually written as regexes; I'd rather add their
            # corresponding wildcard patterns manually.
            continue
        if line.endswith('/'):
            line = line.rstrip('/')
        if line.startswith('search*'):
            searches.add('*/'+line)
        elif letterStartPtn.match(line):
            if suffixPtn.search(line):
                domain = line
            else:
                if '/' in line:
                    line = line.split('/')[0]
                mat = wcMatPtn.match(line)
                if mat:
                    # strip trailing wildcard segments until the remainder
                    # ends with a known TLD
                    while True:
                        domain = mat.group(1)
                        if suffixPtn.search(domain):
                            break
                        mat = wcMatPtn.match(domain)
                        if not mat:
                            break
            if domain:
                if line.startswith('.'):
                    domains.add(domain.lstrip('.'))
                    domains.add('*'+domain)
                else:
                    domains.add(domain)
                    domains.add('*.'+domain)
        elif httpPtn.match(line):
            pr = urlparse.urlparse(line.lstrip('|'))
            if pr.netloc:
                domains.add(pr.netloc)
        elif domainPtn.match(line):
            _d = line.lstrip('|')
            domains.add('*.'+_d)
            domains.add(_d)
    domains.add('*.google.*')
    domains.add('*.blogspot.*')
    domains.add('*facebook.com')
    discards = set()
    for d in domains:
        if d.startswith('*.www.') or d.endswith('.'):
            discards.add(d)
    domains.difference_update(discards)
    return (domains, searches)
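# Rough illustration of the output (input rules made up for the example): the
# rules "||example.com" and "|http://sub.example.org/path" yield a domains set
# containing 'example.com', '*.example.com' and 'sub.example.org' (plus the
# hard-coded google/blogspot/facebook entries), while searches stays empty.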
def main(inputs, output, proxy, fetchgfwlist=True, hasshexpmatch=False):
    fobjList = []
    domainsDict = defaultdict(lambda:[])
    domainsDict['com'] = defaultdict(lambda:[])
    domainsDict['org'] = defaultdict(lambda:[])
    domainsDict['net'] = defaultdict(lambda:[])
    outputhandle = open(output, 'wb')
    try:
        if fetchgfwlist:
            resp = urllib2.urlopen(GFWLIST_URL)
            fobjList.append(StringIO(base64.b64decode(resp.read())))
        for i in inputs:
            fobjList.append(open(i))
        domains, searches = parse_gfwlist_content(chain(*fobjList))
        for l in domains:
            if suffixPtn.search(l):
                xs = l.split('.')
                if xs[-1] in ['com', 'org', 'net']:
                    domainsDict[xs[-1]][xs[-2][-1]].append(l)
                else:
                    domainsDict[xs[-1]].append(l)
            else:
                domainsDict['unknown'].append(l)
        tl = string.Template(PAC_VAR)
        outputhandle.write(tl.substitute({
            'proxy':proxy,
            'domains':json.dumps(domainsDict),
            'search':json.dumps(tuple(i for i in searches), indent=1)}))
        outputhandle.write('\n')
        if hasshexpmatch:
            outputhandle.write(SHEXPMATCH)
            outputhandle.write('\n')
        outputhandle.write(PAC_FUNC)
    finally:
        for i in fobjList:
            i.close()
        outputhandle.close()
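# main() can also be called directly from Python; a minimal sketch (the
# argument values here are placeholders, not defaults):
#   main([], 'autoconfig.pac', 'SOCKS 127.0.0.1:1080', fetchgfwlist=True)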
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--proxy-type', choices=['PROXY', 'SOCKS'], dest='proxy', default='SOCKS',
                        help='proxy server type, use PROXY (http) or SOCKS (default: SOCKS)')
    parser.add_argument('-a', '--proxy-addr', dest='addr', default='127.0.0.1',
                        help='proxy server address (default: 127.0.0.1)')
    parser.add_argument('-p', '--proxy-port', dest='port', type=int, required=True,
                        help='proxy server listening port')
    parser.add_argument('-g', '--fetch-gfwlist', action='store_true', dest='fetchgfwlist', default=False,
                        help='fetch gfwlist from <{0}>'.format(GFWLIST_URL))
    parser.add_argument('-s', '--shexpmatch', action='store_true', dest='shexpmatch', default=False,
                        help='use own shExpMatch, instead of depending on the host implementation')
    parser.add_argument('-o', '--output', dest='output', default='autoconfig.pac',
                        help='output file (default: autoconfig.pac)')
    parser.add_argument('USERFILE', nargs='*', help='user provided gfwlist compatible file(s)')
    args = parser.parse_args()
    if not args.fetchgfwlist and len(args.USERFILE) == 0:
        raise SystemExit('nothing to do')
    proxy = '{0} {1}:{2}'.format(args.proxy, args.addr, args.port)
    main(args.USERFILE, args.output, proxy, args.fetchgfwlist, args.shexpmatch)