Skip to content

Instantly share code, notes, and snippets.

@lanceliao
Created August 14, 2014 05:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lanceliao/e127d10dd6096bb5c064 to your computer and use it in GitHub Desktop.
Save lanceliao/e127d10dd6096bb5c064 to your computer and use it in GitHub Desktop.
Generate a list of dnsmasq (with ipset) rules for blocked Alexa Top 1000 domains
#!/usr/bin/env python
#coding=utf-8
#
# Generate a list of dnsmasq (with ipset) rules for the
# Alexa Top 1000 domains that are censored in China
#
# Copyright (C) 2014 http://www.shuyz.com
#
import urllib2
import re
import os
import datetime
# Scrape the GreatFire "alexa-top-1000-domains" search result pages and write
# dnsmasq rules (server= and ipset=) for every listed domain whose reported
# blocked percentage is not 0%.

# URL template of the search result; %s is replaced by the page number
baseurl = 'https://en.greatfire.org/search/alexa-top-1000-domains?page=%s'
# group 1: link text of the result row (used as the domain name)
# group 2: blocked percentage, e.g. '100%'
pattern = r'<a href=".*">(.*?)<\/a><\/td><td>.*<\/td><td class=".*\d+%;">(\d+%)<\/td>'
# DNS server (address and port) written into the server= rules
mydnsip = '127.0.0.1'
mydnsport = '1053'
outfile = 'gfw_alexa1000.conf'

# remember every domain already seen, in case of duplicate records
domainlist = []

# The with-statement closes the output file even if a page fetch raises.
with open(outfile, 'w') as fs:
    fs.write('# GFW blocked Alexa Top 1000 Domains for dnsmasq\n')
    fs.write('# updated on ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '\n')
    fs.write('#\n')

    for i in range(0, 11):
        # Substitute the page number into the template; fetching the raw
        # template would request the literal '%s' page on every iteration.
        url = baseurl % i
        print('fetching page ' + url)
        response = urllib2.urlopen(url, timeout=15)
        try:
            content = response.read()
        finally:
            response.close()
        print('page content fetched, analysing...')

        for item in re.findall(pattern, content):
            if item[0] in domainlist:
                print(item[0] + ' exists.')
                continue
            domainlist.append(item[0])
            print(item[0] + ' is ' + item[1] + ' blocked.')
            # Only domains with a non-zero blocked percentage get rules.
            if item[1] != '0%':
                fs.write('server=/.%s/%s#%s\n' % (item[0], mydnsip, mydnsport))
                fs.write('ipset=/.%s/gfw_alexa1000\n' % item[0])

print('done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment