Created
June 11, 2017 18:08
-
-
Save pzb/5aba13a67bd9fa64b3769397c842889b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Only used for tests | |
require 'simpleidn' | |
# A node in a trie of Public Suffix List rules.
#
# Rules are stored one label per level, TLD first (the rule "co.uk" is the
# path "uk" -> "co"). A node is a terminus when a rule ends on it; @exception
# records whether that rule was an exception ("!") rule. The label "*" is the
# wildcard and matches any single label.
class PSLNode
  def initialize
    @children = {}
    @terminus = false
    @exception = false
  end

  # Inserts a rule below this node.
  #
  # x     - Array of the rule's remaining labels, TLD first. Not modified.
  # excpt - true when the rule is an exception rule.
  #
  # Raises RuntimeError ('Duplicate rule') when a rule already ends here.
  def add_rule(x, excpt)
    lbl, *rest = x
    if lbl.nil?
      raise 'Duplicate rule' if @terminus
      @terminus = true
      @exception = excpt
      return
    end
    @children[lbl] ||= PSLNode.new
    @children[lbl].add_rule(rest, excpt)
  end

  # Finds the registrable domain for a list of labels.
  #
  # Every rule reachable from this node that matches a prefix of lbls is
  # considered (both the literal child and the "*" child are followed), then
  # the prevailing rule is chosen: an exception rule wins, otherwise the
  # longest rule wins. Looking at all matches matters because a path can
  # run through nodes that exist only as part of a longer rule; a dead end
  # there must fall back to the shorter rule that did match.
  #
  # lbls         - Array of domain labels, TLD first. Not modified.
  # matched_lbls - Array of labels already consumed above this node; it is
  #                prepended to the result. Not modified.
  #
  # Returns a new Array of labels (TLD first) for the registrable domain, or
  # nil when no rule matches or the labels are themselves a public suffix.
  def get_regdom(lbls, matched_lbls = [])
    longest = nil
    exception_len = nil
    each_match(lbls) do |len, exception|
      if exception
        exception_len = len if exception_len.nil? || len > exception_len
      elsif longest.nil? || len > longest
        longest = len
      end
    end

    # An exception rule's public suffix is the rule minus its leftmost label.
    suffix_len = exception_len ? exception_len - 1 : longest
    return nil if suffix_len.nil? || lbls.length <= suffix_len

    # Registrable domain = public suffix plus one more label.
    matched_lbls + lbls.first(suffix_len + 1)
  end

  # Yields every rule in this subtree, depth first with children visited in
  # sorted label order.
  #
  # prefix - String accumulated so far; each level prepends "<label>.".
  #
  # Yields the accumulated name and the rule's exception flag.
  def walk(prefix = '', &block)
    block.call(prefix, @exception) if @terminus
    @children.keys.sort.each do |label|
      @children[label].walk("#{label}.#{prefix}", &block)
    end
  end

  protected

  # Yields [rule_length, exception_flag] for each rule below this node that
  # matches the labels of lbls starting at index depth.
  def each_match(lbls, depth = 0, &block)
    block.call(depth, @exception) if @terminus
    lbl = lbls[depth]
    return if lbl.nil?

    # uniq avoids visiting the wildcard child twice for a literal "*" label.
    [lbl, '*'].uniq.each do |key|
      child = @children[key]
      child.each_match(lbls, depth + 1, &block) if child
    end
  end
end
# Public Suffix List lookup built on a PSLNode trie. Accepts rules in the
# list's textual form and answers registrable-domain queries for dotted names.
class PSLTree
  def initialize
    @root = PSLNode.new
    # Default rule, as per spec
    @root.add_rule(['*'], false)
  end

  # Adds one rule.
  #
  # rulestr - String such as "co.uk", "*.ck" or "!www.ck"; a leading "!"
  #           marks an exception rule. Not modified.
  def add_rule(rulestr)
    exception = rulestr.start_with?('!')
    rulestr = rulestr[1..-1] if exception
    @root.add_rule(rulestr.split('.').reverse, exception)
  end

  # Returns the registrable domain of dom as a lowercase String, or nil when
  # there is none.
  #
  # dom - String domain name; compared case-insensitively. Not modified, so
  #       frozen strings are accepted.
  #
  # Names with an empty label (a leading dot, or "a..b") yield nil. A
  # trailing dot is ignored because String#split drops trailing empty fields.
  def get_regdom(dom)
    lbls = dom.downcase.split('.').reverse
    return nil if lbls.any?(&:empty?)

    r = @root.get_regdom(lbls)
    r && r.reverse.join('.')
  end

  # walk and test are not used in the core code

  # Yields each stored rule's name and exception flag; see PSLNode#walk.
  def walk(&block)
    @root.walk(&block)
  end

  # Checks one expectation and prints the outcome.
  #
  # input    - String domain to look up.
  # expected - expected registrable domain; nil or the string 'null' means
  #            "no registrable domain".
  #
  # Both values are passed through SimpleIDN.to_unicode before comparing.
  # Returns true when the lookup result equals the expectation.
  def test(input, expected)
    ex = SimpleIDN.to_unicode(expected)
    ex = nil if ex == 'null'
    rd = get_regdom(SimpleIDN.to_unicode(input))
    result = (rd == ex)
    puts "test(#{input}, #{expected}) = #{result} (#{rd})"
    result
  end
end
# Marker found on the line that opens the private-domains section of the list.
PRIVATE_DELIM = '===BEGIN PRIVATE DOMAINS==='.freeze
# Lines beginning with this prefix are comments.
COMMENT_PREFIX = '//'.freeze
# A single whitespace character; a rule ends at the first one on its line.
SPACE_RE = /\p{Space}/
# A line that is empty or consists solely of whitespace.
SPACE_LINE = /\A\p{Space}*\z/

psl = PSLTree.new
privreg = false

# The Public Suffix List is a series of "\n"-separated lines; its path is
# taken from the first command-line argument.
IO.foreach(ARGV[0], "\n") do |line|
  # Skip the first line carrying the private-section marker and remember
  # that it has been seen.
  unless privreg
    if line.include?(PRIVATE_DELIM)
      privreg = true
      next
    end
  end

  # Whole-line comments and whitespace-only lines hold no rule.
  next if line.start_with?(COMMENT_PREFIX) || line =~ SPACE_LINE

  # Only the text up to the first whitespace is the rule.
  psl.add_rule(line.split(SPACE_RE).first)
end

# Dump every loaded rule, marking exception rules with a leading "!".
psl.walk do |name, is_exception|
  marker = is_exception ? '!' : ''
  puts "#{marker}#{name}"
end

# Lets the upstream test lines below keep their literal `null` arguments.
null = nil
# The following applies to lines below this point.
# From: https://raw.githubusercontent.com/publicsuffix/list/master/tests/test_psl.txt
# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/

# Each call is psl.test(domain, expected registrable domain); `null` means
# the domain has no registrable domain.

# Mixed case.
psl.test('COM', null)
psl.test('example.COM', 'example.com')
psl.test('WwW.example.COM', 'example.com')

# Leading dot.
psl.test('.com', null)
psl.test('.example', null)
psl.test('.example.com', null)
psl.test('.example.example', null)

# Unlisted TLD.
psl.test('example', null)
psl.test('example.example', 'example.example')
psl.test('b.example.example', 'example.example')
psl.test('a.b.example.example', 'example.example')

# Listed, but non-Internet, TLD (disabled).
# checkPublicSuffix('local', null);
# checkPublicSuffix('example.local', null);
# checkPublicSuffix('b.example.local', null);
# checkPublicSuffix('a.b.example.local', null);

# TLD with only 1 rule.
psl.test('biz', null)
psl.test('domain.biz', 'domain.biz')
psl.test('b.domain.biz', 'domain.biz')
psl.test('a.b.domain.biz', 'domain.biz')

# TLD with some 2-level rules.
psl.test('com', null)
psl.test('example.com', 'example.com')
psl.test('b.example.com', 'example.com')
psl.test('a.b.example.com', 'example.com')
psl.test('uk.com', null)
psl.test('example.uk.com', 'example.uk.com')
psl.test('b.example.uk.com', 'example.uk.com')
psl.test('a.b.example.uk.com', 'example.uk.com')
psl.test('test.ac', 'test.ac')

# TLD with only 1 (wildcard) rule.
psl.test('mm', null)
psl.test('c.mm', null)
psl.test('b.c.mm', 'b.c.mm')
psl.test('a.b.c.mm', 'b.c.mm')

# More complex TLD.
psl.test('jp', null)
psl.test('test.jp', 'test.jp')
psl.test('www.test.jp', 'test.jp')
psl.test('ac.jp', null)
psl.test('test.ac.jp', 'test.ac.jp')
psl.test('www.test.ac.jp', 'test.ac.jp')
psl.test('kyoto.jp', null)
psl.test('test.kyoto.jp', 'test.kyoto.jp')
psl.test('ide.kyoto.jp', null)
psl.test('b.ide.kyoto.jp', 'b.ide.kyoto.jp')
psl.test('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp')
psl.test('c.kobe.jp', null)
psl.test('b.c.kobe.jp', 'b.c.kobe.jp')
psl.test('a.b.c.kobe.jp', 'b.c.kobe.jp')
psl.test('city.kobe.jp', 'city.kobe.jp')
psl.test('www.city.kobe.jp', 'city.kobe.jp')

# TLD with a wildcard rule and exceptions.
psl.test('ck', null)
psl.test('test.ck', null)
psl.test('b.test.ck', 'b.test.ck')
psl.test('a.b.test.ck', 'b.test.ck')
psl.test('www.ck', 'www.ck')
psl.test('www.www.ck', 'www.ck')

# US K12.
psl.test('us', null)
psl.test('test.us', 'test.us')
psl.test('www.test.us', 'test.us')
psl.test('ak.us', null)
psl.test('test.ak.us', 'test.ak.us')
psl.test('www.test.ak.us', 'test.ak.us')
psl.test('k12.ak.us', null)
psl.test('test.k12.ak.us', 'test.k12.ak.us')
psl.test('www.test.k12.ak.us', 'test.k12.ak.us')

# IDN labels.
psl.test('食狮.com.cn', '食狮.com.cn')
psl.test('食狮.公司.cn', '食狮.公司.cn')
psl.test('www.食狮.公司.cn', '食狮.公司.cn')
psl.test('shishi.公司.cn', 'shishi.公司.cn')
psl.test('公司.cn', null)
psl.test('食狮.中国', '食狮.中国')
psl.test('www.食狮.中国', '食狮.中国')
psl.test('shishi.中国', 'shishi.中国')
psl.test('中国', null)

# Same as above, but punycoded.
psl.test('xn--85x722f.com.cn', 'xn--85x722f.com.cn')
psl.test('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn')
psl.test('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn')
psl.test('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn')
psl.test('xn--55qx5d.cn', null)
psl.test('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s')
psl.test('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s')
psl.test('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s')
psl.test('xn--fiqs8s', null)

# Extra check that is not part of the upstream test file.
psl.test('www.goog', 'goog')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment