Created
April 20, 2012 03:57
-
-
Save kastiglione/2425888 to your computer and use it in GitHub Desktop.
Generate a TLD regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
# monkey patching, but we're still cool right? | |
# monkey patching, but we're still cool right?
#
# Reopens Integer rather than Fixnum: Fixnum was folded into Integer in
# Ruby 2.4 and the constant was removed in Ruby 3.2, where `class Fixnum`
# would silently define a brand-new, unrelated class.
class Integer
  # Lets an integer be passed as a block (`&0`, `&-1`): the resulting
  # lambda indexes its argument with this integer.
  #
  # @return [Proc] a one-argument lambda returning `obj[self]`
  def to_proc
    ->(obj) { obj[self] }
  end
end
class Hash
  # Replace every value in place with the result of calling the block on
  # it; keys are left untouched.
  #
  # Delegates to the built-in Hash#transform_values!, so calling it
  # without a block returns an Enumerator.
  #
  # @yieldparam value [Object] the current value
  # @yieldreturn [Object] the replacement value
  # @return [Hash] self
  def map_values!(&block)
    transform_values!(&block)
  end
end
class String
  # Compress a string by replacing every run of 3 or more successive
  # characters with the first character, a hyphen, and the last character
  # ("abcd" becomes "a-d"). Runs of one or two characters are kept as is.
  #
  # "Successive" means `prev.succ == char`, so the input is expected to
  # already be in ascending order for the compression to find anything.
  #
  # @return [String] self when there are 2 or fewer characters (nothing
  #   to compress), otherwise a new compressed string
  def sequence
    # nothing can be done with 2 or less letters
    return self if length <= 2

    each_char
      # split into maximal runs in which each character follows the previous
      .chunk_while { |prev, char| prev.succ == char }
      # only runs of at least 3 get shorter when written as "x-z"
      .map { |run| run.length >= 3 ? "#{run.first}-#{run.last}" : run.join }
      .join
  end
end
# Download the IANA list of top-level domains, one name per line.
# URI.open is required here: since Ruby 3.0 Kernel#open no longer opens
# URLs. The block form closes the handle as soon as the lines are read.
tlds = URI.open('http://data.iana.org/TLD/tlds-alpha-by-domain.txt') do |io|
  io.readlines.map(&:chomp)
end
# ignore blank lines, comments and IDNA TLDs
tlds.reject! { |tld| tld.empty? || tld.start_with?('#', 'XN--') }
# separate by size, the magic happens only on the 2 letter TLDs
biggie, smalls = tlds.partition { |tld| tld.length > 2 }
# group the tlds by their first letter
indexed = smalls.group_by(&0)
# indexed is now { 'a' => ['ab', 'ac', 'ad'] }
indexed.map_values! do |group|
  # keep only the last letter of the TLDs for each grouping
  group.map(&-1)
end
# indexed is now { 'a' => ['b', 'c', 'd'] }
# generate the regex segments: a first letter followed by either a single
# second letter or a character class of the possible second letters
small_regexes = indexed.map do |letter, mates|
  suffix = mates.length > 1 ? "[#{mates.join.sequence}]" : mates.join
  "#{letter}#{suffix}"
end
# combine to a single regex, long TLDs first so they are tried before the
# two-letter alternatives
# NOTE(review): the long TLDs keep the order of the downloaded list; if one
# is a prefix of a later one, the shorter alternative is tried first when
# the regex is used unanchored -- confirm this is acceptable for callers.
puts "/#{(biggie + small_regexes).join('|')}/i"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment