Created
September 16, 2010 12:41
-
-
Save ayosec/582351 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ git clone git://gist.github.com/582351.git html_entities
...
$ cd html_entities/
$ wget http://www.w3.org/2003/entities/2007xml/unicode.xml
...
$ ruby html_entities.rb > entities.rb
$ ruby html_entities.rb plain > entities.dat
$ ls -sh entities.*
28K entities.dat 68K entities.rb
$ irb
>> load "entities.rb"
=> true
>> HTMLEntities.size
=> 2264
>> puts HTMLEntities["Aacute"]
Á
=> nil
>> entities = File.read("entities.dat").split("\0").inject({}) {|hash, line| line = line.split(" ", 2); hash[line[0]] = line[1]; hash }; entities.size
=> 2264
>> puts entities["euro"]
€
=> nil
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds a table mapping entity names to characters from unicode.xml and
# prints it on standard output in one of two formats.
#
# Download unicode.xml from http://www.w3.org/2003/entities/2007xml/unicode.xml
# and place it in the current directory before running.
#
# Usage:
#   ruby html_entities.rb             # same as "ruby-hash"
#   ruby html_entities.rb ruby-hash   # Ruby source defining HTMLEntities
#   ruby html_entities.rb plain       # "name character\0" records
require 'rubygems'
require 'nokogiri'

# Reject an unknown format before doing any work, instead of parsing the whole
# XML file and then silently printing nothing.
output_format = ARGV[0] || "ruby-hash"
unless ["ruby-hash", "plain"].include?(output_format)
  abort "Unknown output format #{output_format.inspect} (expected \"ruby-hash\" or \"plain\")"
end

unless File.exist?("unicode.xml")
  abort "unicode.xml not found; download it from http://www.w3.org/2003/entities/2007xml/unicode.xml"
end

entities = {}

Nokogiri::XML(File.read("unicode.xml")).search("entity").each do |entity|
  entity_name = entity["id"].to_s

  # \z (unlike \Z) does not tolerate a trailing newline, so only names made
  # purely of word characters are accepted.
  next unless entity_name =~ /\A\w+\z/

  # The parent's id must be "U" followed only by hexadecimal digits; any other
  # id shape (or a missing id) is skipped. Restricting the class to hex digits
  # keeps to_i(16) from quietly truncating at the first non-hex letter.
  unicode_id = entity.parent["id"].to_s[/\AU([0-9a-f]+)\z/i, 1]
  next unless unicode_id

  entities[entity_name] = [unicode_id.to_i(16)].pack('U')
end

case output_format
when "ruby-hash"
  # Generate a nice Ruby hash.
  # It can be loaded directly in a Ruby script.
  puts "HTMLEntities = {"
  entities.sort.each do |entity_name, character|
    puts " #{entity_name.inspect} => #{character.inspect},"
  end
  puts "}"
when "plain"
  # Plain file. Entities are separated by \0; name and character are separated by a space.
  # It can be loaded with
  #   ruby> File.read("entities").split("\0").inject({}) {|hash, line| line = line.split(" ", 2); hash[line[0]] = line[1]; hash }
  entities.sort.each do |entity_name, character|
    print entity_name, " ", character, "\0"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment