Skip to content

Instantly share code, notes, and snippets.

@afair
Created June 5, 2012 20:37
Show Gist options
  • Save afair/2877685 to your computer and use it in GitHub Desktop.
Save afair/2877685 to your computer and use it in GitHub Desktop.
UData - A Ruby 1.9 Unicode Demonstration Class
#!/usr/bin/env ruby
# encoding: UTF-8
###############################################################################
# Ruby 1.9 Unicode Demonstration
###############################################################################
require 'rubygems'
require 'unicode' # gem unicode
require 'unicode_utils' # gem unicode_utils
require 'rchardet19' # gem rchardet19
require 'punycode' # gem punycode4r
# Wraps a String and exposes Unicode-aware helpers: transcoding, ASCII
# detection, diacritic removal, quote simplification, normalization, case
# mapping and Punycode conversion.
#
# The Unicode, UnicodeUtils and Punycode constants used below are expected to
# be loaded by the enclosing file.
class UData
  # data     - the wrapped String.
  # encoding - plain accessor; nothing in this class assigns it.
  attr_accessor :data, :encoding

  # data - the String to wrap.
  def initialize(data)
    @data = data
  end

  # Reads a whole file and returns its contents as a String (not a UData).
  #
  # filename      - path of the file to read.
  # file_encoding - encoding the file is stored in (defaults to 'UTF-8').
  # as_encoding   - encoding to transcode to while reading (defaults to
  #                 file_encoding).
  def self.read(filename, file_encoding = nil, as_encoding = nil)
    source_encoding = file_encoding || 'UTF-8'
    # The block's value is returned directly; no class-level state is kept.
    File.open(filename, 'r',
              internal_encoding: as_encoding || source_encoding,
              external_encoding: source_encoding) do |f|
      f.read
    end
  end

  # Writes the wrapped string to a file.
  #
  # filename    - path of the file to (over)write.
  # as_encoding - encoding to store the file in (defaults to the string's
  #               current encoding).
  #
  # Returns whatever IO#write returns for the block.
  def write(filename, as_encoding = nil)
    current_encoding = @data.encoding.name
    File.open(filename, 'w',
              internal_encoding: current_encoding,
              external_encoding: as_encoding || current_encoding) do |f|
      f.write @data
    end
  end

  # Returns the wrapped string itself.
  def to_s
    @data
  end

  # Transcodes str to the named encoding.
  #
  # str      - the String to convert.
  # encoding - target encoding name, e.g. 'UTF-8'.
  # opt      - extra Encoding::Converter options; by default invalid and
  #            undefined characters are replaced rather than raising.
  #
  # Returns str untouched when its encoding name already equals `encoding`.
  def self.to_encoding(str, encoding, opt = {})
    return str if str.encoding.name == encoding

    # NOTE(review): the options are passed as a positional Hash, which is the
    # Ruby 1.9 calling convention; newer Rubies may need `**` here - confirm
    # against the Ruby version this file targets.
    converter = Encoding::Converter.new(str.encoding.name, encoding,
                                        { invalid: :replace, undef: :replace }.merge(opt))
    converter.convert(str)
  end

  # Returns the wrapped string transcoded to `encoding` (see UData.to_encoding).
  def to_encoding(encoding, opt = {})
    UData.to_encoding(@data, encoding, opt)
  end

  # Transcodes the wrapped string in place and returns the new value.
  # `opt` is forwarded to UData.to_encoding.
  def to_encoding!(encoding, opt = {})
    @data = to_encoding(encoding, opt)
  end

  # Returns the wrapped string as UTF-8.
  def to_utf8
    to_encoding('UTF-8')
  end

  # Returns true when every character of str is 7-bit ASCII (an empty string
  # counts as ASCII), false otherwise.
  #
  # The whole string is examined; a per-line regexp anchor would accept a
  # multi-line string as soon as any single line is ASCII.
  def self.is_ascii?(str)
    str.ascii_only?
  end

  # Returns true when the wrapped string is entirely 7-bit ASCII.
  def is_ascii?
    UData.is_ascii?(@data)
  end

  # Returns the wrapped string converted to ASCII, with every character that
  # cannot be represented replaced by the empty string.
  def remove_non_ascii
    to_encoding('ASCII', replace: '')
  end

  # Returns a form suitable for loose matching: diacritics removed, typographic
  # quotes replaced by ASCII quotes, then lower-cased. Each step operates on
  # the result of the previous one.
  def searchable
    Unicode.downcase(UData.simple_quotes(remove_diacritics))
  end

  # Decomposes str (NFKD) and strips the combining marks that follow a letter.
  # Marks that do not follow a letter are left alone.
  def self.remove_diacritics(str)
    UnicodeUtils.nfkd(str).gsub(/(\p{Letter})\p{Mark}+/, '\\1')
  end

  # Returns the wrapped string, as UTF-8, with diacritics removed.
  #
  # Approaches that were rejected:
  # * stripping every non-ASCII character after decomposition - removes far
  #   more than diacritics;
  # * stripping every \p{Mark} - also removes marks attached to non-letters.
  def remove_diacritics
    UData.remove_diacritics(to_utf8)
  end

  # Removes diacritics from the wrapped string in place; returns the new value.
  def remove_diacritics!
    @data = remove_diacritics
  end

  # Replaces typographic double and single quotation marks in str with the
  # ASCII characters " and '.
  def self.simple_quotes(str)
    doubles_done = str.tr("\u00ab\u00bb\u201c\u201d\u201e\u201f\u301d\u301e\u301f\uff02", '"')
    doubles_done.tr("\u2018\u2019\u201a\u201b\u2039\u203a\uff07", "'")
  end

  # Returns the wrapped string with typographic quotes replaced by ASCII ones.
  def simple_quotes
    UData.simple_quotes(@data)
  end

  # Normalization rewrites a string in a standard decomposition/composition
  # format so that data using different code points for the same characters
  # can be compared.
  #
  # Returns the wrapped string, as UTF-8, in NFKD (normalization form KD,
  # compatibility decomposition).
  def normalize
    Unicode.nfkd(to_utf8)
  end

  # Normalizes the wrapped string in place; returns the new value.
  def normalize!
    @data = normalize
  end

  # Upper-cases the wrapped string via Unicode.upcase.
  def upcase
    Unicode.upcase(@data)
  end

  # Lower-cases the wrapped string via Unicode.downcase.
  def downcase
    Unicode.downcase(@data)
  end

  # Capitalizes the wrapped string via Unicode.capitalize.
  def capitalize
    Unicode.capitalize(@data)
  end

  # Returns the result of Punycode.encode on the wrapped string.
  def to_punycode
    Punycode.encode(@data)
  end

  # Returns the result of Punycode.decode on the wrapped string.
  def from_punycode
    Punycode.decode(@data)
  end
end
# Sample strings in several scripts, used by the demonstration below.
# Keys are short symbol tags (presumably language/country codes - confirm);
# :sy mixes typographic quotes, a copyright sign and both precomposed and
# decomposed forms of "e with acute".
# The Hash is frozen so the constant cannot be modified by accident.
DATA = {
  fr: "Résumé",
  en: "Hello world",
  cn: "你好世界",
  jp: "こんにちは、世界",
  ar: "مرحبا العالم",
  gr: "Γεια σας κόσμο",
  he: "שלום עולם",
  ru: "привет мир",
  vn: "Xin chào thế giới",
  ko: "안녕하세요 세계",
  ge: "weiß",
  sy: "\u201cQuote\u2019s\u201d \u00a92012 R\u00e9sum\u0065\u0301", # “Quote’s” ©2012 Résumé
}.freeze
# For every sample string, print the string itself followed by the result of
# each transformation, one per line, then a blank separator line.
transformations = [
  [" Upcase: ", :upcase],
  [" Remove Diacritics: ", :remove_diacritics],
  [" Simple Quotes: ", :simple_quotes],
  [" Normalize: ", :normalize],
  [" Punycode: ", :to_punycode],
]
DATA.each_value do |text|
  sample = UData.new(text)
  puts "UData: #{sample}"
  transformations.each do |label, action|
    puts "#{label}#{sample.public_send(action)}"
  end
  puts ""
end
# Punycode-encode a URL whose host is made of symbol characters.
symbol_url = UData.new("http://☁→❄→☃→☀→☺→☂→☹→✝.ws/")
puts symbol_url.to_punycode

# Read the file 'u' and report the encoding of the resulting string.
u_text = UData.read('u')
puts u_text.encoding.name

# Read the file 'latin1'; report the encoding and print the contents.
latin1_text = UData.read('latin1')
puts latin1_text.encoding.name
puts latin1_text

#d = UData.new(DATA[:fr])
#
#f.normalize!
#lines = f.data.split(/\n/)
#p lines.sort.inspect

# Compare two spellings of the same word: the first ends in "e" followed by a
# combining acute accent, the second ends in the precomposed character.
with_combining_mark = "R\u00e9sum\u0065\u0301"
precomposed = "R\u00e9sum\u00e9"
puts "#{with_combining_mark} == #{precomposed}? #{with_combining_mark == precomposed}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment