Created
October 31, 2012 08:14
-
-
Save heathd/3985793 to your computer and use it in GitHub Desktop.
Character Encoding Cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby -w | |
require 'pp' | |
require 'set' | |
require 'colorize' | |
require 'forwardable' | |
# Command-line handling: an input path is required, an output path is optional.
if ARGV.empty?
  # Interpolate the script path; a bare __FILE__ inside a string literal is
  # printed verbatim rather than expanded.
  puts "Usage: #{__FILE__} <input file> [<output file>]"
  exit(1)
end

input_file = ARGV[0]
output_file = ARGV[1]

# Read the input as raw bytes. The block form closes the file handle as soon
# as the read completes instead of leaving it open until garbage collection.
data = File.open(input_file, 'r:binary') { |f| f.read }
# Splits a string into alternating runs that do not match and that do match a
# regular expression, recording where each run sits inside the original data.
class FilePartitioner
  # @param regexp [Regexp] pattern identifying the runs of interest
  def initialize(regexp)
    @regexp = regexp
  end

  # Scans +data+ left to right and returns the extents in order of appearance:
  # non-matching, matching, non-matching, ... The list always starts and ends
  # with a non-matching extent, either of which may be empty.
  #
  # NOTE: scanning stops at the first empty match, so a pattern that can match
  # the empty string ends the scan early.
  #
  # @param data [String] text to partition; a frozen private copy is shared
  #   by every returned extent
  # @return [Array<Extent>]
  def partition(data)
    data = data.dup.freeze
    parts = []
    pos = 0
    tail = data
    loop do
      head, match, tail = tail.partition(@regexp)
      # An empty head yields an extent whose +to+ is one less than its +from+.
      parts << Extent.new(data, pos, pos + head.size - 1)
      break if match.empty?

      match_start = pos + head.size
      pos = match_start + match.size
      parts << Extent.new(data, match_start, pos - 1)
    end
    parts
  end

  # An inclusive +from+..+to+ slice of +data+. Extents compare, hash and test
  # equality by the text they cover, not by their position.
  class Extent < Struct.new(:data, :from, :to)
    include Comparable

    # Characters stripped from context output: code points 0-31 and 0x80-0x9F.
    # Built once instead of on every call.
    BAD_CHARS = ((0..31).map(&:chr) + (0x80...0xa0).map(&:chr)).join("").freeze

    # @return [String] the text covered by this extent ("" when it is empty)
    def value
      length = to - from + 1
      # A range slice would be wrong here: for the empty extent at the very
      # start (from = 0, to = -1), data[0..-1] is the whole string.
      length > 0 ? data[from, length] : data[0, 0]
    end

    def to_s
      value
    end

    # Renders the extent highlighted (via the colorize String extensions) with
    # up to +context_size+ characters of surrounding text on each side.
    #
    # @param context_size [Integer] characters of context before and after
    # @return [String]
    def in_context(context_size = 30)
      context_start = [from - context_size, 0].max
      # +1 because +to+ is inclusive: the context after the match begins at
      # to + 1 and should be as long as the context before it.
      context_end = [to + 1 + context_size, data.size].min
      pre_context = data[context_start...from]
      post_context = data[(to + 1)...context_end]
      sanitize(pre_context) + to_s.inspect.white.on_red + sanitize(post_context)
    end

    # @return [String] +str+ with every character listed in badchars removed
    def sanitize(str)
      str.delete(badchars)
    end

    # @return [String] the characters removed by #sanitize
    def badchars
      BAD_CHARS
    end

    # Orders extents by covered text; nil for objects that expose no +value+.
    def <=>(other)
      return nil unless other.respond_to?(:value)

      value <=> other.value
    end

    def hash
      value.hash
    end

    def eql?(other)
      other.respond_to?(:value) && value == other.value
    end
  end
end
# Table of bad byte sequences and their replacements, persisted as a text file
# with one "\xNN\xNN...:replacement" entry per line. A replacement of nil
# (written to the file as "TODO") marks a sequence that has been seen but not
# yet resolved.
class Mappings
  attr_reader :mappings

  def initialize
    @mappings = {}
    load
  end

  # Reads the mappings file into @mappings. A missing file is not an error;
  # the table simply stays as it is.
  def load
    # Block form so the handle is closed as soon as the read is done.
    contents = File.open(filename, 'r:binary') { |f| f.read }
    contents.chomp.split("\n").each do |line|
      next if line.empty? # tolerate blank lines

      bad_sequence, replacement = parse_line(line)
      # An empty search string would make #fix insert the replacement between
      # every character of the data, so such entries are ignored.
      next if bad_sequence.empty?

      @mappings[bad_sequence] = replacement
    end
  rescue Errno::ENOENT
    []
  end

  # Parses one line of the mappings file.
  #
  # @param line [String] e.g. '\xC2\xA3:GBP'
  # @return [Array(String, String|nil)] the decoded byte sequence and its
  #   replacement; the replacement is nil when the line says TODO and "" when
  #   nothing follows the colon
  def parse_line(line)
    # Split on the first colon only, so a replacement may itself contain ":".
    bad_ascii, replacement = line.split(":", 2)
    bad_sequence = bad_ascii.to_s.split('\x').reject(&:empty?).map { |byte| byte.to_i(16).chr }.join("")
    [bad_sequence, replacement == 'TODO' ? nil : (replacement || "")]
  end

  # Writes every mapping to +file+, or to the default mappings file when
  # +file+ is nil.
  #
  # @param file [String, nil] destination path
  def save(file = nil)
    File.open(file || filename, 'w:binary') do |f|
      @mappings.each do |bad_sequence, replacement|
        # Always two hex digits per byte, so small byte values keep the
        # familiar \xNN shape; parse_line accepts either width.
        bytestring = bad_sequence.each_byte.map { |byte| format('\x%02X', byte) }.join
        f << bytestring << ":" << (replacement || "TODO") << "\n"
      end
    end
  end

  def filename
    "mappings.txt"
  end

  # @return [Boolean] whether +bad+ is already listed, resolved or not
  def include?(bad)
    @mappings.key?(bad.to_s)
  end

  # @return [Boolean] whether +bad+ is listed with a replacement
  def done?(bad)
    !@mappings[bad.to_s].nil?
  end

  # Records +bad+ as an unresolved sequence unless it is already listed.
  def add(bad)
    @mappings[bad.to_s] = nil unless @mappings.key?(bad.to_s)
  end

  # Returns a copy of +data+ with the known replacements applied.
  #
  # Mappings are applied in descending key order, so a longer sequence is
  # replaced before any shorter sequence that is a prefix of it. Application
  # stops at the first mapping that has no replacement yet; everything sorted
  # after it is left alone.
  # NOTE(review): stopping there looks deliberate (presumably so that shorter
  # sequences cannot mangle an unresolved longer one) - confirm before changing.
  #
  # @param data [String]
  # @return [String]
  def fix(data)
    new_data = data.dup
    in_order = @mappings.sort_by { |bad_sequence, _| bad_sequence }.reverse
    in_order.take_while { |_, replacement| !replacement.nil? }.each do |bad_sequence, replacement|
      # Block form inserts the replacement literally; a string argument would
      # have backslash sequences such as \0 or \1 interpreted by gsub!.
      new_data.gsub!(bad_sequence) { replacement }
    end
    new_data
  end

  # @return [Boolean] whether +str+ is the replacement text of some mapping
  def is_replacement_target?(str)
    @mappings.values.include?(str)
  end
end
# Apply every mapping that is already known to the input data.
mappings = Mappings.new
data = mappings.fix(data)

# The cleaned data is written out only when an output path was supplied.
File.open(output_file, 'w:binary') { |out| out.write(data) } if output_file

# Whatever runs of high bytes remain are candidates for new mappings.
partitioner = FilePartitioner.new(/[\x80-\xff]+/n)
partitioner.partition(data).each_slice(2) do |_good, bad|
  # The final slice holds only the trailing non-matching extent.
  next if bad.nil?
  # Text that is itself the result of a replacement is not a problem.
  next if mappings.is_replacement_target?(bad.to_s)

  mappings.add(bad)
  # Show unresolved sequences in context so a replacement can be chosen.
  puts bad.in_context unless mappings.done?(bad)
end

# Persist the table, including any sequences newly recorded above.
mappings.save
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xA3:£ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x84\xA2:’ | |
\xC3\x83\xC6\x92\xC3\x82\xC6\x92\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xA3:£ | |
\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xB5:µ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xCB\x9C:‘ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAC\xC3\x83\xC6\x92\xC3\x82\xE2\x80\xB9\xC3\x83\xE2\x80\x9A\xC3\x82\xC5\x93:' | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAC\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\x9D:— | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9E\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xA2:™ | |
\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82: | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9C:– | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAC\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xB2:’ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xC5\x93:“ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\x9D:” | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x82\xAC\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xA6:… | |
\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAE:® | |
\xC3\x83\xC6\x92\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAD: | |
\xC3\x83\xC6\x92\xC3\x82\xC6\x92\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC2\xA3:£ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAC\xC2\xB2:’ | |
\xC3\x83\xC6\x92\xC3\x82\xC2\xA2\xC3\x83\xE2\x80\x9A\xC3\x82\xE2\x80\x9A\xC3\x83\xE2\x80\x9A\xC3\x82\xC2\xAC\xE2\x80\x9D:— |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment