Last active
May 9, 2019 06:51
-
-
Save TellowKrinkle/529d560d4ecf1131e1f7a8810c31f631 to your computer and use it in GitHub Desktop.
Guesses an unknown UInt16 → Character file encoding of a file based on some strings known to exist in it (Remember to compile with optimizations!)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
extension String { | |
func leftpad(to size: Int, with char: Character) -> String { | |
if count >= size { return self } | |
return String(repeating: char, count: size - count) + self | |
} | |
} | |
guard CommandLine.arguments.count > 2 else { | |
FileHandle.standardError.write(Data(""" | |
Usage: \(CommandLine.arguments[0]) unknownEncodingFile knownSentences.txt | |
The program will attempt to figure out which 16-bit characters map to which sentences by looking for strings in knownSentences.txt | |
knownSentences.txt should be a newline-separated list of sentences that are known to exist in the file with unknown encoding\n | |
""".utf8)) | |
exit(EXIT_FAILURE) | |
} | |
let data = try Data(contentsOf: URL(fileURLWithPath: CommandLine.arguments[1])) | |
let words = data.withUnsafeBytes { $0.bindMemory(to: UInt16.self).map { UInt16(bigEndian: $0) } } | |
let sentences = try String(contentsOf: URL(fileURLWithPath: CommandLine.arguments[2])).split(whereSeparator: { CharacterSet.whitespacesAndNewlines.contains($0.unicodeScalars.first!) }).map(String.init) | |
func getPossibleMappings(_ str: String, in: UnsafeBufferPointer<UInt16>, knownMappings: [Character: UInt16], possibleMappings: [Character: Set<UInt16>]) -> [Character: Set<UInt16>] { | |
let inPtr = `in`.baseAddress! | |
// Mapping of codepoints to whether they've already been assigned | |
let inUse = UnsafeMutablePointer<Bool>.allocate(capacity: 65536) | |
inUse.initialize(repeating: false, count: 65536) | |
defer { inUse.deinitialize(count: 65536); inUse.deallocate() } | |
for known in knownMappings.values { inUse[Int(known)] = true } | |
// Holds resulting string mappings | |
var results: Set<[UInt16]> = [] | |
// The mappings that are required | |
var required: [(stringOffset: Int, required: UInt16)] = [] | |
// Conversion from characters to their offsets in the mapping array | |
var used: [Character: Int] = [:] | |
// The mappings that are variable | |
var variable: [(stringOffset: Int, mappingArrayOffset: Int)] = [] | |
var allowedVariable: [Set<UInt16>?] = [] | |
// Number of unique variable mappings | |
var uniqueVariable = 0 | |
for (offset, character) in str.enumerated() { | |
if let mapped = knownMappings[character] { | |
required.append((offset, mapped)) | |
} | |
else if let existing = used[character] { | |
variable.append((offset, existing)) | |
} | |
else { | |
allowedVariable.append(possibleMappings[character]) | |
used[character] = uniqueVariable | |
variable.append((offset, uniqueVariable)) | |
uniqueVariable += 1 | |
} | |
} | |
let length = required.count + variable.count | |
let mapping = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: uniqueVariable) | |
mapping.initialize(repeating: 0) | |
defer { mapping.deallocate() } | |
let mappingPtr = mapping.baseAddress! // For avoiding bounds checks the compiler can't check properly | |
outerFor: for index in 0..<(`in`.count - length) { | |
let ptr = inPtr.advanced(by: index) | |
// Verify that all required characters match | |
for (offset, requiredValue) in required { | |
if ptr[offset] != requiredValue { | |
continue outerFor | |
} | |
} | |
var filled = -1 | |
defer { | |
for index in (0..<(filled + 1)) { | |
inUse[Int(mappingPtr[index])] = false | |
} | |
} | |
// Check if we can map variable characters in | |
for (offset, mappingIndex) in variable { | |
let char = ptr[offset] | |
if mappingIndex > filled { | |
// New character we haven't seen before | |
if inUse[Int(char)] { | |
continue outerFor | |
} | |
inUse[Int(char)] = true | |
mappingPtr[mappingIndex] = char | |
filled = mappingIndex | |
} | |
else { | |
// Character that has already been assigned | |
if mappingPtr[mappingIndex] != char { | |
continue outerFor | |
} | |
} | |
} | |
// Fully mapped! Let's try to save | |
precondition(filled == uniqueVariable - 1) | |
results.insert(Array(mapping)) | |
} | |
results = results.filter { | |
$0.enumerated().allSatisfy { | |
allowedVariable[$0.offset] == nil || allowedVariable[$0.offset]!.contains($0.element) | |
} | |
} | |
if results.isEmpty { return [:] } | |
var out = [Character: Set<UInt16>]() | |
for (character, offset) in used { | |
out[character] = Set(results.map { $0[offset] }) | |
} | |
return out | |
} | |
struct MappingTracker { | |
public private(set) var possible: [Character: Set<UInt16>] = [:] | |
public private(set) var known: [Character: UInt16] = [:] | |
var updated = false | |
mutating func addMapping(from: Character, to: Set<UInt16>) { | |
var new = to | |
if let prevPossible = possible[from] { | |
new = prevPossible.intersection(to) | |
if new.count != prevPossible.count { updated = true } | |
} | |
else { | |
updated = true | |
} | |
if new.count == 1 { | |
possible[from] = nil | |
known[from] = new.first! | |
} | |
else { | |
possible[from] = new | |
} | |
} | |
} | |
do { | |
// Sort the sentences by the ones with the most repeated characters (since those are the most selective) | |
var currentSentences = sentences.sorted(by: { $0.count - Set($0).count > $1.count - Set($1).count }) | |
var mappings = MappingTracker() | |
var full = true | |
while true { | |
mappings.updated = false | |
var done: Set<String> = [] | |
for sentence in currentSentences { | |
let newMappings = words.withUnsafeBufferPointer { | |
getPossibleMappings(sentence, in: $0, knownMappings: mappings.known, possibleMappings: mappings.possible) | |
} | |
if newMappings.isEmpty && sentence.contains(where: { mappings.known[$0] == nil }) { | |
full = false | |
print("Failed to map \(sentence)") | |
done.insert(sentence) | |
} | |
for (character, possibles) in newMappings { | |
mappings.addMapping(from: character, to: possibles) | |
} | |
if !sentence.contains(where: { mappings.known[$0] == nil }) { | |
done.insert(sentence) | |
} | |
} | |
if !mappings.updated { break } | |
if !done.isEmpty { currentSentences = currentSentences.filter { !done.contains($0) } } | |
} | |
print(mappings) | |
for (from, to) in mappings.known.sorted(by: { $0.value < $1.value }) { | |
print("0x\(String(to, radix: 16).leftpad(to: 4, with: "0")): \(from)") | |
} | |
if !full { | |
print("Wasn't able to map all requested sentences!") | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment