Skip to content

Instantly share code, notes, and snippets.

@TellowKrinkle
Last active May 9, 2019 06:51
Show Gist options
  • Save TellowKrinkle/529d560d4ecf1131e1f7a8810c31f631 to your computer and use it in GitHub Desktop.
Save TellowKrinkle/529d560d4ecf1131e1f7a8810c31f631 to your computer and use it in GitHub Desktop.
Guesses an unknown UInt16 → Character file encoding of a file based on some strings known to exist in it (Remember to compile with optimizations!)
import Foundation
extension String {
func leftpad(to size: Int, with char: Character) -> String {
if count >= size { return self }
return String(repeating: char, count: size - count) + self
}
}
guard CommandLine.arguments.count > 2 else {
FileHandle.standardError.write(Data("""
Usage: \(CommandLine.arguments[0]) unknownEncodingFile knownSentences.txt
The program will attempt to figure out which 16-bit characters map to which sentences by looking for strings in knownSentences.txt
knownSentences.txt should be a newline-separated list of sentences that are known to exist in the file with unknown encoding\n
""".utf8))
exit(EXIT_FAILURE)
}
let data = try Data(contentsOf: URL(fileURLWithPath: CommandLine.arguments[1]))
let words = data.withUnsafeBytes { $0.bindMemory(to: UInt16.self).map { UInt16(bigEndian: $0) } }
let sentences = try String(contentsOf: URL(fileURLWithPath: CommandLine.arguments[2])).split(whereSeparator: { CharacterSet.whitespacesAndNewlines.contains($0.unicodeScalars.first!) }).map(String.init)
func getPossibleMappings(_ str: String, in: UnsafeBufferPointer<UInt16>, knownMappings: [Character: UInt16], possibleMappings: [Character: Set<UInt16>]) -> [Character: Set<UInt16>] {
let inPtr = `in`.baseAddress!
// Mapping of codepoints to whether they've already been assigned
let inUse = UnsafeMutablePointer<Bool>.allocate(capacity: 65536)
inUse.initialize(repeating: false, count: 65536)
defer { inUse.deinitialize(count: 65536); inUse.deallocate() }
for known in knownMappings.values { inUse[Int(known)] = true }
// Holds resulting string mappings
var results: Set<[UInt16]> = []
// The mappings that are required
var required: [(stringOffset: Int, required: UInt16)] = []
// Conversion from characters to their offsets in the mapping array
var used: [Character: Int] = [:]
// The mappings that are variable
var variable: [(stringOffset: Int, mappingArrayOffset: Int)] = []
var allowedVariable: [Set<UInt16>?] = []
// Number of unique variable mappings
var uniqueVariable = 0
for (offset, character) in str.enumerated() {
if let mapped = knownMappings[character] {
required.append((offset, mapped))
}
else if let existing = used[character] {
variable.append((offset, existing))
}
else {
allowedVariable.append(possibleMappings[character])
used[character] = uniqueVariable
variable.append((offset, uniqueVariable))
uniqueVariable += 1
}
}
let length = required.count + variable.count
let mapping = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: uniqueVariable)
mapping.initialize(repeating: 0)
defer { mapping.deallocate() }
let mappingPtr = mapping.baseAddress! // For avoiding bounds checks the compiler can't check properly
outerFor: for index in 0..<(`in`.count - length) {
let ptr = inPtr.advanced(by: index)
// Verify that all required characters match
for (offset, requiredValue) in required {
if ptr[offset] != requiredValue {
continue outerFor
}
}
var filled = -1
defer {
for index in (0..<(filled + 1)) {
inUse[Int(mappingPtr[index])] = false
}
}
// Check if we can map variable characters in
for (offset, mappingIndex) in variable {
let char = ptr[offset]
if mappingIndex > filled {
// New character we haven't seen before
if inUse[Int(char)] {
continue outerFor
}
inUse[Int(char)] = true
mappingPtr[mappingIndex] = char
filled = mappingIndex
}
else {
// Character that has already been assigned
if mappingPtr[mappingIndex] != char {
continue outerFor
}
}
}
// Fully mapped! Let's try to save
precondition(filled == uniqueVariable - 1)
results.insert(Array(mapping))
}
results = results.filter {
$0.enumerated().allSatisfy {
allowedVariable[$0.offset] == nil || allowedVariable[$0.offset]!.contains($0.element)
}
}
if results.isEmpty { return [:] }
var out = [Character: Set<UInt16>]()
for (character, offset) in used {
out[character] = Set(results.map { $0[offset] })
}
return out
}
struct MappingTracker {
public private(set) var possible: [Character: Set<UInt16>] = [:]
public private(set) var known: [Character: UInt16] = [:]
var updated = false
mutating func addMapping(from: Character, to: Set<UInt16>) {
var new = to
if let prevPossible = possible[from] {
new = prevPossible.intersection(to)
if new.count != prevPossible.count { updated = true }
}
else {
updated = true
}
if new.count == 1 {
possible[from] = nil
known[from] = new.first!
}
else {
possible[from] = new
}
}
}
do {
// Sort the sentences by the ones with the most repeated characters (since those are the most selective)
var currentSentences = sentences.sorted(by: { $0.count - Set($0).count > $1.count - Set($1).count })
var mappings = MappingTracker()
var full = true
while true {
mappings.updated = false
var done: Set<String> = []
for sentence in currentSentences {
let newMappings = words.withUnsafeBufferPointer {
getPossibleMappings(sentence, in: $0, knownMappings: mappings.known, possibleMappings: mappings.possible)
}
if newMappings.isEmpty && sentence.contains(where: { mappings.known[$0] == nil }) {
full = false
print("Failed to map \(sentence)")
done.insert(sentence)
}
for (character, possibles) in newMappings {
mappings.addMapping(from: character, to: possibles)
}
if !sentence.contains(where: { mappings.known[$0] == nil }) {
done.insert(sentence)
}
}
if !mappings.updated { break }
if !done.isEmpty { currentSentences = currentSentences.filter { !done.contains($0) } }
}
print(mappings)
for (from, to) in mappings.known.sorted(by: { $0.value < $1.value }) {
print("0x\(String(to, radix: 16).leftpad(to: 4, with: "0")): \(from)")
}
if !full {
print("Wasn't able to map all requested sentences!")
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment