Created
December 8, 2019 21:32
-
-
Save douglashill/0e693ccc94c292171cfe48770d23a4a0 to your computer and use it in GitHub Desktop.
Tries really hard to read a string from a file. Brute forces encodings if necessary.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Douglas Hill, December 2019
// Made for https://douglashill.co/reading-app/
import Foundation | |
/// Tries really hard to read a string from a file.
///
/// Attempts are made in this order, stopping at the first one that succeeds:
/// 1. `suggestedEncoding`, when one is given.
/// 2. Foundation's encoding inference (`String(contentsOf:usedEncoding:)`).
/// 3. Brute force over a fixed list of encodings, skipping `suggestedEncoding`.
///
/// Each attempt reads from `fileURL` again. This will only fail if the file can't be interpreted
/// in any encoding, or if some other error occurs like not being able to read from the file.
///
/// - Parameters:
///   - fileURL: The URL of the file to read from.
///   - suggestedEncoding: The expected encoding of the file, or nil if no expectation can be given.
///
/// - Returns: The string read from the file, and the encoding that was used to interpret the file.
///
/// - Throws: The error from the inference step (`fileReadUnknownStringEncoding`) when every
///   brute-force candidate was also rejected. Any error other than "this encoding does not fit"
///   is rethrown immediately from whichever step hit it.
func readStringFromFileAtURL(_ fileURL: URL, suggestedEncoding: String.Encoding?) throws -> (String, String.Encoding) {
    // First try the suggested encoding. A nil result means the suggestion was wrong, so keep trying.
    if let suggested = suggestedEncoding,
       let contents = try contentsOfFile(at: fileURL, decodedAs: suggested) {
        return (contents, suggested)
    }

    // Try the inference API.
    // Inference seems to fail for ShiftJIS encoded text. For example from https://www.japan-guide.com/e/e2262.html
    let inferenceError: Error
    do {
        // The initial value is only a placeholder; the initialiser overwrites it on success.
        var inferred = String.Encoding.windowsCP1252
        let contents = try String(contentsOf: fileURL, usedEncoding: &inferred)
        return (contents, inferred)
    } catch {
        // Bail on anything other than "could not determine the encoding".
        guard case CocoaError.fileReadUnknownStringEncoding = error else {
            throw error
        }
        // Keep trying, but save this error because it will make more sense to the caller
        // than an error from brute forcing if brute forcing fails.
        inferenceError = error
    }

    // Brute force with all possible encodings.
    // This array is roughly sorted by likelihood based on https://w3techs.com/technologies/overview/character_encoding
    // NOTE(review): `.isoLatin1` presumably accepts any byte sequence, which would make the
    // candidates listed after it unreachable. Confirm before relying on the later entries.
    let candidates: [String.Encoding] = [
        .utf8,
        .isoLatin1,
        .windowsCP1251,
        .windowsCP1252,
        .shiftJIS,
        .japaneseEUC,
        .isoLatin2,
        .windowsCP1250,
        .windowsCP1254,
        // ASCII may seem redundant because it is a subset of UTF-8. However sometimes Foundation
        // is able to read data as ASCII but not as UTF-8, so it is still worth trying.
        // Example: https://signalvnoise.com/archives2/bezos_expeditions_invests_in_37signals.php
        .ascii,
        .nonLossyASCII,
        .windowsCP1253,
        .iso2022JP,
        .symbol,
        .nextstep,
        .macOSRoman,
        .unicode,
        .utf16BigEndian,
        .utf16LittleEndian,
        .utf32,
        .utf32BigEndian,
        .utf32LittleEndian,
    ]

    // The suggested encoding has already been tried above, so skip it here.
    for candidate in candidates where candidate != suggestedEncoding {
        if let contents = try contentsOfFile(at: fileURL, decodedAs: candidate) {
            return (contents, candidate)
        }
    }

    // The inference error (fileReadUnknownStringEncoding) will make more sense than the one from
    // brute forcing (fileReadInapplicableStringEncoding). The error truly is that the text
    // encoding can't be determined, even after trying really hard.
    throw inferenceError
}

/// Reads the file at `fileURL` as text in a single, specific encoding.
///
/// - Returns: The decoded string, or nil when Foundation reports that `encoding` does not apply
///   to the file (`fileReadInapplicableStringEncoding`).
/// - Throws: Any other error, for example a failure to read from the file.
private func contentsOfFile(at fileURL: URL, decodedAs encoding: String.Encoding) throws -> String? {
    do {
        return try String(contentsOf: fileURL, encoding: encoding)
    } catch CocoaError.fileReadInapplicableStringEncoding {
        // Wrong encoding for this file; the caller decides what to try next.
        return nil
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment