Created
May 11, 2015 16:19
-
-
Save airspeedswift/2f4fede44f208ed45340 to your computer and use it in GitHub Desktop.
CSV-parsing code in Swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// after http://www.cocoawithlove.com/2009/11/writing-parser-using-nsscanner-csv.html | |
import Foundation | |
/// Optional-returning conveniences layered over NSScanner's
/// Bool-returning, out-parameter scanning methods.
extension NSScanner {

    /// Scans characters until one from `set` is encountered.
    ///
    /// Returns the scanned text, or nil when the underlying
    /// `scanUpToCharactersFromSet(_:intoString:)` call returns false.
    func scanUpToCharactersFromSet(set: NSCharacterSet) -> String? {
        var scanned: NSString?
        if self.scanUpToCharactersFromSet(set, intoString: &scanned) {
            return scanned as? String
        }
        return nil
    }

    /// Scans a run of characters that all belong to `set`.
    ///
    /// Returns the scanned text, or nil when the underlying
    /// `scanCharactersFromSet(_:intoString:)` call returns false.
    func scanCharactersFromSet(set: NSCharacterSet) -> String? {
        var scanned: NSString?
        if self.scanCharactersFromSet(set, intoString: &scanned) {
            return scanned as? String
        }
        return nil
    }

    /// Attempts to scan the literal string `match` at the current location.
    ///
    /// Returns `match` itself when the scan succeeds, otherwise nil.
    func scanString(match: String) -> String? {
        if self.scanString(match, intoString: nil) {
            return match
        }
        return nil
    }
}
/// Parses a string of newline-separated records whose fields are divided
/// by a separator string (for example CSV or tab-separated data) into
/// dictionaries keyed by field name.
///
/// Fields may be wrapped in double quotes; inside a quoted field the
/// separator and newlines are taken literally and `""` stands for one
/// double-quote character.
struct DelimitedDataParser {
    private var scanner_: NSScanner
    /// Characters that end a run of plain text: every newline character,
    /// the first character of the separator, and the double quote.
    private let endTextCharacterSet_: NSCharacterSet
    private let separator_: String
    private let separatorIsSingleChar_: Bool
    /// Field names used as record keys; grows when a record has more
    /// fields than there are known names (see `parseRecord`).
    private var fieldNames_: [String] = []

    /// Creates a delimited data parser to parse a supplied string of
    /// newline-separated records.
    ///
    /// Parameters:
    ///   data       - the string that will be parsed
    ///   separator  - the separator (normally "," or "\t"); must be non-empty
    ///                and must not contain a double quote or a newline character
    ///   hasHeader  - if true, treats the first row as a list of field names
    ///   fieldNames - list of field names; used when `hasHeader` is false or
    ///                when no header names could be parsed from the first row
    init(data: String, separator: String, hasHeader: Bool = false, fieldNames: [String] = []) {
        if let firstSeparator = first(separator) {
            let endTextCharacterSet = NSMutableCharacterSet.newlineCharacterSet()
            endTextCharacterSet.addCharactersInString("\(firstSeparator)\"")
            endTextCharacterSet_ = endTextCharacterSet
        }
        else {
            preconditionFailure("Separator cannot be empty")
        }

        precondition(!contains(separator, "\""), "Separator cannot contain double-quote")
        precondition(!contains(separator.utf16, NSCharacterSet.newlineCharacterSet().characterIsMember), "Separator cannot contain a new line character")

        separator_ = separator
        scanner_ = NSScanner(string: data)
        // Whitespace is significant in field data, so nothing is skipped.
        scanner_.charactersToBeSkipped = NSCharacterSet()
        scanner_.caseSensitive = true
        separatorIsSingleChar_ = dropFirst(separator).isEmpty

        // Go through the shared helper directly so that the choice between
        // the two `parseHeader()` overloads never depends on overload ranking.
        let headerNames: [String] = hasHeader ? collectHeaderNames_() : []
        if headerNames.isEmpty {
            fieldNames_ = fieldNames
        }
        else {
            fieldNames_ = headerNames
            // Step over the line break that ends the header row.
            parseLineSeparator()
        }
    }

    /// Parses all remaining records; equivalent to `parseFile()`.
    mutating func array() -> [[String:String]] {
        return parseFile()
    }

    /// Parses records until the input is exhausted or a record is not
    /// followed by a line break, returning one dictionary per record.
    mutating func parseFile() -> [[String:String]] {
        var records: [[String:String]] = []

        // `parseRecord()` reports a blank line as "no record", which would
        // end the loop before it starts if the input begins with blank
        // lines. Skip any leading run of newlines first.
        parseLineSeparator()

        while let record = parseRecord() {
            records.append(record)
            // A run of consecutive newlines is consumed in one go, so blank
            // lines between records are skipped here.
            if parseLineSeparator() == nil { break }
        }
        return records
    }

    /// Scans separator-delimited names from the current location.
    /// Shared by both `parseHeader()` overloads and by `init`.
    private mutating func collectHeaderNames_() -> [String] {
        var names: [String] = []
        while let name = parseName() {
            names.append(name)
            if parseSeparator() == nil { break }
        }
        return names
    }

    /// Parses a header row, returning nil when no names could be parsed.
    mutating func parseHeader() -> [String]? {
        let names = collectHeaderNames_()
        return names.isEmpty ? nil : names
    }

    /// Parses a header row, returning an empty array when no names could
    /// be parsed.
    mutating func parseHeader() -> [String] {
        return collectHeaderNames_()
    }

    /// Scans a single double-quote character.
    mutating func parseDoubleQuote() -> String? {
        return scanner_.scanString("\"")
    }

    /// Scans two consecutive double quotes (an escaped quote inside a
    /// quoted field).
    mutating func parseTwoDoubleQuotes() -> String? {
        return scanner_.scanString("\"\"")
    }

    /// Scans the field separator.
    mutating func parseSeparator() -> String? {
        return scanner_.scanString(separator_)
    }

    /// Scans a run of one or more newline characters.
    mutating func parseLineSeparator() -> String? {
        return scanner_.scanCharactersFromSet(.newlineCharacterSet())
    }

    /// Parses a header name; names follow the same grammar as fields.
    mutating func parseName() -> String? { return parseField() }

    /// Parses an unquoted field.
    mutating func parseNonEscaped() -> String? { return parseTextData() }

    /// Parses one field: a quoted field, an unquoted field, or an empty
    /// field. Returns nil when none of these can be parsed here.
    mutating func parseField() -> String? {
        if let escapedString = parseEscaped() {
            return escapedString
        }

        if let nonEscapedString = parseNonEscaped() {
            return nonEscapedString
        }

        //
        // Special case: if the current location is immediately
        // followed by a separator, a line break, or the end of input,
        // then the field is a valid, empty string. The probe must not
        // consume anything, so the location is restored afterwards.
        //
        let currentLocation = scanner_.scanLocation
        if parseSeparator() != nil || parseLineSeparator() != nil || scanner_.atEnd {
            scanner_.scanLocation = currentLocation
            return ""
        }
        return nil
    }

    /// Scans plain text up to the next separator, newline, or double
    /// quote. Returns nil when no text was scanned.
    mutating func parseTextData() -> String? {
        var accumulatedData = ""
        while true {
            if let fragment = scanner_.scanUpToCharactersFromSet(endTextCharacterSet_) {
                accumulatedData += fragment
            }

            // With a one-character separator, the end-of-text set already
            // identifies the separator exactly, so there is nothing to check.
            if separatorIsSingleChar_ {
                break
            }

            // Multi-character separator: the scan stopped at (possibly) its
            // first character. Decide whether the whole separator follows.
            let location = scanner_.scanLocation
            if let firstCharOfSeparator = first(separator_) {
                if scanner_.scanString(String(firstCharOfSeparator)) == nil {
                    // Stopped on a newline, a quote, or the end of input.
                    break
                }
                if scanner_.scanString(dropFirst(separator_)) != nil {
                    // A real separator: leave it unconsumed for the caller.
                    scanner_.scanLocation = location
                    break
                }
                // Only the first character matched, so it is ordinary text.
                accumulatedData.append(firstCharOfSeparator)
                continue
            }
            else {
                break
            }
        }
        return accumulatedData.isEmpty ? nil : accumulatedData
    }

    /// Parses a double-quoted field and returns its unescaped content.
    ///
    /// Returns nil, leaving the scan location where it was on entry,
    /// when the field does not start with a double quote or has no
    /// closing double quote.
    mutating func parseEscaped() -> String? {
        let startLocation = scanner_.scanLocation

        if parseDoubleQuote() == nil {
            return nil
        }

        var accumulatedData = ""
        while let fragment =
            parseTextData()
            ?? parseSeparator()
            ?? parseLineSeparator()
            ?? parseTwoDoubleQuotes().map({_ in "\""})
        {
            accumulatedData += fragment
        }

        if parseDoubleQuote() == nil {
            // Unterminated quoted field: rewind so the failed attempt does
            // not silently swallow the text that was scanned along the way.
            scanner_.scanLocation = startLocation
            return nil
        }

        return accumulatedData
    }

    /// Parses one record into a dictionary keyed by field name.
    ///
    /// Fields beyond the known field names are keyed "FIELD_<n>" (1-based
    /// position) and that generated name is remembered for later records.
    /// Returns nil at a blank line or at the end of input.
    mutating func parseRecord() -> [String:String]? {
        //
        // Special case: return nil if the line is blank. Without this special case,
        // it would parse as a single blank field.
        //
        if parseLineSeparator() != nil || scanner_.atEnd {
            return nil
        }

        var fieldCount = 0
        var record: [String:String] = [:]
        while let field = parseField() {
            let fieldName: String
            if fieldCount < fieldNames_.count {
                fieldName = fieldNames_[fieldCount]
            }
            else {
                fieldName = "FIELD_\(fieldCount + 1)"
                fieldNames_.append(fieldName)
            }
            record[fieldName] = field
            fieldCount++

            if parseSeparator() == nil {
                break
            }
        }
        return record
    }
}
// Command-line driver: reads the CSV file named by the single argument,
// parses it with DelimitedDataParser, logs how many records were read and
// how long parsing took, then prints one randomly chosen record.
var error: NSError?

if Process.argc != 2 {
    println("Usage: \(Process.arguments[0]) <pathToInputCSVFile>")
    exit(1)
}

let inputPath = Process.arguments[1]
let csvData: String
if let csvString = NSString(contentsOfFile: inputPath, encoding: NSUTF8StringEncoding, error: &error) {
    csvData = csvString as String
}
else {
    println("Couldn\'t read file at path \(inputPath)")
    if let err = error?.localizedDescription ?? error?.description { println("Error: \(err)") }
    exit(1)
}

let startDate = NSDate()

// Column names for the input file, in column order (no header row is read).
let fieldNames = [
    "postcode",
    "suburb",
    "state",
    "postOffice",
    "type",
    "latitude",
    "longitude",
]

var parser = DelimitedDataParser(data: csvData, separator: ",", fieldNames: fieldNames)
let records = parser.array()

let duration = NSDate().timeIntervalSinceDate(startDate)
NSLog("\(records.count) postcode entries successfully imported in \(duration) seconds.")

// arc4random_uniform(0) returns 0, so indexing without this check would
// crash with an out-of-range index whenever no records were parsed.
if records.isEmpty {
    println("No records were parsed; nothing to display.")
}
else {
    let record = records[Int(arc4random_uniform(UInt32(records.count)))]
    println("\n".join(map(record) { (key,value) in "\(key):\t\(value)" }))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment