Skip to content

Instantly share code, notes, and snippets.

@airspeedswift
Created May 11, 2015 16:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save airspeedswift/2f4fede44f208ed45340 to your computer and use it in GitHub Desktop.
Save airspeedswift/2f4fede44f208ed45340 to your computer and use it in GitHub Desktop.
CSV-parsing code in Swift
// after http://www.cocoawithlove.com/2009/11/writing-parser-using-nsscanner-csv.html
import Foundation
/// Optional-returning conveniences over NSScanner's `intoString:` scanning
/// methods: each returns the scanned text on success and nil on failure.
extension NSScanner {
    /// Scans up to (not including) the first character that is a member of
    /// `set`. Returns nil when the underlying scan reports failure.
    func scanUpToCharactersFromSet(set: NSCharacterSet) -> String? {
        var scanned: NSString?
        if scanUpToCharactersFromSet(set, intoString: &scanned) {
            return scanned as? String
        }
        return nil
    }

    /// Scans a run of characters that are all members of `set`.
    /// Returns nil when the underlying scan reports failure.
    func scanCharactersFromSet(set: NSCharacterSet) -> String? {
        var scanned: NSString?
        if scanCharactersFromSet(set, intoString: &scanned) {
            return scanned as? String
        }
        return nil
    }

    /// Scans the literal string `match` at the current location.
    /// Returns `match` itself on success, nil otherwise.
    func scanString(match: String) -> String? {
        if scanString(match, intoString: nil) {
            return match
        }
        return nil
    }
}
/// Parses a string of newline-separated records whose fields are divided by
/// a separator string (e.g. CSV or TSV data).
///
/// Each record is returned as a dictionary from field name to field value.
/// Fields may be wrapped in double quotes; inside quotes, separators and
/// newlines are taken literally and a doubled double-quote (`""`) stands for
/// a single double-quote character.
struct DelimitedDataParser {
    /// Scanner over the input; configured in `init` to skip no characters.
    private var scanner_: NSScanner
    /// Characters that end a run of plain field text: any newline character,
    /// the double-quote, and the first character of the separator.
    private let endTextCharacterSet_: NSCharacterSet
    private let separator_: String
    /// True when the separator is exactly one character long, in which case
    /// hitting `endTextCharacterSet_` always ends the current text run.
    private let separatorIsSingleChar_: Bool
    /// Known field names, in column order. Grows when a record has more
    /// fields than there are names (see `parseRecord`).
    private var fieldNames_: [String] = []

    /// Creates a delimited data parser to parse a supplied string of
    /// newline-separated records.
    ///
    /// Parameters:
    /// data - the string that will be parsed
    /// separator - the separator (normally "," or "\t"); must be non-empty
    ///     and must not contain a double-quote or a newline character
    /// hasHeader - if true, treats the first row as a list of field names
    /// fieldNames - list of field names; used when `hasHeader` is false, or
    ///     when `hasHeader` is true but no header names could be parsed
    init(data: String, separator: String, hasHeader: Bool = false, fieldNames: [String] = []) {
        if let firstSeparator = first(separator) {
            let endTextCharacterSet = NSMutableCharacterSet.newlineCharacterSet()
            endTextCharacterSet.addCharactersInString("\(firstSeparator)\"")
            endTextCharacterSet_ = endTextCharacterSet
        }
        else {
            preconditionFailure("Separator cannot be empty")
        }
        precondition(!contains(separator, "\""), "Separator cannot contain double-quote")
        precondition(!contains(separator.utf16, NSCharacterSet.newlineCharacterSet().characterIsMember ), "Separator cannot contain a new line character")
        separator_ = separator
        scanner_ = NSScanner(string: data)
        // Whitespace is significant in field data, so skip nothing.
        scanner_.charactersToBeSkipped = NSCharacterSet()
        scanner_.caseSensitive = true
        separatorIsSingleChar_ = dropFirst(separator).isEmpty

        var headerNames: [String] = []
        if hasHeader {
            // Leading blank lines carry no data; skip them so they are not
            // mistaken for a header consisting of a single empty name.
            parseLineSeparator()
            headerNames = parseNameList()
        }
        if headerNames.isEmpty {
            fieldNames_ = fieldNames
        }
        else {
            fieldNames_ = headerNames
            // Consume the line break that ends the header row.
            parseLineSeparator()
        }
    }

    /// Parses all remaining records; equivalent to `parseFile()`.
    mutating func array() -> [[String:String]] {
        return parseFile()
    }

    /// Parses records from the current scan location until no further
    /// record or line separator can be read, and returns them in order.
    mutating func parseFile() -> [[String:String]] {
        // Skip leading blank lines. Without this, `parseRecord` would see a
        // blank first line, return nil, and the whole input would be dropped.
        parseLineSeparator()
        var records: [[String:String]] = []
        while let record = parseRecord() {
            records.append(record)
            if parseLineSeparator() == nil { break }
        }
        return records
    }

    /// Parses a row of separator-delimited names.
    /// Returns nil when no name could be parsed.
    mutating func parseHeader() -> [String]? {
        let names = parseNameList()
        return names.isEmpty ? nil : names
    }

    /// Parses a row of separator-delimited names.
    /// Returns an empty array when no name could be parsed.
    mutating func parseHeader() -> [String] {
        return parseNameList()
    }

    /// Shared implementation of both `parseHeader` overloads: collects names
    /// until a name fails to parse or is not followed by a separator.
    private mutating func parseNameList() -> [String] {
        var names: [String] = []
        while let name = parseName() {
            names.append(name)
            if parseSeparator() == nil { break }
        }
        return names
    }

    /// Scans a single double-quote character.
    mutating func parseDoubleQuote() -> String? {
        return scanner_.scanString("\"")
    }

    /// Scans two consecutive double-quote characters (an escaped quote).
    mutating func parseTwoDoubleQuotes() -> String? {
        return scanner_.scanString("\"\"")
    }

    /// Scans the field separator.
    mutating func parseSeparator() -> String? {
        return scanner_.scanString(separator_)
    }

    /// Scans a run of one or more newline characters.
    mutating func parseLineSeparator() -> String? {
        return scanner_.scanCharactersFromSet(.newlineCharacterSet())
    }

    /// A header name is parsed exactly like a field.
    mutating func parseName() -> String? { return parseField() }

    /// An unquoted field is a run of plain text data.
    mutating func parseNonEscaped() -> String? { return parseTextData() }

    /// Parses one field: a quoted field if possible, otherwise unquoted
    /// text, otherwise an empty field. Returns nil if none of these match.
    mutating func parseField() -> String? {
        if let escapedString = parseEscaped() {
            return escapedString
        }
        if let nonEscapedString = parseNonEscaped() {
            return nonEscapedString
        }
        //
        // Special case: if the current location is immediately followed by a
        // separator, a line break, or the end of input, then the field is a
        // valid, empty string. The probe must not consume anything, so the
        // scan location is restored afterwards.
        //
        let currentLocation = scanner_.scanLocation
        if parseSeparator() != nil || parseLineSeparator() != nil || scanner_.atEnd {
            scanner_.scanLocation = currentLocation
            return ""
        }
        return nil
    }

    /// Scans plain text up to the next newline, double-quote, or separator.
    /// Returns nil if no text was scanned.
    mutating func parseTextData() -> String? {
        var accumulatedData = ""
        while true {
            if let fragment = scanner_.scanUpToCharactersFromSet(endTextCharacterSet_) {
                accumulatedData += fragment
            }
            if separatorIsSingleChar_ {
                break
            }
            // With a multi-character separator, the scan above stops at the
            // separator's first character even when the rest of the separator
            // does not follow. In that case the character is ordinary data.
            let location = scanner_.scanLocation
            if let firstCharOfSeparator = first(separator_) {
                if scanner_.scanString(String(firstCharOfSeparator)) == nil {
                    // Stopped at a newline, a double-quote, or end of input.
                    break
                }
                if scanner_.scanString(dropFirst(separator_)) != nil {
                    // A complete separator: leave it unconsumed for the caller.
                    scanner_.scanLocation = location
                    break
                }
                accumulatedData.append(firstCharOfSeparator)
            }
            else {
                break
            }
        }
        return accumulatedData.isEmpty ? nil : accumulatedData
    }

    /// Parses a field wrapped in double quotes and returns its contents.
    /// Inside the quotes, separators and newlines are literal data and `""`
    /// yields one double-quote. Returns nil if there is no opening quote or
    /// no closing quote; in the latter case the consumed input is not restored.
    mutating func parseEscaped() -> String? {
        if parseDoubleQuote() == nil {
            return nil
        }
        var accumulatedData = ""
        while let fragment =
            parseTextData()
            ?? parseSeparator()
            ?? parseLineSeparator()
            ?? parseTwoDoubleQuotes().map({_ in "\""})
        {
            accumulatedData += fragment
        }
        if parseDoubleQuote() == nil {
            return nil
        }
        return accumulatedData
    }

    /// Parses one record (one line of fields) into a dictionary keyed by
    /// field name. Fields beyond the known names are named "FIELD_<n>"
    /// (1-based) and that name is remembered for later records.
    /// Returns nil at a blank line (which is consumed) or at end of input.
    mutating func parseRecord() -> [String:String]? {
        //
        // Special case: return nil if the line is blank. Without this special case,
        // it would parse as a single blank field.
        //
        if parseLineSeparator() != nil || scanner_.atEnd {
            return nil
        }
        var fieldCount = 0
        var record: [String:String] = [:]
        while let field = parseField() {
            let fieldName: String
            if fieldCount < fieldNames_.count {
                fieldName = fieldNames_[fieldCount]
            }
            else {
                fieldName = "FIELD_\(fieldCount + 1)"
                fieldNames_.append(fieldName)
            }
            record[fieldName] = field
            fieldCount += 1
            if parseSeparator() == nil {
                break
            }
        }
        return record
    }
}
// Command-line driver: reads the CSV file named by the single argument,
// parses it with the postcode field names below, logs how many records were
// imported and how long that took, then prints one randomly chosen record.
var error: NSError?
if Process.argc != 2 {
    println("Usage: \(Process.arguments[0]) <pathToInputCSVFile>")
    exit(1)
}
let inputPath = Process.arguments[1]
let csvData: String
if let csvString = NSString(contentsOfFile: inputPath, encoding: NSUTF8StringEncoding, error: &error) {
    csvData = csvString as String
}
else {
    println("Couldn\'t read file at path \(inputPath)")
    if let err = error?.localizedDescription ?? error?.description { println("Error: \(err)") }
    exit(1)
}
let startDate = NSDate()
let fieldNames = [
    "postcode",
    "suburb",
    "state",
    "postOffice",
    "type",
    "latitude",
    "longitude",
]
var parser = DelimitedDataParser(data: csvData, separator: ",", fieldNames: fieldNames)
let records = parser.array()
let duration = NSDate().timeIntervalSinceDate(startDate)
NSLog("\(records.count) postcode entries successfully imported in \(duration) seconds.")
// Only print a sample when there is something to sample: with no records,
// the random index would be 0 and subscripting the empty array would trap.
if !records.isEmpty {
    let record = records[Int(arc4random_uniform(UInt32(records.count)))]
    println("\n".join(map(record) { (key,value) in "\(key):\t\(value)" }))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment