Skip to content

Instantly share code, notes, and snippets.

@dilap
Last active August 29, 2015 14:01
Show Gist options
  • Save dilap/46ef747dbd642ef6e834 to your computer and use it in GitHub Desktop.
Save dilap/46ef747dbd642ef6e834 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"fmt"
"log"
"os"
"runtime"
)
// substitutingSplitTabsFunc returns a bufio.SplitFunc that yields one
// tab-delimited field per token and reports row boundaries through newRow.
//
// It does two jobs in a single pass (combined for speed):
//
//  1. It splits on tab-delimited fields. A field that starts with a double
//     quote is treated as quoted: it runs to the first quote that is directly
//     followed by a tab, a newline, or the end of input, and a doubled quote
//     ("") inside it stands for one literal quote.
//  2. Inside quoted fields it substitutes a space for every tab and newline,
//     so the token can be printed safely without quotes.
//
// After each token is produced, *newRow is true when that token was the last
// field of its row (it was terminated by a newline or by the end of input) and
// false when it was terminated by a tab. newRow must not be nil.
//
// Caveats: malformed input is not reported. An unclosed quote at end of input
// yields the rest of the data as the field, and a solitary quote inside a
// quoted field is kept literally. Quoted fields are rewritten in place inside
// the scanner's buffer.
func substitutingSplitTabsFunc(newRow *bool) bufio.SplitFunc {
	return func(data []byte, atEOF bool) (int, []byte, error) {
		if len(data) == 0 {
			// Nothing buffered. bufio.Scanner calls the split function with
			// empty data once the input is exhausted; returning a non-nil
			// token with zero advance there makes Scan emit empty fields
			// until it panics, so return no token at all.
			return 0, nil, nil
		}

		if data[0] != '"' { // easy case: unquoted field
			i := 0
			for i < len(data) && data[i] != '\t' && data[i] != '\n' {
				i++
			}
			if i == len(data) {
				if !atEOF {
					// No delimiter seen yet: request more data.
					return 0, nil, nil
				}
				// Final field of the input, with no trailing newline.
				*newRow = true
				return len(data), data, nil
			}
			*newRow = data[i] == '\n'
			return i + 1, data[:i], nil
		}

		// Hard case: quoted field. data[0] is the opening quote.
		hasEscapedQuotes, hasSeparators := false, false
		i := 1
		for i < len(data) {
			switch data[i] {
			case '\t', '\n':
				hasSeparators = true
				i++
			case '"':
				if i+1 == len(data) {
					if !atEOF {
						// Cannot classify this quote without the next byte.
						return 0, nil, nil
					}
					// Closing quote is the very last byte of the input.
					*newRow = true
					return len(data), cleanQuotedField(data[1:i], hasEscapedQuotes, hasSeparators), nil
				}
				switch data[i+1] {
				case '"':
					// Escaped quote: consume both bytes together so that the
					// second quote is never mistaken for a closing quote when
					// a tab or newline happens to follow it.
					hasEscapedQuotes = true
					i += 2
				case '\n':
					*newRow = true
					return i + 2, cleanQuotedField(data[1:i], hasEscapedQuotes, hasSeparators), nil
				case '\t':
					*newRow = false
					return i + 2, cleanQuotedField(data[1:i], hasEscapedQuotes, hasSeparators), nil
				default:
					// todo: error, solitary quote char; kept literally for now.
					i++
				}
			default:
				i++
			}
		}

		if !atEOF {
			// Closing quote not seen yet: request more data.
			return 0, nil, nil
		}
		// todo: error, unclosed quote; the rest of the input becomes the field.
		*newRow = true
		return len(data), cleanQuotedField(data[1:], hasEscapedQuotes, hasSeparators), nil
	}
}

// cleanQuotedField rewrites the contents of a quoted field in place and
// returns the (possibly shortened) slice. When unescape is true every doubled
// quote is collapsed to a single quote; when substitute is true every tab and
// newline is replaced by a space.
func cleanQuotedField(field []byte, unescape, substitute bool) []byte {
	if unescape {
		w := 0
		for r := 0; r < len(field); r++ {
			field[w] = field[r]
			w++
			if field[r] == '"' && r+1 < len(field) && field[r+1] == '"' {
				r++ // skip the second quote of the escaped pair
			}
		}
		field = field[:w]
	}
	if substitute {
		for i, c := range field {
			if c == '\t' || c == '\n' {
				field[i] = ' '
			}
		}
	}
	return field
}
// SplitTabsAndSub configures s to split its input into tab-delimited,
// possibly-quoted fields, one field per call to s.Scan().
//
// The returned pointer is updated on every scan: after s.Scan() returns a
// field, *newRow is true when that field is the last one in its row. Tabs and
// newlines inside quoted fields are replaced with spaces.
//
// The original author measured this approach as much faster than the standard
// csv package (as of Oct 2013).
func SplitTabsAndSub(s *bufio.Scanner) (newRow *bool) {
	var rowEnded bool
	s.Split(substitutingSplitTabsFunc(&rowEnded))
	return &rowEnded
}
// main reads the tab-separated file named by the first command-line argument
// and writes a simplified copy to stdout: every field is written unquoted,
// fields are separated by tabs and rows by newlines.
//
// It exits with status 64 when no file name is given, and with a logged error
// when the file cannot be opened, read, or the output cannot be written.
func main() {
	runtime.GOMAXPROCS(2)
	if len(os.Args) < 2 {
		// The usage text goes to stderr so it never ends up in a redirected
		// output file; Fprintf (not Println) is needed to expand %s.
		fmt.Fprintf(os.Stderr, `usage: %s FNAME > CLEANFNAME
Read tab-separated file FNAME and write a simplified version to
stdout. Simplified => tabs and newlines are replaced with spaces, no quotes
are used
`, os.Args[0])
		os.Exit(64) // EX_USAGE
	}
	f, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	newRow := SplitTabsAndSub(scanner)
	w := bufio.NewWriter(os.Stdout)
	for scanner.Scan() {
		// Write errors on a bufio.Writer are sticky and are reported by the
		// Flush below, so they are not checked individually here.
		w.Write(scanner.Bytes())
		if *newRow {
			w.WriteByte('\n')
		} else {
			w.WriteByte('\t')
		}
	}
	// Flush first so everything parsed so far is emitted even if scanning
	// stopped early.
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	// Scan returns false on read errors and over-long fields as well as on
	// EOF; without this check the output would be silently truncated.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment