Created
April 5, 2017 12:09
-
-
Save cathalgarvey/ab2449fbe3a8b134f127a97b0d74dd50 to your computer and use it in GitHub Desktop.
How to Read Lines from GZIP-Compressed Files in Go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"compress/gzip" | |
"os" | |
"bufio" | |
"fmt" | |
"log" | |
) | |
// GZLines streams the lines of the gzip-compressed file at filename.
//
// It returns a channel of lines (line terminators stripped), a channel that
// carries at most one error encountered while scanning or decompressing, and
// an error for failures that occur before streaming starts (opening the file
// or reading the gzip header). When that last error is non-nil both channels
// are nil.
//
// Both channels are closed once the input is exhausted or scanning has
// failed; the gzip reader and the underlying file are closed at the same
// time. The error channel is buffered, so a caller may drain the lines
// channel first and inspect the error channel afterwards.
//
// Every line is delivered in its own slice, so a receiver may keep it after
// the next line arrives. A line longer than 50*bufio.MaxScanTokenSize bytes
// stops the scan and is reported as bufio.ErrTooLong.
//
// The caller must drain the lines channel; otherwise the producing goroutine
// stays blocked on its send and the file stays open.
func GZLines(filename string) (chan []byte, chan error, error) {
	rawf, err := os.Open(filename)
	if err != nil {
		return nil, nil, err
	}
	rawContents, err := gzip.NewReader(rawf)
	if err != nil {
		// The header was unreadable; release the descriptor before bailing
		// out. The header error is the one worth reporting.
		_ = rawf.Close()
		return nil, nil, err
	}

	contents := bufio.NewScanner(rawContents)
	// The scanner's default token limit is bufio.MaxScanTokenSize. Raise it so
	// that long lines are delivered instead of ending the scan with
	// bufio.ErrTooLong.
	cbuffer := make([]byte, 0, bufio.MaxScanTokenSize)
	contents.Buffer(cbuffer, bufio.MaxScanTokenSize*50)

	ch := make(chan []byte)
	// Capacity 1: the single possible error can be stored without a waiting
	// receiver, so the goroutine never blocks on it and always reaches the
	// cleanup below.
	errs := make(chan error, 1)

	go func() {
		defer func() {
			// Both handles are read-only, so a Close failure cannot lose data,
			// and read errors have already surfaced through the scanner.
			_ = rawContents.Close()
			_ = rawf.Close()
			close(ch)
			close(errs)
		}()

		for contents.Scan() {
			// Scanner.Bytes aliases the scanner's internal buffer, which later
			// Scan calls overwrite. Hand the receiver a private copy.
			token := contents.Bytes()
			line := make([]byte, len(token))
			copy(line, token)
			ch <- line
		}
		if err := contents.Err(); err != nil {
			errs <- err
		}
	}()

	return ch, errs, nil
}
func main() { | |
fmt.Printf("Called on: %+v\n", os.Args) | |
lines, errors, err := GZLines(os.Args[1]) | |
if err != nil { | |
log.Fatal(err) | |
} | |
go func(errs chan error) { | |
err := <- errs | |
log.Fatal(err) | |
}(errors) | |
for foo := range lines { | |
fmt.Printf("%+v\n", string(foo)) | |
} | |
} |
I can't find input data for which the explicit buffer allocation seems to make a difference, unless you're hitting input data with lines over 64k. But it shouldn't matter whether you're doing this to gzipped or uncompressed data.
I made a new version with the suggested changes:
https://gist.github.com/lovasoa/38a207ecdefa1d60225403a644800818
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
My review: