Skip to content

Instantly share code, notes, and snippets.

@skitazaki
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save skitazaki/c8245af0dc57ea77e39a to your computer and use it in GitHub Desktop.
Save skitazaki/c8245af0dc57ea77e39a to your computer and use it in GitHub Desktop.
Read CSV file and report blank counts of each field.
package main
import (
"bufio"
"encoding/csv"
"fmt"
log "github.com/Sirupsen/logrus"
"github.com/docopt/docopt-go"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/transform"
"io"
"os"
"unicode/utf8"
)
/**
* https://github.com/thbar/golang-playground/blob/master/csv-parsing.go
*/
// main delegates to run so that deferred cleanup (closing the input file)
// executes before the process exits with run's status code.
func main() {
	os.Exit(run())
}

// run reads a CSV stream from the file named on the command line (or from
// standard input when no file is given), counts empty fields per column and
// prints a summary to standard output.
//
// It returns the process exit status: 0 on success, 1 when the arguments are
// invalid, the input cannot be opened or read, or too many lines fail to
// parse.
func run() int {
	// Arguments common to all utilities — csvkit 1.0.0 documentation
	// http://csvkit.readthedocs.org/en/latest/common_arguments.html
	//
	// docopt requires continuation lines of the usage pattern and every
	// option line to be indented, and at least two spaces between an option
	// spec and its description.
	usage := `usage: csv-blank-counter [--version] [--help]
                         [--no-header-row] [--delimiter=<delimiter>|--tabs]
                         [--encoding=<encoding>] [--strict]
                         [<file>]

Options:
  -h --help  Show this screen.
  --version  Show version.
  -d DELIMITER, --delimiter DELIMITER
             Delimiting character of the input CSV file.
  -t, --tabs  Specifies that the input CSV file is delimited with tabs.
              Overrides "-d".
  -e ENCODING, --encoding=ENCODING  Input file encoding.
  --strict   Strictly check fields number.
  -H, --no-header-row  Specifies that the input CSV file has no header row.
`
	// Parse exits by itself on bad user input, --help and --version; a
	// returned error means the usage text itself could not be interpreted,
	// in which case args is nil and must not be indexed.
	args, err := docopt.Parse(usage, nil, true, "csv-blank-counter 0.2.0", false)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		return 1
	}

	var (
		buffer    *bufio.Reader
		logFields log.Fields
	)
	if fname, ok := args["<file>"].(string); ok {
		fp, err := os.Open(fname)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error:", err)
			return 1
		}
		defer fp.Close()
		// Opening a directory succeeds on some platforms but every read
		// fails afterwards, so reject it up front.
		info, err := fp.Stat()
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error:", err)
			return 1
		}
		if info.IsDir() {
			fmt.Fprintln(os.Stderr, "Error:", fname, "is a directory")
			return 1
		}
		buffer = bufio.NewReader(fp)
		logFields = log.Fields{"file": fname}
	} else {
		buffer = bufio.NewReader(os.Stdin)
	}
	logger := log.WithFields(logFields)

	// Optionally decode the input before handing it to the CSV parser.
	var input io.Reader = buffer
	if encoding, ok := args["--encoding"].(string); ok {
		switch encoding {
		case "sjis":
			logger.Info("Use ShiftJIS decoder")
			input = transform.NewReader(buffer, japanese.ShiftJIS.NewDecoder())
		default:
			logger.Warn("Unknown encoding: ", encoding)
		}
	}
	reader := csv.NewReader(input)

	if args["--tabs"].(bool) {
		reader.Comma = '\t'
	} else if delimiter, ok := args["--delimiter"].(string); ok {
		comma, size := utf8.DecodeRuneInString(delimiter)
		// DecodeRuneInString reports RuneError with size 0 for an empty
		// string and size 1 for an invalid UTF-8 sequence.
		if comma == utf8.RuneError && size <= 1 {
			logger.Errorf("Invalid delimiter: %q", delimiter)
			return 1
		}
		if size < len(delimiter) {
			logger.Warnf("Delimiter %q has more than one character, using %q", delimiter, comma)
		}
		reader.Comma = comma
	}
	reader.Comment = '#'
	if args["--strict"].(bool) {
		// 0 makes the reader require every record to have as many fields
		// as the first one.
		reader.FieldsPerRecord = 0
	} else {
		// Negative disables the field count check.
		reader.FieldsPerRecord = -1
	}

	header := make(map[int]string)
	if !args["--no-header-row"].(bool) {
		// Use first line as header name if flag is not specified.
		record, err := reader.Read()
		if err == io.EOF {
			return 0
		}
		if err != nil {
			logger.Error(err)
			return 1
		}
		for i, name := range record {
			header[i] = name
		}
		logger.Info("Start parsing with ", len(header), " columns")
	} else {
		logger.Info("Start parsing without header row")
	}

	// maxErrors is the number of unparsable lines tolerated before giving up.
	const maxErrors = 10
	lineCount := 0
	errCount := 0
	nullColumn := make(map[int]int)
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			logger.Errorf("%v, Line %d", err, lineCount)
			lineCount++
			errCount++
			if errCount > maxErrors {
				logger.Error("Too many error lines, exiting...")
				return 1
			}
			continue
		}
		nullCount := 0
		for i, field := range record {
			// Columns beyond the header row (or all of them when there is
			// no header) get a generated name.
			if _, ok := header[i]; !ok {
				header[i] = fmt.Sprintf("Column%03d", i+1)
			}
			if len(field) == 0 {
				nullCount++
				nullColumn[i]++
			}
		}
		if nullCount > 0 {
			logger.Debugf("Line %d has %d fields with %d NULL.", lineCount, len(record), nullCount)
		}
		lineCount++
		if lineCount%1000000 == 0 {
			logger.Info("==> Processed ", lineCount, " lines <==")
		}
	}
	logger.Info("End parsing ", lineCount, " lines with ", len(header), " columns")

	// Lines that failed to parse were never inspected for blanks, so the
	// ratio is taken over successfully parsed records only.
	parsed := lineCount - errCount
	fmt.Println("## Show the summary")
	fmt.Println("#Lines", lineCount, "with", errCount, "errors")
	fmt.Println("#Columns", len(header))
	// header keys are contiguous from 0 because every index of every record
	// is registered above.
	for i := 0; i < len(header); i++ {
		if cnt, ok := nullColumn[i]; ok {
			// cnt > 0 implies parsed > 0, so the division is safe.
			fmt.Printf(" [%3d]\"%s\" has %d blank(s). (%.2f%%)\n",
				i+1, header[i], cnt, float64(cnt)/float64(parsed)*100)
		} else {
			fmt.Printf(" [%3d]\"%s\" are all filled.\n", i+1, header[i])
		}
	}
	return 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment