Skip to content

Instantly share code, notes, and snippets.

@awmv
Created April 21, 2021 08:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save awmv/f5fee45b874f31870dd7ca3f948862a0 to your computer and use it in GitHub Desktop.
Save awmv/f5fee45b874f31870dd7ca3f948862a0 to your computer and use it in GitHub Desktop.
Convert CSV to XLSX
package main
import (
"bufio"
"encoding/csv"
"errors"
"flag"
"fmt"
"io"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/tealeg/xlsx"
)
// listEncoders is a helper function to display the list
// of supported encodings on standard output
func listEncoders() {
names := make([]string, 0, len(encoders))
for name := range encoders {
names = append(names, name)
}
sort.Strings(names) //sort by key
for enc := range names {
fmt.Println(names[enc])
}
}
// ParseFloat ist an advanced ParseFloat for golang, support scientific notation, comma separated number
// from yyscamper at https://gist.github.com/yyscamper/5657c360fadd6701580f3c0bcca9f63a
func ParseFloat(str string) (float64, error) {
val, err := strconv.ParseFloat(str, 64)
if err == nil {
return val, nil
}
//Some number may be seperated by comma, for example, 23,120,123, so remove the comma firstly
str = strings.Replace(str, ",", "", -1)
//Some number is specifed in scientific notation
pos := strings.IndexAny(str, "eE")
if pos < 0 {
return strconv.ParseFloat(str, 64)
}
var baseVal float64
var expVal int64
baseStr := str[0:pos]
baseVal, err = strconv.ParseFloat(baseStr, 64)
if err != nil {
return 0, err
}
expStr := str[(pos + 1):]
expVal, err = strconv.ParseInt(expStr, 10, 64)
if err != nil {
return 0, err
}
return baseVal * math.Pow10(int(expVal)), nil
}
// parseCommaGroup parses a single comma group (x or x-y),
// optionally followed by :datatype (used only for columns right now)
// It returns a map with row or column index as key and the datatype as value
func parseCommaGroup(grpstr string) (map[int]string, error) {
var err error
var startVal int
var endVal int
result := make(map[int]string)
// we need exactly one number or an a-b interval (2 number parts)
parts := strings.Split(grpstr, "-")
if len(parts) < 1 || len(parts) > 2 {
return nil, fmt.Errorf("Invalid range group '%s' found.", grpstr)
}
// check for type (currently needed only for columns, will be ignored for lines)
datatype := "standard"
// last item may have type spec
if strings.Index(parts[len(parts)-1], ":") >= 0 {
datatype = strings.Split(parts[len(parts)-1], ":")[1]
parts[len(parts)-1] = strings.Split(parts[len(parts)-1], ":")[0]
}
// first number
startVal, err = strconv.Atoi(parts[0])
if err == nil {
result[startVal] = datatype
}
// interval?
if len(parts) == 2 {
endVal, err = strconv.Atoi(parts[1])
if err == nil {
for i := startVal + 1; i <= endVal; i++ {
result[i] = datatype
}
}
}
return result, err
}
// parseRangeString parses a comma-separated list of range groups.
// It returns a map with row or column index as key and the datatype as value
// As the data source has to be valid, this functions exits the program on parse errors
func parseRangeString(rangeStr string) map[int]string {
result := make(map[int]string)
for _, part := range strings.Split(rangeStr, ",") {
indexlist, err := parseCommaGroup(part)
if err != nil {
fmt.Println("Invalid range, exiting.")
os.Exit(INVALID_RANGE)
}
for key, val := range indexlist {
result[key] = val
}
}
return result
}
// ParseCommandLine defines and parses command line flags and checks for usage info flags.
// The function exits the program, if the input file does not exist
func parseCommandLine() {
var headerString = ""
cmdlineFlags := flag.NewFlagSet(os.Args[0], flag.ContinueOnError)
cmdlineFlags.StringVar(&parmInFile, "infile", "", "full pathname of input file (CSV file)")
cmdlineFlags.StringVar(&parmOutFile, "outfile", "", "full pathname of output file (.xlsx file)")
cmdlineFlags.StringVar(&parmFileMask, "filemask", "", "file mask for bulk processing (overwrites -infile/-outfile)")
cmdlineFlags.StringVar(&parmOutDir, "outdir", "", "target directory for the .xlsx file (not to be used with outfile)")
cmdlineFlags.StringVar(&parmDateFormat, "dateformat", "2006-01-02", "format for CSV date cells (default YYYY-MM-DD)")
cmdlineFlags.StringVar(&parmExcelDateFormat, "exceldateformat", "", "Excel format for date cells (default as in Excel)")
cmdlineFlags.StringVar(&parmCols, "columns", "", "column range to use (see below)")
cmdlineFlags.StringVar(&parmRows, "rows", "", "list of line numbers to use (1,2,8 or 1,3-14,28)")
cmdlineFlags.StringVar(&parmSheet, "sheet", "fromCSV", "tab name of the Excel sheet")
cmdlineFlags.StringVar(&tmpStr, "colsep", "|", "column separator (default '|') ")
cmdlineFlags.StringVar(&parmEncoding, "encoding", "utf-8", "character encoding")
cmdlineFlags.StringVar(&parmFontName, "fontname", "Arial", "set the font name to use")
cmdlineFlags.StringVar(&headerString, "headerlabels", "", "comma-separated list of header labels (enclose in quotes to be safe)")
cmdlineFlags.Float64Var(&parmFontSize, "fontsize", 12, "set the default font size to use")
cmdlineFlags.IntVar(&parmHeaderLines, "headerlines", 1, "set the number of header lines (use 0 for no header)")
cmdlineFlags.BoolVar(&parmNoHeader, "noheader", false, "DEPRECATED (use headerlines) no header, only data lines")
cmdlineFlags.BoolVar(&parmAbortOnError, "abortonerror", false, "abort program on first invalid cell data type")
cmdlineFlags.BoolVar(&parmSilent, "silent", false, "do not display progress messages")
cmdlineFlags.BoolVar(&parmAutoFormula, "autoformula", false, "automatically format string starting with = as formulae")
cmdlineFlags.BoolVar(&parmHelp, "help", false, "display usage information")
cmdlineFlags.BoolVar(&parmHelp, "h", false, "display usage information")
cmdlineFlags.BoolVar(&parmHelp, "?", false, "display usage information")
cmdlineFlags.BoolVar(&parmShowVersion, "version", false, "display version information")
cmdlineFlags.BoolVar(&parmIgnoreEmpty, "ignoreempty", true, "do not display warnings for empty cells")
cmdlineFlags.BoolVar(&parmOverwrite, "overwrite", false, "overwrite existing output file (default false)")
cmdlineFlags.BoolVar(&parmAppendToSheet, "append", false, "append data rows to specified sheet instead of overwriting sheet")
cmdlineFlags.BoolVar(&parmListEncoders, "listencodings", false, "display a list of supported encodings and exit")
cmdlineFlags.IntVar(&parmStartRow, "startrow", 1, "start at row N in CSV file (this value is 1-based!)")
cmdlineFlags.StringVar(&parmNaNValue, "nanvalue", "", "value to be used for failed number conversions or missing numbers")
cmdlineFlags.BoolVar(&parmDebug, "debug", false, "display debug messages")
err := cmdlineFlags.Parse(os.Args[1:])
if err != nil || !cmdlineFlags.Parsed() {
fmt.Println("")
fmt.Println("Could not parse the command line, invalid option or value.")
fmt.Println("Error:", err)
fmt.Println("")
os.Exit(INVALID_ARGUMENTS)
}
if parmHelp {
fmt.Printf("You are running version %s of %s\n\n", versionInfo, os.Args[0])
cmdlineFlags.Usage()
fmt.Println(`
Column ranges are a comma-separated list of numbers (e.g. 1,4,8,16), intervals (e.g. 0-4,18-32) or a combination.
Each comma group can have type specifier for the columns, separated with a colon (e.g. 0:text,3-16:number,17:date)
Type is one of: text|number|integer|currency|date|standard|percent|formula|format
Type "format" may be used together with a format string: format="FMTSTR", e.g. 2:format="0000.0"
`)
os.Exit(SHOW_USAGE)
}
t, err := strconv.Unquote(`"` + tmpStr + `"`)
if err != nil {
fmt.Println("Invalid column separator specified, exiting.")
os.Exit(INVALID_COLSEP)
}
parmColSep, _ = utf8.DecodeRuneInString(t)
if parmShowVersion {
fmt.Println("Version", versionInfo)
os.Exit(SHOW_USAGE)
}
r := strings.NewReplacer("YYYY", "2006",
"MM", "01",
"DD", "02",
"HH", "15",
"MI", "04",
"SS", "05",
"ZN", "-7000",
"ZC", "-07:00")
// Replace all pairs.
parmDateFormat = r.Replace(parmDateFormat)
// do we have user defined header labels?
parmHeaderLabels = []string{}
if headerString != "" {
parmHeaderLabels = strings.Split(headerString, ",")
for i := range parmHeaderLabels {
parmHeaderLabels[i] = strings.TrimSpace(parmHeaderLabels[i])
}
}
if parmListEncoders {
listEncoders()
os.Exit(SHOW_USAGE)
}
if parmOutFile != "" && parmOutDir != "" {
fmt.Println("Cannot use -outfile and -outdir together (-outdir to be used with -filemask), exiting.")
os.Exit(INVALID_ARGUMENTS)
}
if parmFileMask == "" {
if _, err := os.Stat(parmInFile); os.IsNotExist(err) {
fmt.Println("Input file does not exist, exiting.")
os.Exit(INPUTFILE_NOT_FOUND)
}
}
}
// loadInputFile reads the complete input file into a matrix of strings.
// currently there is not need for gigabyte files, but maybe this should be done streaming.
// in addition, we need row and column counts first to set the default ranges later on in the program flow.
func loadInputFile(filename string) (rows [][]string, err error) {
var rdr io.Reader
// check if file exists
if _, err := os.Stat(filename); os.IsNotExist(err) {
return nil, errors.New(fmt.Sprintf("Input file %s does not exist", filename))
}
// open input file
f, err := os.Open(filename)
if err != nil {
return nil, errors.New(fmt.Sprintf("Error opening input file %s", filename))
}
// check encoding for input file
encname := strings.ToUpper(parmEncoding)
if encname == "UTF8" || encname == "UTF-8" {
rdr = bufio.NewReader(f)
} else {
if enc, ok := encoders[encname]; ok {
rdr = enc.NewDecoder().Reader(f)
} else {
fmt.Println(fmt.Sprintf("Specified encoding \"%s\" not found, defaulting to UTF-8\n", parmEncoding))
rdr = bufio.NewReader(f)
}
}
// read file data
r := csv.NewReader(rdr)
r.Comma = parmColSep
r.FieldsPerRecord = -1
r.LazyQuotes = true
rows, err = r.ReadAll()
if err != nil {
msg := fmt.Sprintf("Error reading CSV file %s", filename)
closeErr := f.Close()
if closeErr != nil {
msg = fmt.Sprintf("Error closing file %s", filename) + msg
}
return nil, errors.New(msg)
}
closeErr := f.Close()
if closeErr != nil {
msg := fmt.Sprintf("Error closing file %s", filename)
return nil, errors.New(msg)
}
return rows, nil
}
// setRangeInformation uses the input file's row and column count to set the default ranges
// for lines and columns. of course we could leave this out by improving the parser function
// at parseRangeString to allow something like line 34- (instead of 34-999). It's on the list ...
func setRangeInformation(rowCount, colCount int) {
start := parmStartRow - 1
// now we can set the default ranges for lines and columns
if parmRows == "" {
parmRows = fmt.Sprintf("%d-%d", start, rowCount)
}
if parmCols == "" {
parmCols = fmt.Sprintf("0-%d", colCount)
}
// will bail out on parse error, see declaration
rowRangeParsed = parseRangeString(parmRows)
colRangeParsed = parseRangeString(parmCols)
}
// writeCellContents is basically a boring comparison which data type should be written
// to the spreadsheet cell. if the function encounters invalid values for the data type,
// it outputs an error message and ignores the value
func writeCellContents(cell *xlsx.Cell, colString, colType string, rownum, colnum int) bool {
success := true
theStyle := leftAligned // let's assume left-aligned
// check for content to process and
// process the "Ignore Warnings On Empty Cells" flag
if colString == "" {
if !parmIgnoreEmpty {
fmt.Println(fmt.Sprintf("Warning: Cell (%d, %d) is empty.", rownum, colnum))
}
return true
}
// only convert to formula if the user specified --autoformula,
// otherwise use the defined type from column range -- for lazy people :-)
if parmAutoFormula && strings.HasPrefix(colString, "=") {
colType = "formula"
}
// special treatment of "format" column type
fmtstring := ""
if strings.HasPrefix(colType, "format") {
parts := strings.Split(colType, "=")
if len(parts) > 1 {
colType = parts[0]
fmtstring = parts[1]
}
}
// type dependent write
//fmt.Println("==>", colType)
switch colType {
case "format":
floatVal, err := ParseFloat(colString)
if err != nil {
fmt.Println(fmt.Sprintf("Cell (%d,%d) is not a valid number, value: %s", rownum, colnum, colString))
success = false
} else {
theStyle = rightAligned
cell.SetFloatWithFormat(floatVal, fmtstring)
// cell.SetFloatWithFormat(floatVal, fmtstring)
}
case "text":
cell.SetString(colString)
case "number", "currency":
floatVal, err := ParseFloat(colString)
if err != nil {
if parmNaNValue != "" {
cell.SetString(parmNaNValue)
success = true
} else {
fmt.Println(fmt.Sprintf("Cell (%d,%d) is not a valid number, value: %s", rownum, colnum, colString))
success = false
}
} else {
theStyle = rightAligned
if colType == "currency" {
cell.SetFloatWithFormat(floatVal, "#,##0.00;[red](#,##0.00)")
} else {
// if it's an int, don't use precision
if floatVal == math.Trunc(floatVal) {
cell.SetFloatWithFormat(floatVal, "#")
} else {
cell.SetFloatWithFormat(floatVal, "0.###")
}
}
}
case "integer":
intVal, err := strconv.ParseInt(colString, 10, 64)
if err != nil {
fmt.Println(fmt.Sprintf("Cell (%d,%d) is not a valid integer, value: %s", rownum, colnum, colString))
success = false
} else {
theStyle = rightAligned
cell.SetInt64(intVal)
cell.NumFmt = "#0"
}
case "date":
dt, err := time.Parse(parmDateFormat, colString)
if err != nil {
fmt.Println(fmt.Sprintf("Cell (%d,%d) is not a valid date, value: %s", rownum, colnum, colString))
success = false
} else {
cell.SetDateTime(dt)
if parmExcelDateFormat != "" {
cell.NumFmt = parmExcelDateFormat
}
}
case "formula":
// colstring =<formula>
cell.SetFormula(colString[1:])
case "percent":
// thanks to Felipe Augusto da Silva for the improvement to use "percent"
floatVal, err := ParseFloat(colString)
if err != nil {
fmt.Println(fmt.Sprintf("Cell (%d,%d) is not a valid number, value: %s", rownum, colnum, colString))
success = false
} else {
theStyle = rightAligned
cell.SetFloatWithFormat(floatVal, "0.00%")
}
default:
cell.SetValue(colString)
_, err := ParseFloat(colString)
if err == nil {
theStyle = rightAligned
}
}
cell.SetStyle(theStyle)
return success
}
// processDataColumns processes a row from the csv input file and writes a cell for each column
// that should be processed (is in column range, which means it is a key in the colRangeParsed map.
// if the abortOnError option is set, the function exits the program on the first data type error.
func processDataColumns(excelRow *xlsx.Row, rownum int, csvLine []string) {
if parmDebug {
fmt.Println(fmt.Sprintf("Processing csvLine %d (%d cols)", rownum, len(csvLine)))
}
for colnum := 0; colnum < len(csvLine); colnum++ {
colType, processColumn := colRangeParsed[colnum]
if processColumn {
cell := excelRow.AddCell()
isHeader := (parmHeaderLines > 0) || !parmNoHeader
if isHeader && (rownum < parmHeaderLines) {
// special case for the title row
if len(parmHeaderLabels) > 0 && len(parmHeaderLabels) > colnum {
cell.SetString(parmHeaderLabels[colnum])
} else {
cell.SetString(csvLine[colnum])
}
cell.SetStyle(leftAligned)
} else {
// if the user wanted drama (--abortonerror), exit on first error
ok := writeCellContents(cell, csvLine[colnum], colType, rownum, colnum)
if !ok && parmAbortOnError {
os.Exit(WRITE_ERROR)
}
}
}
}
}
// getInputFiles retrieves a list of input files for a given filespec
// returns a slice of strings or aborts the program on error
func getInputFiles(inFileSpec string) []string {
files, err := filepath.Glob(inFileSpec)
if err != nil {
fmt.Println(err)
os.Exit(INPUTFILE_NOT_FOUND)
}
return files
}
// buildOutputName generates the .xlsx file name for a given input file
// if the user specified the -outdir option, use this directory as target path
// return a string with the target file name
func buildOutputName(infile string) string {
outfile := strings.TrimSuffix(infile, filepath.Ext(infile)) + ".xlsx"
if parmOutFile != "" {
outfile = parmOutFile
}
if parmOutDir != "" {
if _, err := os.Stat(parmOutDir); err == nil {
outfile = filepath.Join(parmOutDir, filepath.Base(outfile))
} else {
fmt.Println(fmt.Sprintf("Output directory %q does not exist, exiting.", parmOutDir))
os.Exit(OUTPUTDIR_NOT_FOUND)
}
}
return outfile
}
// openOrCreateFile checks if the specified filename exists and
// tries to read the file subsequently. If the file does not exist,
// a new file instance is created
func openOrCreateFile(filename string) (*xlsx.File, error) {
var err error
var f *xlsx.File
_, err = os.Stat(filename)
// we want a new file if there no existing file OR
// if there is an existing file, but we want to overwrite it,
// this was nasty bug
if os.IsNotExist(err) || parmOverwrite {
f = xlsx.NewFile()
err = nil
} else {
f, err = xlsx.OpenFile(filename)
}
return f, err
}
// getWorkSheet retrieves the specified sheet from the workbook
// if the sheet does not exist, it is appended to the file.
// Returns a pointer to the sheet or nil in case of error
func getWorkSheet(sheetName string, workBook *xlsx.File, appendSheet bool) *xlsx.Sheet {
var sh *xlsx.Sheet
var ok bool
var err error
sh, ok = workBook.Sheet[sheetName]
if !ok {
// sheet not found, try to append a sheet with that name
sh, err = workBook.AddSheet(sheetName)
if err != nil {
// could not add the sheet
fmt.Println(err)
sh = nil
}
}
return sh
}
// convertFile does the conversion from CSV to Excel .xslx
func convertFile(infile, outfile string) bool {
if _, err := os.Stat(outfile); err == nil {
if !parmOverwrite && !parmAppendToSheet {
fmt.Println(fmt.Sprintf("Output file %s exists, skipping (use --overwrite?)", outfile))
return false
}
}
rows, err := loadInputFile(infile)
if err != nil {
fmt.Println(err)
return false
}
setRangeInformation(len(rows), len(rows[0]))
// excel stuff, create file, add worksheet, define a right-aligned style
xlsx.SetDefaultFont(parmFontSize, parmFontName)
rightAligned = &xlsx.Style{}
rightAligned.Alignment = xlsx.Alignment{Horizontal: "right"}
rightAligned.Font.Name = parmFontName
rightAligned.Font.Size = parmFontSize
leftAligned = &xlsx.Style{}
leftAligned.Alignment = xlsx.Alignment{Horizontal: "left"}
leftAligned.Font.Name = parmFontName
leftAligned.Font.Size = parmFontSize
workBook, err = openOrCreateFile(outfile)
if err != nil {
fmt.Println(err)
return false
}
workSheet = getWorkSheet(parmSheet, workBook, parmAppendToSheet)
if workSheet == nil {
fmt.Printf("Could not add worksheet [%s], exiting.\n", parmSheet)
os.Exit(INVALID_SHEET_NAME)
}
// loop thru line and column ranges and process data cells
for rownum := 0; rownum < len(rows); rownum++ {
_, processLine := rowRangeParsed[rownum]
if processLine {
line := rows[rownum]
excelRow := workSheet.AddRow()
processDataColumns(excelRow, rownum, line)
}
}
err = workBook.Save(outfile)
if err != nil {
fmt.Println(err)
os.Exit(EXCEL_SAVE_ERROR)
}
return true
}
func main() {
var fileList []string
parseCommandLine()
// either glob the file mask or retrieve the single input file
// this way we can just iterate over the slice (and maybe later
// add an option for a specified list of files)
if parmFileMask != "" {
if !parmSilent {
fmt.Println("Found filemask parameter, running in bulk mode")
}
fileList = getInputFiles(parmFileMask)
} else {
fileList = getInputFiles(parmInFile)
}
// iterate over list of files to process and convert them
for _, infile := range fileList {
outfile := buildOutputName(infile)
if !parmSilent {
fmt.Println("Converting", infile, "=>", outfile)
}
ok := convertFile(infile, outfile)
if !ok {
fmt.Println(fmt.Sprintf("Could not convert input file %s", infile))
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment