Skip to content

Instantly share code, notes, and snippets.

@yukithm
Created June 29, 2023 06:27
Show Gist options
  • Save yukithm/b79961622bc05af1dcc961ff33ba8a85 to your computer and use it in GitHub Desktop.
Save yukithm/b79961622bc05af1dcc961ff33ba8a85 to your computer and use it in GitHub Desktop.
BOM-aware Unicode Reader
package bomreader
import (
"bufio"
"bytes"
"io"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// Encoding describes a text encoding identified by its byte order mark.
type Encoding interface {
// String returns the name of the encoding.
String() string
// BOM returns the byte order mark bytes associated with the encoding.
BOM() []byte
}
// encoding is the package's implementation of Encoding: a name paired with
// the byte order mark that identifies it.
type encoding struct {
	name string // name returned by String
	bom  []byte // byte order mark; may be empty
}

// String returns the name of the encoding.
func (e encoding) String() string {
	return e.name
}

// BOM returns a copy of the byte order mark for the encoding.
//
// A copy is returned rather than the internal slice so that callers cannot
// modify the bytes held by this value through the result. A nil mark is
// returned as nil and an empty mark as an empty, non-nil slice.
func (e encoding) BOM() []byte {
	if e.bom == nil {
		return nil
	}
	return append([]byte{}, e.bom...)
}
// Encodings that NewBomReader can report, each paired with the byte order
// mark that identifies it at the start of a stream.
var (
	// UTF8 is identified by the three-byte mark EF BB BF.
	UTF8 = encoding{
		name: "UTF-8",
		bom:  []byte("\xef\xbb\xbf"),
	}

	// UTF16BE is identified by the two-byte mark FE FF.
	UTF16BE = encoding{
		name: "UTF-16BE",
		bom:  []byte("\xfe\xff"),
	}

	// UTF16LE is identified by the two-byte mark FF FE.
	UTF16LE = encoding{
		name: "UTF-16LE",
		bom:  []byte("\xff\xfe"),
	}

	// NoBOM is reported when none of the marks above is present; its mark
	// is an empty, non-nil slice.
	NoBOM = encoding{
		name: "No BOM",
		bom:  []byte{},
	}
)
// NewBomReader inspects the first bytes of r for a byte order mark and
// returns a reader for the stream together with the detected Encoding.
//
// When a UTF-8 or UTF-16 (big- or little-endian) mark is found, the returned
// reader wraps the stream in the matching golang.org/x/text decoder. When no
// mark is found, the buffered stream is returned as is and the Encoding is
// NoBOM. Inputs shorter than three bytes, including an empty stream, are not
// an error: they are classified using whatever bytes are available.
//
// A non-nil error is returned only when reading the leading bytes fails for
// a reason other than end of input; the reader and Encoding are nil then.
func NewBomReader(r io.Reader) (io.Reader, Encoding, error) {
	br := bufio.NewReader(r)

	// Peek hands back the bytes it managed to buffer along with the error
	// that cut the read short, so a short stream yields io.EOF plus a short
	// slice. io.Reader implementations return io.EOF unwrapped, hence the
	// direct comparison.
	mark, err := br.Peek(3)
	if err != nil && err != io.EOF {
		return nil, nil, err
	}

	// bytes.HasPrefix is false when mark is shorter than the candidate mark,
	// so a truncated peek simply falls through to the cases it can satisfy.
	switch {
	case bytes.HasPrefix(mark, UTF8.bom):
		return transform.NewReader(br, unicode.UTF8BOM.NewDecoder()), UTF8, nil
	case bytes.HasPrefix(mark, UTF16BE.bom):
		dec := unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewDecoder()
		return transform.NewReader(br, dec), UTF16BE, nil
	case bytes.HasPrefix(mark, UTF16LE.bom):
		dec := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder()
		return transform.NewReader(br, dec), UTF16LE, nil
	}
	return br, NoBOM, nil
}
@yukithm
Copy link
Author

yukithm commented Jun 29, 2023

A better solution:

// fallback := transform.Nop
fallback := unicode.UTF8.NewDecoder()
r := transform.NewReader(input, unicode.BOMOverride(fallback))

See: https://pkg.go.dev/golang.org/x/text/encoding/unicode#BOMOverride

A complete example:

package main

import (
	"io"
	"os"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// main copies its input to standard output through a BOM-aware decoder.
// The input is the file named by the first command-line argument when one is
// given, and standard input otherwise. Any failure to open or copy panics.
func main() {
	src := os.Stdin
	if len(os.Args) >= 2 {
		file, openErr := os.Open(os.Args[1])
		if openErr != nil {
			panic(openErr)
		}
		defer file.Close()
		src = file
	}

	// The UTF-8 decoder is the fallback handed to BOMOverride;
	// transform.Nop is the alternative fallback:
	//   decoder := unicode.BOMOverride(transform.Nop)
	decoder := unicode.BOMOverride(unicode.UTF8.NewDecoder())

	if _, copyErr := io.Copy(os.Stdout, transform.NewReader(src, decoder)); copyErr != nil {
		panic(copyErr)
	}
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment