Skip to content

Instantly share code, notes, and snippets.

@brianmario
Created November 19, 2013 09:40
Show Gist options
  • Save brianmario/7542814 to your computer and use it in GitHub Desktop.
Save brianmario/7542814 to your computer and use it in GitHub Desktop.
A simple binary and text encoding detector wrapping libicu's encoding detection API
package main
/*
#cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
#cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
#include <stdlib.h>
#include <unicode/ucsdet.h>
*/
import "C"
import (
"fmt"
"unsafe"
"os"
"bytes"
"strings"
)
// Detection is the outcome of one charset detection run, as filled in by
// DetectEncoding from the best match reported by the ICU detector.
type Detection struct {
	// Encoding is the charset name the detector reported for the match.
	Encoding string
	// Confidence is the detector's confidence value for the match.
	Confidence int32
	// Language is the language string the detector reported for the match.
	Language string
}
// checkStatus terminates the process when an ICU call reported a failure.
//
// A status greater than U_ZERO_ERROR is treated as a failure (the same test
// ICU's U_FAILURE macro performs); anything else is let through. On failure
// the numeric code is written to stderr and also used as the exit status.
//
// You'll obviously want to handle this some other way than exiting. ICU can
// also hand back an error string for the code, but it is not very helpful;
// ideally there would be a map of ICU error codes to Go errors so callers
// could quickly get an error or nil back.
func checkStatus(status C.UErrorCode) {
	if status <= C.U_ZERO_ERROR {
		return
	}
	// %d rather than %q: the status is an integer, and %q would render it as
	// a quoted character literal instead of the error code.
	fmt.Fprintf(os.Stderr, "Error %d\n", int(status))
	os.Exit(int(status))
}
// Bom describes one Unicode byte order mark.
type Bom struct {
	sig  []byte // signature bytes expected at the very start of the data
	name string // name of the encoding the signature announces
}

// validBoms lists the byte order marks that identify data as text.
//
// A signature that starts with another, shorter signature is listed before
// it (the UTF-32LE mark begins with the UTF-16LE mark), so a first-match scan
// over the table reports the most specific encoding.
var validBoms = []*Bom{
	{sig: []byte("\xEF\xBB\xBF"), name: "UTF-8"},
	{sig: []byte("\x00\x00\xFE\xFF"), name: "UTF-32BE"},
	{sig: []byte("\xFF\xFE\x00\x00"), name: "UTF-32LE"},
	{sig: []byte("\xFE\xFF"), name: "UTF-16BE"},
	{sig: []byte("\xFF\xFE"), name: "UTF-16LE"},
}

// IsBinary is an extremely basic binary detector.
//
// It is a little smarter than just checking for a null byte within the first
// 1KB, but not by much: data that starts with a known byte order mark is
// always treated as text, otherwise a null byte anywhere in the first 1KB
// marks the data as binary. Input shorter than 1KB (including the empty
// string) is inspected in full.
//
// Returns true if data looks binary, false if not.
func IsBinary(data string) bool {
	const sniffLen = 1024

	// Clamp the inspected window so short input does not slice out of range.
	head := data
	if len(head) > sniffLen {
		head = head[:sniffLen]
	}

	// First check for a BOM. This has to happen before the null-byte scan
	// because some of the signatures in validBoms contain null bytes
	// themselves. The window is converted to bytes once, not once per entry.
	prefix := []byte(head)
	for _, bom := range validBoms {
		if bytes.HasPrefix(prefix, bom.sig) {
			return false
		}
	}

	// Without a null byte it is pretty unlikely that the data is binary, but
	// this is just a really dumb guess.
	return strings.ContainsRune(head, 0)
}
// DetectEncoding detects the encoding of a string of text using ICU's
// charset detector.
//
// This could maybe be refactored to run in its own goroutine, but encoding
// detection is really fast, especially on a fixed size of data. Given the
// sizes we're talking about (1k or 32k or whatever) the speed should be
// pretty consistent.
//
// We also don't need to allocate the entire Detection struct if all you
// really care about is the encoding name string.
//
// Returns a newly allocated Detection with its Encoding, Confidence and
// Language fields set. If the detector reports no match at all, the returned
// Detection is left at its zero value. Any failure status from ICU is handed
// to checkStatus.
func DetectEncoding(data string) *Detection {
	// This kinda sucks because it makes a copy of the input for use in C; as
	// a result we must free the copy ourselves once we're done.
	//
	// The free is deferred BEFORE the detector's close on purpose: deferred
	// calls run last-in first-out, so the detector is closed first and the
	// text buffer is released afterwards. ICU requires that the input text is
	// not altered or deleted until the detector is closed or reset to other
	// input, and ownership of the buffer stays with the caller.
	cStr := C.CString(data)
	defer C.free(unsafe.Pointer(cStr))
	cStrLen := C.int32_t(len(data))

	// Create a new detector and schedule its release.
	var status C.UErrorCode = C.U_ZERO_ERROR
	detector := C.ucsdet_open(&status)
	checkStatus(status)
	defer C.ucsdet_close(detector)

	// Set the input byte data whose charset is to be detected.
	status = C.U_ZERO_ERROR
	C.ucsdet_setText(detector, cStr, cStrLen, &status)
	checkStatus(status)

	// Optionally set the declared encoding for charset detection. The
	// declared encoding of an input text is an encoding obtained by the user
	// from an http header or xml declaration or similar source that can be
	// provided as an additional hint to the charset detector.
	//
	// cEncHint := C.CString("UTF-8")
	// defer C.free(unsafe.Pointer(cEncHint))
	// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status)

	// Optionally enable filtering of input text. If filtering is enabled,
	// text within angle brackets ("<" and ">") will be removed before
	// detection, which will remove most HTML or xml markup. The second
	// parameter is a bool 1/0 for true/false.
	//
	// C.ucsdet_enableInputFilter(detector, 1)

	// Perform the actual detection.
	status = C.U_ZERO_ERROR
	match := C.ucsdet_detect(detector, &status)
	checkStatus(status)
	if match == nil {
		// No charset matched the data. Passing a nil match to the getters
		// below would only produce an illegal-argument failure, so report
		// "nothing detected" with an empty result instead.
		return new(Detection)
	}

	// The status is reset before every call so each check reflects only the
	// call it follows.
	status = C.U_ZERO_ERROR
	enc := C.ucsdet_getName(match, &status)
	checkStatus(status)

	status = C.U_ZERO_ERROR
	lang := C.ucsdet_getLanguage(match, &status)
	checkStatus(status)

	status = C.U_ZERO_ERROR
	confidence := C.ucsdet_getConfidence(match, &status)
	checkStatus(status)

	// Copy the C strings into Go strings here, while the detector is still
	// open; the deferred close only runs after this value has been built.
	return &Detection{
		Encoding:   C.GoString(enc),
		Confidence: int32(confidence),
		Language:   C.GoString(lang),
	}
}
// main reads a sample of up to 1KB from stdin and prints either that it
// looks binary or the encoding, confidence and language detected for it.
//
// This basically replicates what charlock_holmes does:
//  1. try and detect if the content is binary
//  2. if not, try and detect its text encoding
//
// The main difference is that the binary detection isn't using libmagic
// here, like in charlock_holmes. That library is atrocious and I want to stop
// using it in charlock_holmes anyway, so the binary detection code used here
// is what I was planning on doing in charlock to replace libmagic. It won't
// match as accurately in some cases but I'm hopeful it will for most.
func main() {
	// this would come from blober or whatever
	input := make([]byte, 1024)
	n, err := os.Stdin.Read(input)
	if n == 0 && err != nil {
		// Nothing was read at all (end of input or a read failure), so there
		// is nothing to analyse.
		fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
		os.Exit(1)
	}

	// Keep only the bytes that were actually read. The rest of the buffer is
	// zero padding, which would show up as null bytes in the sample and make
	// short text input look binary.
	data := string(input[:n])

	binary := IsBinary(data)
	if binary {
		fmt.Printf("binary: %v\n", binary)
		return
	}

	detection := DetectEncoding(data)
	fmt.Printf("encoding: %s\n", detection.Encoding)
	fmt.Printf("confidence: %d\n", detection.Confidence)
	fmt.Printf("language: %s\n", detection.Language)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment