brianmario/detector.go

## detector.go
package main

/*
#cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
#cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
#include <stdlib.h>
#include <unicode/ucsdet.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
	"os"
	"bytes"
	"strings"
)

// Detection is the result of one charset detection run.
type Detection struct {
	// Encoding is the name of the detected charset.
	Encoding string
	// Confidence is the confidence value reported for the match.
	Confidence int32
	// Language is the language reported for the match.
	Language string
}

// checkStatus terminates the process when status holds an ICU failure code.
//
// Any value greater than U_ZERO_ERROR is treated as a failure: the numeric
// code is written to standard error and the process exits with that code as
// its exit status. Values at or below U_ZERO_ERROR return normally.
//
// Exiting is a placeholder strategy. Ideally there would be a map of ICU
// error codes to Go errors so callers could get an error (or nil) back
// instead.
func checkStatus(status C.UErrorCode) {
	if status <= C.U_ZERO_ERROR {
		return
	}

	// %d rather than %q: status is an integer code, and %q would render it
	// as a quoted character literal instead of the number. The message goes
	// to stderr so it cannot be mistaken for regular program output.
	fmt.Fprintf(os.Stderr, "Error %d\n", int(status))
	os.Exit(int(status))
}

// Bom pairs a byte-order-mark signature with the name of the encoding it
// announces.
type Bom struct {
	sig  []byte // raw signature bytes looked for at the start of the data
	name string // encoding name associated with the signature
}

// validBoms lists the byte-order marks that are recognised: UTF-8, UTF-16
// (big and little endian) and UTF-32 (big and little endian).
var validBoms = []*Bom{
	{sig: []byte("\xEF\xBB\xBF"), name: "UTF-8"},
	{sig: []byte("\xFE\xFF"), name: "UTF-16BE"},
	{sig: []byte("\xFF\xFE"), name: "UTF-16LE"},
	{sig: []byte("\x00\x00\xFE\xFF"), name: "UTF-32BE"},
	{sig: []byte("\xFF\xFE\x00\x00"), name: "UTF-32LE"},
}

// IsBinary reports whether data looks like binary content.
//
// This is an extremely basic detector, only a little smarter than checking
// for a null byte. Data that starts with one of the byte-order marks in
// validBoms is treated as text, whatever follows. Otherwise a null byte
// anywhere in the first 1kb (or in all of data when it is shorter than that)
// marks the data as binary. Everything else is assumed to be text, which is
// just a rough guess.
//
// Returns true if binary, false if not.
func IsBinary(data string) bool {
	// Only the first 1kb is ever inspected. Clamp the window so inputs
	// shorter than that are scanned in full instead of panicking on an
	// out-of-range slice expression.
	const window = 1024
	head := data
	if len(head) > window {
		head = head[:window]
	}

	// First check and see if it's got a BOM. The conversion is done once,
	// and only on the head, rather than copying all of data per signature.
	prefix := []byte(head)
	for _, bom := range validBoms {
		if bytes.HasPrefix(prefix, bom.sig) {
			return false
		}
	}

	// Next, check and see if there are any null bytes. If none is found
	// it's pretty unlikely that the data is binary.
	return strings.ContainsRune(head, 0)
}

// DetectEncoding detects the encoding of a string of text using ICU's
// charset detector.
//
// It returns a newly allocated Detection with its Encoding, Confidence and
// Language fields set from the best match. Every status reported by ICU is
// passed to checkStatus, so a failure terminates the process instead of
// being returned to the caller.
//
// This could maybe be refactored to run in its own goroutine, but encoding
// detection is really fast, especially on a fixed size of data, and for the
// sizes in question (1k or 32k or so) the speed should be pretty consistent.
// The full Detection struct also is not strictly needed when only the
// encoding name is of interest.
func DetectEncoding(data string) *Detection {
	var status C.UErrorCode = C.U_ZERO_ERROR

	// C.CString makes a copy of the input for use in C, so the copy has to
	// be freed by hand once we're done.
	//
	// The free is deferred BEFORE the detector's close on purpose: deferred
	// calls run last-in, first-out, so the detector is closed first and the
	// text is released afterwards. ICU requires the input to stay untouched
	// until the detector is closed or reset to other text.
	cStr := C.CString(data)
	defer C.free(unsafe.Pointer(cStr))
	cStrLen := C.int32_t(len(data))

	// Create a new detector and schedule its release.
	detector := C.ucsdet_open(&status)
	checkStatus(status)
	defer C.ucsdet_close(detector)

	// Set the input byte data whose charset is to be detected. Ownership of
	// the byte array remains with the caller (see the defer ordering above).
	status = C.U_ZERO_ERROR
	C.ucsdet_setText(detector, cStr, cStrLen, &status)
	checkStatus(status)

	// Optional: set the declared encoding. This is an encoding obtained from
	// an http header, xml declaration or similar source that can be given to
	// the detector as an additional hint.
	// cEncHint := C.CString("UTF-8")
	// defer C.free(unsafe.Pointer(cEncHint))
	// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status)

	// Optional: enable filtering of input text. With filtering enabled, text
	// within angle brackets ("<" and ">") is removed before detection, which
	// removes most HTML or xml markup. The second parameter is a bool 1/0
	// for true/false.
	// C.ucsdet_enableInputFilter(detector, 1)

	// Perform the actual detection.
	// NOTE(review): the ICU documentation says ucsdet_detect may return NULL
	// when nothing matches; the getters below are then presumably expected
	// to report a failure status — confirm.
	status = C.U_ZERO_ERROR
	match := C.ucsdet_detect(detector, &status)
	checkStatus(status)

	status = C.U_ZERO_ERROR
	enc := C.ucsdet_getName(match, &status)
	checkStatus(status)

	status = C.U_ZERO_ERROR
	lang := C.ucsdet_getLanguage(match, &status)
	checkStatus(status)

	// The confidence lookup reports a status like every other call, so it
	// is checked the same way instead of being silently ignored.
	status = C.U_ZERO_ERROR
	confidence := C.ucsdet_getConfidence(match, &status)
	checkStatus(status)

	// C.GoString copies the C strings into Go memory, so the result stays
	// valid after the deferred cleanup has run.
	return &Detection{
		Encoding:   C.GoString(enc),
		Confidence: int32(confidence),
		Language:   C.GoString(lang),
	}
}

// main reads up to 1024 bytes from stdin and basically replicates what
// charlock_holmes does:
//
//  1. try and detect if the content is binary
//  2. if not, try and detect its text encoding
//
// The main difference is that the binary detection isn't using libmagic
// here, like in charlock_holmes. The check used instead is the one planned
// as libmagic's replacement in charlock_holmes. It won't match as accurately
// in some cases but hopefully will for most.
//
// When nothing can be read from stdin the error is printed to stderr and the
// process exits with status 1.
func main() {
	// This would come from blober or whatever.
	input := make([]byte, 1024)
	n, err := os.Stdin.Read(input)
	if n == 0 && err != nil {
		// Nothing was read at all (empty stdin or a real read failure), so
		// there is no content to classify.
		fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
		os.Exit(1)
	}

	// NOTE(review): the whole 1024-byte buffer is handed on even when fewer
	// bytes were read, so short input arrives padded with NUL bytes. Trimming
	// to the bytes actually read requires IsBinary to accept inputs shorter
	// than 1024 bytes — confirm before changing this.
	data := string(input)

	if binary := IsBinary(data); binary {
		fmt.Printf("binary: %v\n", binary)
		return
	}

	detection := DetectEncoding(data)
	fmt.Printf("encoding: %s\n", detection.Encoding)
	fmt.Printf("confidence: %d\n", detection.Confidence)
	fmt.Printf("language: %s\n", detection.Language)
}
	package main

	/*
	#cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
	#cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
	#include <stdlib.h>
	#include <unicode/ucsdet.h>
	*/
	import "C"

	import (
	"fmt"
	"unsafe"
	"os"
	"bytes"
	"strings"
	)

	// Detection is the result of one charset detection run.
	type Detection struct {
		// Encoding is the name of the detected charset.
		Encoding string
		// Confidence is the confidence value reported for the match.
		Confidence int32
		// Language is the language reported for the match.
		Language string
	}

	// checkStatus terminates the process when status holds an ICU failure code.
	//
	// Any value greater than U_ZERO_ERROR is treated as a failure: the numeric
	// code is written to standard error and the process exits with that code as
	// its exit status. Values at or below U_ZERO_ERROR return normally.
	//
	// Exiting is a placeholder strategy. Ideally there would be a map of ICU
	// error codes to Go errors so callers could get an error (or nil) back
	// instead.
	func checkStatus(status C.UErrorCode) {
		if status <= C.U_ZERO_ERROR {
			return
		}

		// %d rather than %q: status is an integer code, and %q would render it
		// as a quoted character literal instead of the number. The message goes
		// to stderr so it cannot be mistaken for regular program output.
		fmt.Fprintf(os.Stderr, "Error %d\n", int(status))
		os.Exit(int(status))
	}

	// Bom pairs a byte-order-mark signature with the name of the encoding it
	// announces.
	type Bom struct {
		sig  []byte // raw signature bytes looked for at the start of the data
		name string // encoding name associated with the signature
	}

	// validBoms lists the byte-order marks that are recognised: UTF-8, UTF-16
	// (big and little endian) and UTF-32 (big and little endian).
	var validBoms = []*Bom{
		{sig: []byte("\xEF\xBB\xBF"), name: "UTF-8"},
		{sig: []byte("\xFE\xFF"), name: "UTF-16BE"},
		{sig: []byte("\xFF\xFE"), name: "UTF-16LE"},
		{sig: []byte("\x00\x00\xFE\xFF"), name: "UTF-32BE"},
		{sig: []byte("\xFF\xFE\x00\x00"), name: "UTF-32LE"},
	}

	// IsBinary reports whether data looks like binary content.
	//
	// This is an extremely basic detector, only a little smarter than checking
	// for a null byte. Data that starts with one of the byte-order marks in
	// validBoms is treated as text, whatever follows. Otherwise a null byte
	// anywhere in the first 1kb (or in all of data when it is shorter than that)
	// marks the data as binary. Everything else is assumed to be text, which is
	// just a rough guess.
	//
	// Returns true if binary, false if not.
	func IsBinary(data string) bool {
		// Only the first 1kb is ever inspected. Clamp the window so inputs
		// shorter than that are scanned in full instead of panicking on an
		// out-of-range slice expression.
		const window = 1024
		head := data
		if len(head) > window {
			head = head[:window]
		}

		// First check and see if it's got a BOM. The conversion is done once,
		// and only on the head, rather than copying all of data per signature.
		prefix := []byte(head)
		for _, bom := range validBoms {
			if bytes.HasPrefix(prefix, bom.sig) {
				return false
			}
		}

		// Next, check and see if there are any null bytes. If none is found
		// it's pretty unlikely that the data is binary.
		return strings.ContainsRune(head, 0)
	}

	// DetectEncoding detects the encoding of a string of text using ICU's
	// charset detector.
	//
	// It returns a newly allocated Detection with its Encoding, Confidence and
	// Language fields set from the best match. Every status reported by ICU is
	// passed to checkStatus, so a failure terminates the process instead of
	// being returned to the caller.
	//
	// This could maybe be refactored to run in its own goroutine, but encoding
	// detection is really fast, especially on a fixed size of data, and for the
	// sizes in question (1k or 32k or so) the speed should be pretty consistent.
	// The full Detection struct also is not strictly needed when only the
	// encoding name is of interest.
	func DetectEncoding(data string) *Detection {
		var status C.UErrorCode = C.U_ZERO_ERROR

		// C.CString makes a copy of the input for use in C, so the copy has to
		// be freed by hand once we're done.
		//
		// The free is deferred BEFORE the detector's close on purpose: deferred
		// calls run last-in, first-out, so the detector is closed first and the
		// text is released afterwards. ICU requires the input to stay untouched
		// until the detector is closed or reset to other text.
		cStr := C.CString(data)
		defer C.free(unsafe.Pointer(cStr))
		cStrLen := C.int32_t(len(data))

		// Create a new detector and schedule its release.
		detector := C.ucsdet_open(&status)
		checkStatus(status)
		defer C.ucsdet_close(detector)

		// Set the input byte data whose charset is to be detected. Ownership of
		// the byte array remains with the caller (see the defer ordering above).
		status = C.U_ZERO_ERROR
		C.ucsdet_setText(detector, cStr, cStrLen, &status)
		checkStatus(status)

		// Optional: set the declared encoding. This is an encoding obtained from
		// an http header, xml declaration or similar source that can be given to
		// the detector as an additional hint.
		// cEncHint := C.CString("UTF-8")
		// defer C.free(unsafe.Pointer(cEncHint))
		// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status)

		// Optional: enable filtering of input text. With filtering enabled, text
		// within angle brackets ("<" and ">") is removed before detection, which
		// removes most HTML or xml markup. The second parameter is a bool 1/0
		// for true/false.
		// C.ucsdet_enableInputFilter(detector, 1)

		// Perform the actual detection.
		// NOTE(review): the ICU documentation says ucsdet_detect may return NULL
		// when nothing matches; the getters below are then presumably expected
		// to report a failure status — confirm.
		status = C.U_ZERO_ERROR
		match := C.ucsdet_detect(detector, &status)
		checkStatus(status)

		status = C.U_ZERO_ERROR
		enc := C.ucsdet_getName(match, &status)
		checkStatus(status)

		status = C.U_ZERO_ERROR
		lang := C.ucsdet_getLanguage(match, &status)
		checkStatus(status)

		// The confidence lookup reports a status like every other call, so it
		// is checked the same way instead of being silently ignored.
		status = C.U_ZERO_ERROR
		confidence := C.ucsdet_getConfidence(match, &status)
		checkStatus(status)

		// C.GoString copies the C strings into Go memory, so the result stays
		// valid after the deferred cleanup has run.
		return &Detection{
			Encoding:   C.GoString(enc),
			Confidence: int32(confidence),
			Language:   C.GoString(lang),
		}
	}

	// main reads up to 1024 bytes from stdin and basically replicates what
	// charlock_holmes does:
	//
	//  1. try and detect if the content is binary
	//  2. if not, try and detect its text encoding
	//
	// The main difference is that the binary detection isn't using libmagic
	// here, like in charlock_holmes. The check used instead is the one planned
	// as libmagic's replacement in charlock_holmes. It won't match as accurately
	// in some cases but hopefully will for most.
	//
	// When nothing can be read from stdin the error is printed to stderr and the
	// process exits with status 1.
	func main() {
		// This would come from blober or whatever.
		input := make([]byte, 1024)
		n, err := os.Stdin.Read(input)
		if n == 0 && err != nil {
			// Nothing was read at all (empty stdin or a real read failure), so
			// there is no content to classify.
			fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
			os.Exit(1)
		}

		// NOTE(review): the whole 1024-byte buffer is handed on even when fewer
		// bytes were read, so short input arrives padded with NUL bytes. Trimming
		// to the bytes actually read requires IsBinary to accept inputs shorter
		// than 1024 bytes — confirm before changing this.
		data := string(input)

		if binary := IsBinary(data); binary {
			fmt.Printf("binary: %v\n", binary)
			return
		}

		detection := DetectEncoding(data)
		fmt.Printf("encoding: %s\n", detection.Encoding)
		fmt.Printf("confidence: %d\n", detection.Confidence)
		fmt.Printf("language: %s\n", detection.Language)
	}