ifraixedes/golang_html_unescape.go

## golang_html_unescape.go
package main

// HTMLUnescape replaces, in place, every occurrence of the byte sequences
// \u003c, \u003e, \u0026, \u2028 and \u2029 in src with the character each one
// encodes: <, >, &, U+2028 and U+2029 (the last two as their UTF-8 bytes). It is
// mostly the reverse of the HTMLEscape function in the encoding/json package of
// the standard library.
//
// Matching is case-sensitive: only the lowercase hex digits shown above are
// recognised. A backslash followed by any byte other than 'u' is kept together
// with that byte, so the escaped backslash in \\u003c does not start an escape
// sequence.
//
// To avoid allocations the function overwrites src and returns a prefix of it;
// the previous contents of src must not be relied on after the call.
func HTMLUnescape(src []byte) []byte {
	// Length in bytes of a complete \uXXXX escape sequence.
	const escLen = 6

	// r is the read position and w the write position. Every replacement is
	// shorter than the sequence it replaces, so w never overtakes r and the
	// result can be compacted into src in a single pass instead of shifting the
	// whole tail on every replacement.
	w := 0
	for r := 0; r < len(src); {
		if src[r] != '\\' {
			src[w] = src[r]
			w++
			r++
			continue
		}

		if r+escLen <= len(src) && src[r+1] == 'u' {
			var repl string
			switch string(src[r+2 : r+escLen]) {
			case "003c":
				repl = "<"
			case "003e":
				repl = ">"
			case "0026":
				repl = "&"
			case "2028":
				repl = "\u2028" // UTF-8: E2 80 A8
			case "2029":
				repl = "\u2029" // UTF-8: E2 80 A9
			}
			if repl != "" {
				w += copy(src[w:], repl)
				r += escLen
				continue
			}
		}

		// Not one of the handled escapes: keep the backslash and the byte right
		// after it (when there is one) as they are. Only those two bytes are
		// consumed, so a backslash appearing anywhere later, even inside a
		// malformed \u sequence, is still examined by the loop.
		end := r + 2
		if end > len(src) {
			end = len(src)
		}
		w += copy(src[w:], src[r:end])
		r = end
	}

	return src[:w]
}

## html_unescape_test.go
package main

import (
	"fmt"
	"testing"
)

// TestHTMLUnescape feeds HTMLUnescape a table of escaped inputs and checks that
// each one comes back with the expected unescaped bytes.
func TestHTMLUnescape(t *testing.T) {
	// UTF-8 encodings of U+2028 (line separator) and U+2029 (paragraph
	// separator), the two multi-byte characters HTMLUnescape can produce.
	lineSep := string([]byte{0xE2, 0x80, 0xA8})
	paraSep := string([]byte{0xE2, 0x80, 0xA9})

	type testCase struct {
		src string
		exp string
	}

	cases := []testCase{
		{src: `1 is \u003c than 5`, exp: "1 is < than 5"},
		{src: `10 is \u003e than 5`, exp: "10 is > than 5"},
		{src: `black \u0026 white`, exp: "black & white"},
		{src: `-- \u2028 --`, exp: fmt.Sprintf("-- %s --", lineSep)},
		{src: `Hey: \u2029`, exp: fmt.Sprintf("Hey: %s", paraSep)},
		{
			src: "no escaped characters keep the slice of bytes as it's",
			exp: "no escaped characters keep the slice of bytes as it's",
		},
		{
			src: `\u003c\u003e\u0026\u2028\u2029`,
			exp: fmt.Sprintf("<>&%s%s", lineSep, paraSep),
		},
	}

	for idx := range cases {
		want := cases[idx].exp
		got := string(HTMLUnescape([]byte(cases[idx].src)))
		if want == got {
			continue
		}
		t.Errorf("unexpected result for test case %d; want=%q, got=%q", idx, want, got)
	}
}
	package main

	// HTMLUnescape replaces, in place, every occurrence of the byte sequences
	// \u003c, \u003e, \u0026, \u2028 and \u2029 in src with the character each one
	// encodes: <, >, &, U+2028 and U+2029 (the last two as their UTF-8 bytes). It is
	// mostly the reverse of the HTMLEscape function in the encoding/json package of
	// the standard library.
	//
	// Matching is case-sensitive: only the lowercase hex digits shown above are
	// recognised. A backslash followed by any byte other than 'u' is kept together
	// with that byte, so the escaped backslash in \\u003c does not start an escape
	// sequence.
	//
	// To avoid allocations the function overwrites src and returns a prefix of it;
	// the previous contents of src must not be relied on after the call.
	func HTMLUnescape(src []byte) []byte {
		// Length in bytes of a complete \uXXXX escape sequence.
		const escLen = 6

		// r is the read position and w the write position. Every replacement is
		// shorter than the sequence it replaces, so w never overtakes r and the
		// result can be compacted into src in a single pass instead of shifting the
		// whole tail on every replacement.
		w := 0
		for r := 0; r < len(src); {
			if src[r] != '\\' {
				src[w] = src[r]
				w++
				r++
				continue
			}

			if r+escLen <= len(src) && src[r+1] == 'u' {
				var repl string
				switch string(src[r+2 : r+escLen]) {
				case "003c":
					repl = "<"
				case "003e":
					repl = ">"
				case "0026":
					repl = "&"
				case "2028":
					repl = "\u2028" // UTF-8: E2 80 A8
				case "2029":
					repl = "\u2029" // UTF-8: E2 80 A9
				}
				if repl != "" {
					w += copy(src[w:], repl)
					r += escLen
					continue
				}
			}

			// Not one of the handled escapes: keep the backslash and the byte right
			// after it (when there is one) as they are. Only those two bytes are
			// consumed, so a backslash appearing anywhere later, even inside a
			// malformed \u sequence, is still examined by the loop.
			end := r + 2
			if end > len(src) {
				end = len(src)
			}
			w += copy(src[w:], src[r:end])
			r = end
		}

		return src[:w]
	}
	package main

	import (
	"fmt"
	"testing"
	)

	// TestHTMLUnescape feeds HTMLUnescape a table of escaped inputs and checks that
	// each one comes back with the expected unescaped bytes.
	func TestHTMLUnescape(t *testing.T) {
		// UTF-8 encodings of U+2028 (line separator) and U+2029 (paragraph
		// separator), the two multi-byte characters HTMLUnescape can produce.
		lineSep := string([]byte{0xE2, 0x80, 0xA8})
		paraSep := string([]byte{0xE2, 0x80, 0xA9})

		type testCase struct {
			src string
			exp string
		}

		cases := []testCase{
			{src: `1 is \u003c than 5`, exp: "1 is < than 5"},
			{src: `10 is \u003e than 5`, exp: "10 is > than 5"},
			{src: `black \u0026 white`, exp: "black & white"},
			{src: `-- \u2028 --`, exp: fmt.Sprintf("-- %s --", lineSep)},
			{src: `Hey: \u2029`, exp: fmt.Sprintf("Hey: %s", paraSep)},
			{
				src: "no escaped characters keep the slice of bytes as it's",
				exp: "no escaped characters keep the slice of bytes as it's",
			},
			{
				src: `\u003c\u003e\u0026\u2028\u2029`,
				exp: fmt.Sprintf("<>&%s%s", lineSep, paraSep),
			},
		}

		for idx := range cases {
			want := cases[idx].exp
			got := string(HTMLUnescape([]byte(cases[idx].src)))
			if want == got {
				continue
			}
			t.Errorf("unexpected result for test case %d; want=%q, got=%q", idx, want, got)
		}
	}