alaingilbert/main.go

## main.go
package main

import (
	"bytes"
	"os"
	"os/exec"
	"regexp"
	"strconv"
)

// main runs the pdfcpu extraction step, reads the extracted content file for
// page 1 and hands its bytes to processPdfRaw.
//
// Extraction stays best-effort: a failure is reported on stderr but does not
// stop the run, so a content file already present on disk can still be
// processed. A failure to read the content file aborts with exit status 1
// instead of silently processing empty input and overwriting the output file.
func main() {
	if err := getContent(); err != nil {
		os.Stderr.WriteString("pdfcpu extract: " + err.Error() + "\n")
	}

	by, err := os.ReadFile("BS_22_23_Content_page_1.txt")
	if err != nil {
		os.Stderr.WriteString("reading content file: " + err.Error() + "\n")
		os.Exit(1)
	}
	processPdfRaw(by)
}

// processPdfRaw reflows the raw bytes in (presumably a PDF content stream
// produced by the pdfcpu extraction step) into a more readable layout and
// writes the result to "test.txt".
//
// Steps, in order:
//  1. every newline is replaced by a space so the input is one long line;
//  2. each <...> hex string made of whole 4-digit groups is decoded with
//     hexToTxt, keeping the angle brackets;
//  3. line breaks are re-inserted around the " Q q ", " ET ", " BT" and " f "
//     operator sequences.
//
// A failure to write the output file is reported on stderr; the function has
// no return value, so the error cannot be propagated to the caller.
func processPdfRaw(in []byte) {
	// Flatten the input onto a single line.
	in = bytes.ReplaceAll(in, []byte{'\n'}, []byte{' '})

	// Decode hex strings. Only runs of complete 4-digit hex groups are
	// matched: the previous `\w+` pattern also matched non-hex words and
	// lengths that are not a multiple of four, which made the 4-character
	// slicing in hexToTxt panic. Anything else between angle brackets is
	// left untouched.
	rgx := regexp.MustCompile(`<(?:[0-9A-Fa-f]{4})+>`)
	in = rgx.ReplaceAllFunc(in, func(b []byte) []byte {
		return []byte("<" + hexToTxt(string(b[1:len(b)-1])) + ">")
	})

	// Put each "Q q" pair on its own line.
	in = bytes.ReplaceAll(in, []byte(" Q q "), []byte("\nQ q\n"))

	// End a line after every ET and start a new line before every BT, so
	// each BT ... ET span sits on its own line.
	in = bytes.ReplaceAll(in, []byte(" ET "), []byte(" ET\n"))
	in = bytes.ReplaceAll(in, []byte(" BT"), []byte("\nBT"))

	// Break the line after every standalone "f" operator.
	in = bytes.ReplaceAll(in, []byte(" f "), []byte(" f\n"))

	// Save to file.
	if err := os.WriteFile("test.txt", in, 0644); err != nil {
		os.Stderr.WriteString("writing test.txt: " + err.Error() + "\n")
	}
}

// hexToTxt decodes h, a string of hexadecimal digits, into text. The input is
// consumed in groups of four hex digits; each group is parsed as an unsigned
// number, 29 is added to it, and the result is emitted as one Unicode code
// point. (The offset of 29 is presumably specific to the font encoding of the
// PDF being processed; confirm before reusing with other documents.)
//
// Malformed input never panics:
//   - a trailing group shorter than four digits is decoded from the digits
//     that are present;
//   - a group that is not valid hexadecimal is copied to the output verbatim.
//
// An empty h yields an empty string.
func hexToTxt(h string) (out string) {
	var buf bytes.Buffer
	for i := 0; i < len(h); i += 4 {
		// Clamp the final group so a length that is not a multiple of
		// four cannot slice past the end of h.
		end := i + 4
		if end > len(h) {
			end = len(h)
		}
		group := h[i:end]

		n, err := strconv.ParseUint(group, 16, 16)
		if err != nil {
			// Not hex: keep the raw characters rather than emitting a
			// bogus control character.
			buf.WriteString(group)
			continue
		}
		buf.WriteRune(rune(n + 29))
	}
	return buf.String()
}

// getContent invokes the external pdfcpu tool as
//
//	pdfcpu extract -mode content BS_22_23.pdf .
//
// and waits for it to finish. The command's standard output is captured and
// discarded; only the error from running the command is returned (nil on
// success). The extracted files are presumably written to the current
// directory by pdfcpu itself.
func getContent() error {
	args := []string{"extract", "-mode", "content", "BS_22_23.pdf", "."}
	if _, runErr := exec.Command("pdfcpu", args...).Output(); runErr != nil {
		return runErr
	}
	return nil
}
	package main

	import (
	"bytes"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	)

	// main runs the pdfcpu extraction step, reads the extracted content file for
	// page 1 and hands its bytes to processPdfRaw.
	//
	// Extraction stays best-effort: a failure is reported on stderr but does not
	// stop the run, so a content file already present on disk can still be
	// processed. A failure to read the content file aborts with exit status 1
	// instead of silently processing empty input and overwriting the output file.
	func main() {
		if err := getContent(); err != nil {
			os.Stderr.WriteString("pdfcpu extract: " + err.Error() + "\n")
		}

		by, err := os.ReadFile("BS_22_23_Content_page_1.txt")
		if err != nil {
			os.Stderr.WriteString("reading content file: " + err.Error() + "\n")
			os.Exit(1)
		}
		processPdfRaw(by)
	}

	// processPdfRaw reflows the raw bytes in (presumably a PDF content stream
	// produced by the pdfcpu extraction step) into a more readable layout and
	// writes the result to "test.txt".
	//
	// Steps, in order:
	//  1. every newline is replaced by a space so the input is one long line;
	//  2. each <...> hex string made of whole 4-digit groups is decoded with
	//     hexToTxt, keeping the angle brackets;
	//  3. line breaks are re-inserted around the " Q q ", " ET ", " BT" and " f "
	//     operator sequences.
	//
	// A failure to write the output file is reported on stderr; the function has
	// no return value, so the error cannot be propagated to the caller.
	func processPdfRaw(in []byte) {
		// Flatten the input onto a single line.
		in = bytes.ReplaceAll(in, []byte{'\n'}, []byte{' '})

		// Decode hex strings. Only runs of complete 4-digit hex groups are
		// matched: the previous `\w+` pattern also matched non-hex words and
		// lengths that are not a multiple of four, which made the 4-character
		// slicing in hexToTxt panic. Anything else between angle brackets is
		// left untouched.
		rgx := regexp.MustCompile(`<(?:[0-9A-Fa-f]{4})+>`)
		in = rgx.ReplaceAllFunc(in, func(b []byte) []byte {
			return []byte("<" + hexToTxt(string(b[1:len(b)-1])) + ">")
		})

		// Put each "Q q" pair on its own line.
		in = bytes.ReplaceAll(in, []byte(" Q q "), []byte("\nQ q\n"))

		// End a line after every ET and start a new line before every BT, so
		// each BT ... ET span sits on its own line.
		in = bytes.ReplaceAll(in, []byte(" ET "), []byte(" ET\n"))
		in = bytes.ReplaceAll(in, []byte(" BT"), []byte("\nBT"))

		// Break the line after every standalone "f" operator.
		in = bytes.ReplaceAll(in, []byte(" f "), []byte(" f\n"))

		// Save to file.
		if err := os.WriteFile("test.txt", in, 0644); err != nil {
			os.Stderr.WriteString("writing test.txt: " + err.Error() + "\n")
		}
	}

	// hexToTxt decodes h, a string of hexadecimal digits, into text. The input is
	// consumed in groups of four hex digits; each group is parsed as an unsigned
	// number, 29 is added to it, and the result is emitted as one Unicode code
	// point. (The offset of 29 is presumably specific to the font encoding of the
	// PDF being processed; confirm before reusing with other documents.)
	//
	// Malformed input never panics:
	//   - a trailing group shorter than four digits is decoded from the digits
	//     that are present;
	//   - a group that is not valid hexadecimal is copied to the output verbatim.
	//
	// An empty h yields an empty string.
	func hexToTxt(h string) (out string) {
		var buf bytes.Buffer
		for i := 0; i < len(h); i += 4 {
			// Clamp the final group so a length that is not a multiple of
			// four cannot slice past the end of h.
			end := i + 4
			if end > len(h) {
				end = len(h)
			}
			group := h[i:end]

			n, err := strconv.ParseUint(group, 16, 16)
			if err != nil {
				// Not hex: keep the raw characters rather than emitting a
				// bogus control character.
				buf.WriteString(group)
				continue
			}
			buf.WriteRune(rune(n + 29))
		}
		return buf.String()
	}

	// getContent invokes the external pdfcpu tool as
	//
	//	pdfcpu extract -mode content BS_22_23.pdf .
	//
	// and waits for it to finish. The command's standard output is captured and
	// discarded; only the error from running the command is returned (nil on
	// success). The extracted files are presumably written to the current
	// directory by pdfcpu itself.
	func getContent() error {
		args := []string{"extract", "-mode", "content", "BS_22_23.pdf", "."}
		if _, runErr := exec.Command("pdfcpu", args...).Output(); runErr != nil {
			return runErr
		}
		return nil
	}