Skip to content

Instantly share code, notes, and snippets.

@alaingilbert
Created January 31, 2024 21:23
Show Gist options
  • Save alaingilbert/d9ee32efb41ffc9789ddd4ef0fb73435 to your computer and use it in GitHub Desktop.
Save alaingilbert/d9ee32efb41ffc9789ddd4ef0fb73435 to your computer and use it in GitHub Desktop.
Quick hack to extract and decode text out of a PDF.
package main
import (
"bytes"
"os"
"os/exec"
"regexp"
"strconv"
)
// main extracts the page content stream of the PDF via getContent, reads the
// extracted file "BS_22_23_Content_page_1.txt", and hands its bytes to
// processPdfRaw for decoding.
func main() {
	// Extraction is treated as best-effort: a content file left over from an
	// earlier run can still be processed, so a failure here only warns.
	if err := getContent(); err != nil {
		os.Stderr.WriteString("pdfcpu extract: " + err.Error() + "\n")
	}

	by, err := os.ReadFile("BS_22_23_Content_page_1.txt")
	if err != nil {
		// Without input there is nothing to decode; stop instead of letting
		// processPdfRaw overwrite its output file with empty content.
		os.Stderr.WriteString("reading content: " + err.Error() + "\n")
		os.Exit(1)
	}
	processPdfRaw(by)
}
// pdfHexStringRe matches a hex string token such as <0024003A> and captures
// the digits between the angle brackets. Only hex digits are accepted so that
// other bracketed tokens are left untouched instead of being decoded into
// garbage.
var pdfHexStringRe = regexp.MustCompile(`<([0-9A-Fa-f]+)>`)

// processPdfRaw turns a raw PDF content stream into a more readable form and
// writes the result to "test.txt" in the current directory.
//
// The input is first collapsed onto a single line, every hex string is
// replaced by its decoded text (see hexToTxt), and line breaks are then
// re-inserted around selected operators. A failure to write the output file
// is reported on standard error; the function itself returns nothing.
func processPdfRaw(in []byte) {
	// Remove all new lines so the operator patterns below can rely on
	// single-space separators.
	in = bytes.ReplaceAll(in, []byte("\n"), []byte(" "))

	// Replace all hex values with their corresponding text, keeping the
	// surrounding angle brackets.
	in = pdfHexStringRe.ReplaceAllFunc(in, func(m []byte) []byte {
		return []byte("<" + hexToTxt(string(m[1:len(m)-1])) + ">")
	})

	// Put each "Q q" pair on its own line.
	in = bytes.ReplaceAll(in, []byte(" Q q "), []byte("\nQ q\n"))
	// Break the line after every ET and before every BT (Begin Text), so a
	// BT ... ET run sits on its own line.
	in = bytes.ReplaceAll(in, []byte(" ET "), []byte(" ET\n"))
	in = bytes.ReplaceAll(in, []byte(" BT"), []byte("\nBT"))
	// Break the line after every f operator.
	in = bytes.ReplaceAll(in, []byte(" f "), []byte(" f\n"))

	// Save to file.
	if err := os.WriteFile("test.txt", in, 0644); err != nil {
		os.Stderr.WriteString("writing test.txt: " + err.Error() + "\n")
	}
}
// hexToTxt decodes the digits of a hex string into text.
//
// h is consumed in groups of four hex digits. Each group is parsed as a
// base-16 integer, shifted by a fixed offset of 29, and appended to the
// result as one rune. A trailing group shorter than four digits is parsed
// as-is instead of slicing past the end of h. A group that is not valid hex
// parses as 0 (the parse error is deliberately ignored), so it contributes
// the rune 29. An empty h yields an empty string.
func hexToTxt(h string) (out string) {
	const (
		groupLen = 4
		// NOTE(review): 29 presumably maps the font's glyph IDs onto code
		// points for this particular document - confirm before reusing.
		offset = 29
	)

	runes := make([]rune, 0, (len(h)+groupLen-1)/groupLen)
	for i := 0; i < len(h); i += groupLen {
		// Clamp the final group so odd-length input cannot panic.
		end := i + groupLen
		if end > len(h) {
			end = len(h)
		}
		// Best-effort decode: on a syntax error ParseInt returns 0.
		n, _ := strconv.ParseInt(h[i:end], 16, 64)
		runes = append(runes, rune(n+offset))
	}
	return string(runes)
}
// getContent invokes the external pdfcpu tool as
// "pdfcpu extract -mode content BS_22_23.pdf ." and waits for it to finish.
// The command's standard output is captured and discarded; only the error
// from running the command, if any, is returned.
func getContent() error {
	args := []string{"extract", "-mode", "content", "BS_22_23.pdf", "."}
	if _, err := exec.Command("pdfcpu", args...).Output(); err != nil {
		return err
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment