-
-
Save giggiu16/a17fa87ed1ca0dbb3a22b670a623c18c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"os" | |
unicommon "github.com/unidoc/unidoc/common" | |
pdfcore "github.com/unidoc/unidoc/pdf/core" | |
"github.com/unidoc/unidoc/pdf/creator" | |
"github.com/unidoc/unidoc/pdf/extractor" | |
pdf "github.com/unidoc/unidoc/pdf/model" | |
pdfmodel "github.com/unidoc/unidoc/pdf/model" | |
) | |
func main() { | |
unicommon.SetLogger(unicommon.NewConsoleLogger(unicommon.LogLevelDebug)) | |
if len(os.Args) < 2 { | |
fmt.Printf("Syntax: go run main.go input.pdf\n") | |
os.Exit(1) | |
} | |
inputPath := os.Args[1] | |
fmt.Printf("Input file: %s\n", inputPath) | |
err := extractText(inputPath) | |
if err != nil { | |
fmt.Printf("Error: %v\n", err) | |
os.Exit(1) | |
} | |
} | |
func extractText(inputPath string) error { | |
f, err := os.Open(inputPath) | |
if err != nil { | |
return err | |
} | |
defer f.Close() | |
pdfReader, err := pdf.NewPdfReader(f) | |
if err != nil { | |
return err | |
} | |
numPages, err := pdfReader.GetNumPages() | |
if err != nil { | |
return err | |
} else if numPages < 1 { | |
return nil | |
} | |
page, err := pdfReader.GetPage(1) | |
if err != nil { | |
return err | |
} | |
e, err := extractor.New(page) | |
if err != nil { | |
return err | |
} | |
creator := creator.New() | |
newpage := pdf.NewPdfPage() | |
newpage.MediaBox = page.MediaBox | |
creator.AddPage(newpage) | |
texts, _, _, err := e.ExtractPageText() | |
if err != nil { | |
return err | |
} | |
comps := texts.TextComponents() | |
count := 0 | |
for _, m := range comps { | |
// Only parse first 1000 characters for brevity | |
if count > 1000 { | |
continue | |
} | |
count++ | |
p := creator.NewParagraph(m.Text) | |
// Add font to page | |
if newpage.HasFontByName(pdfcore.PdfObjectName(m.Font.BaseFont())) == false { | |
fmt.Printf("%s font not found in page, adding it\n", pdfcore.PdfObjectName(m.Font.BaseFont())) | |
newpage.AddFont(pdfcore.PdfObjectName(m.Font.BaseFont()), m.Font.ToPdfObject()) | |
} | |
font, found := newpage.Resources.GetFontByName(pdfcore.PdfObjectName(m.Font.BaseFont())) | |
if found { | |
newFont, err := pdfmodel.NewPdfFontFromPdfObject(font) | |
if err != nil { | |
fmt.Println("error is: ", err) | |
} else { | |
p.SetFont(newFont) | |
fmt.Println("font set:", newFont.BaseFont()) | |
} | |
} | |
yPos := page.MediaBox.Height() - (m.Y + m.Height) | |
p.SetPos(m.X, yPos) | |
// p.SetFontSize(m.Font) | |
// p.SetWidth(m.Width) | |
// p.SetFont(m.Font) | |
// p.SetLineHeight(m.Height) | |
creator.Draw(p) | |
} | |
err = creator.WriteToFile("new.pdf") | |
if err != nil { | |
return err | |
} | |
return nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment