Last active
July 27, 2021 16:02
-
-
Save aspose-com-gists/0fbf54fdf5406ad739b76e02e3a2f00e to your computer and use it in GitHub Desktop.
Extract Data from Tables in PDF using C++ | https://blog.aspose.com/2021/07/14/extract-data-from-tables-in-pdf-files-using-cpp/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Read the complete article about extracting data from tables in PDF files using C++ by visiting the following link:
https://blog.aspose.com/2021/07/14/extract-data-from-tables-in-pdf-files-using-cpp/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the PDF document | |
auto pdfDocument = MakeObject<Document>(u"SourceDirectory\\PDF\\Table_input3.pdf"); | |
// Iterate through the pages of the document | |
for (auto page : pdfDocument->get_Pages()) | |
{ | |
// Create an instance of the TableAbsorber class | |
auto absorber = MakeObject<TableAbsorber>(); | |
absorber->Visit(page); | |
// Iterate through the tables | |
for (auto table : absorber->get_TableList()) | |
{ | |
Console::WriteLine(u"Table"); | |
// Iterate through the rows | |
for (auto row : table->get_RowList()) | |
{ | |
// Iterate throught the cells | |
for (auto cell : row->get_CellList()) | |
{ | |
// Iterate throught the text fragments | |
for (auto fragment : cell->get_TextFragments()) | |
{ | |
String string = u""; | |
// Iterate through the text segments | |
for (auto seg : fragment->get_Segments()) | |
{ | |
// Get the text | |
string = String::Concat(string, seg->get_Text()); | |
} | |
// Print the text | |
Console::WriteLine(string); | |
} | |
} | |
Console::WriteLine(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the PDF document | |
auto pdfDocument = MakeObject<Document>(u"SourceDirectory\\PDF\\Table_input4.pdf"); | |
// Get the first page of the document | |
auto page = pdfDocument->get_Pages()->idx_get(1); | |
// Iterate through the annotations on the page | |
for (auto annotation : page->get_Annotations()) | |
{ | |
// Check the annotation type | |
if (annotation->get_AnnotationType() == Annotations::AnnotationType::Square) | |
{ | |
System::SharedPtr<SquareAnnotation> squareAnnotation = DynamicCast<SquareAnnotation>(annotation); | |
// Create an instance of the TableAbsorber class | |
auto absorber = MakeObject<TableAbsorber>(); | |
absorber->Visit(page); | |
// Iterate through the tables | |
for (auto table : absorber->get_TableList()) | |
{ | |
// Check if the table is in the region | |
if ((squareAnnotation->get_Rect()->get_LLX() < table->get_Rectangle()->get_LLX()) && | |
(squareAnnotation->get_Rect()->get_LLY() < table->get_Rectangle()->get_LLY()) && | |
(squareAnnotation->get_Rect()->get_URX() > table->get_Rectangle()->get_URX()) && | |
(squareAnnotation->get_Rect()->get_URY() > table->get_Rectangle()->get_URY()) | |
) | |
{ | |
// Iterate through the rows | |
for (auto row : table->get_RowList()) | |
{ | |
// Iterate throught the cells | |
for (auto cell : row->get_CellList()) | |
{ | |
// Iterate throught the text fragments | |
for (auto fragment : cell->get_TextFragments()) | |
{ | |
String string = u""; | |
// Iterate through the text segments | |
for (auto seg : fragment->get_Segments()) | |
{ | |
// Get the text | |
string = String::Concat(string, seg->get_Text()); | |
} | |
// Print the text | |
Console::WriteLine(string); | |
} | |
} | |
Console::WriteLine(); | |
} | |
} | |
} | |
break; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment