Skip to content

Instantly share code, notes, and snippets.

@jvymazal
Created January 17, 2021 10:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jvymazal/c2a3b04c1b2f3adb2d60293c1b0d83c1 to your computer and use it in GitHub Desktop.
Save jvymazal/c2a3b04c1b2f3adb2d60293c1b0d83c1 to your computer and use it in GitHub Desktop.
Sample app that parses a file using the GroupDocs.Parser API
using System;
using GroupDocs.Parser;
using GroupDocs.Parser.Options;
using System.Xml;
using System.Linq;
/// <summary>
/// Console sample that prints everything GroupDocs.Parser reports it can
/// extract from a single document.
/// </summary>
public class parserPDF
{
    /// <summary>
    /// Opens <paramref name="target"/> with GroupDocs.Parser, prints basic
    /// document info, and then, for every feature the parser reports as
    /// supported, prints (or, for images, saves next to the source file) the
    /// extracted content.
    /// </summary>
    /// <param name="target">Path of the document to parse. Extracted images
    /// are written to "<c>target</c>_extracted-N.ext".</param>
    public static void Parse(string target)
    {
        // Dispose the parser deterministically instead of leaving the
        // document to be released whenever the finalizer happens to run.
        using (Parser parser = new Parser(target))
        {
            Features features = parser.Features;
            IDocumentInfo info = parser.GetDocumentInfo();

            Console.WriteLine("FileType: {0}", info.FileType);
            Console.WriteLine("PageCount: {0}", info.PageCount);
            Console.WriteLine("Size: {0}", info.Size);
            Console.WriteLine(" ----");

            if (features.Text)
            {
                PrintText(parser);
            }

            if (features.Container)
            {
                PrintContainer(parser);
            }

            if (features.Toc)
            {
                PrintToc(parser);
            }

            if (features.Images)
            {
                SaveImages(parser, target);
            }

            if (features.Metadata)
            {
                PrintMetadata(parser);
            }

            if (features.Structure)
            {
                PrintStructure(parser);
            }

            if (features.TextAreas)
            {
                PrintTextAreas(parser);
            }

            // These features are only reported, not exercised.
            if (features.TextPage)
            {
                Console.WriteLine("TextPage");
            }

            if (features.Search)
            {
                Console.WriteLine("Search");
            }

            if (features.Tables)
            {
                Console.WriteLine("Tables");
            }
        }
    }

    // Prints the full plain text of the document.
    private static void PrintText(Parser parser)
    {
        Console.WriteLine("Text:");
        using (var reader = parser.GetText())
        {
            // Guard against a null reader so a parser that cannot deliver
            // text does not abort the rest of the report.
            if (reader != null)
            {
                Console.WriteLine(reader.ReadToEnd());
            }
        }
    }

    // Prints name, size and string form of every container item (e.g. attachments).
    private static void PrintContainer(Parser parser)
    {
        // Materialize once: the original enumerated the extraction twice
        // (once for Count(), once for the loop).
        var items = parser.GetContainer().ToArray();
        Console.WriteLine("Container (" + items.Length + "):");
        foreach (GroupDocs.Parser.Data.ContainerItem item in items)
        {
            Console.WriteLine(" Name: " + item.Name);
            Console.WriteLine(" Size: " + item.Size);
            Console.WriteLine(" ?: " + item.ToString());
        }
    }

    // Prints the text each table-of-contents entry refers to.
    private static void PrintToc(Parser parser)
    {
        var entries = parser.GetToc().ToArray();
        Console.WriteLine("Toc (" + entries.Length + "):");
        foreach (GroupDocs.Parser.Data.TocItem tocItem in entries)
        {
            using (var reader = tocItem.ExtractText())
            {
                if (reader != null)
                {
                    Console.WriteLine(reader.ReadToEnd());
                }
            }
        }
    }

    // Saves every extracted image next to the source file as "<target>_extracted-N.<ext>".
    private static void SaveImages(Parser parser, string target)
    {
        var images = parser.GetImages().ToArray();
        Console.WriteLine("Images (" + images.Length + "):");
        int index = 0;
        foreach (GroupDocs.Parser.Data.PageImageArea image in images)
        {
            index++;
            // The extension may already carry its leading period; strip it
            // so the file name never ends up with "..ext".
            string extension = image.FileType.Extension ?? string.Empty;
            image.Save(target + "_extracted-" + index + "." + extension.TrimStart('.'));
        }
    }

    // Prints every metadata name/value pair.
    private static void PrintMetadata(Parser parser)
    {
        var entries = parser.GetMetadata().ToArray();
        Console.WriteLine("Metadata (" + entries.Length + "):");
        foreach (GroupDocs.Parser.Data.MetadataItem metaItem in entries)
        {
            Console.WriteLine("Name: " + metaItem.Name + " || Value: " + metaItem.Value);
        }
    }

    // Walks the document-structure XML and prints one line per node.
    private static void PrintStructure(Parser parser)
    {
        Console.WriteLine("Structure:");
        using (XmlReader reader = parser.GetStructure())
        {
            if (reader == null)
            {
                return;
            }

            while (reader.Read())
            {
                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        Console.WriteLine("Start Element {0}", reader.Name);
                        break;
                    case XmlNodeType.Text:
                        Console.WriteLine("Text Node: {0}", reader.Value);
                        break;
                    case XmlNodeType.EndElement:
                        Console.WriteLine("End Element {0}", reader.Name);
                        break;
                    default:
                        Console.WriteLine("Other node {0} with value {1}",
                            reader.NodeType, reader.Value);
                        break;
                }
            }
        }
    }

    // Prints the text of every positioned text area.
    private static void PrintTextAreas(Parser parser)
    {
        var areas = parser.GetTextAreas().ToArray();
        Console.WriteLine("TextAreas (" + areas.Length + "):");
        foreach (GroupDocs.Parser.Data.PageTextArea area in areas)
        {
            Console.WriteLine(area.Text);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment