Skip to content

Instantly share code, notes, and snippets.

@groupdocs-cloud-gists
Last active April 26, 2021 06:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save groupdocs-cloud-gists/7c4c7349796d5131b0d6d1585a7bcb8a to your computer and use it in GitHub Desktop.
Save groupdocs-cloud-gists/7c4c7349796d5131b0d6d1585a7bcb8a to your computer and use it in GitHub Desktop.
Parse Documents by Template and Extract Data using a REST API in Python.
Parse Documents by Template and Extract Data
1. Programmatically upload a PDF file on the cloud
2. Parse Documents by Template Object using a REST API in Python
3. Parse Documents by Template File using a REST API in Python
client_id = "da0c487d-c1c0-45ae-b7bf-43eaf53c5ad5"
client_secret = "479db2b01dcb93a3d4d20efb16dea971"
configuration = groupdocs_parser_cloud.Configuration(client_id, client_secret)
configuration.api_base_url = "https://api.groupdocs.cloud"
my_storage = ""
# api initialization
parseApi = groupdocs_parser_cloud.ParseApi.from_config(configuration)
# define parse options
options = groupdocs_parser_cloud.ParseOptions()
options.file_info = groupdocs_parser_cloud.FileInfo()
options.file_info.file_path = "companies.pdf"
options.template_path = "template.json"
# parse request
request = groupdocs_parser_cloud.ParseRequest(options)
result = parseApi.parse(request)
# show results
for data in result.fields_data:
if data.page_area.page_text_area is not None:
print("Field name: " + data.name + ". Text :" + data.page_area.page_text_area.text)
if data.page_area.page_table_area is not None:
print("Table name: " + data.name)
for cell in data.page_area.page_table_area.page_table_area_cells:
print("Table cell. Row " + str(cell.row_index) + " column " + str(cell.column_index) + ". Text: " + cell.page_area.page_text_area.text);
# api initialization
parseApi = groupdocs_parser_cloud.ParseApi.from_config(configuration)
# define parse options
options = groupdocs_parser_cloud.ParseOptions()
options.file_info = groupdocs_parser_cloud.FileInfo()
options.file_info.file_path = "companies.pdf"
# Get Template Object
options.template = GetTemplate()
# parse request
request = groupdocs_parser_cloud.ParseRequest(options)
result = parseApi.parse(request)
# Show Results
for data in result.fields_data:
if data.page_area.page_text_area is not None:
print("Field name: " + data.name + ". Text :" + data.page_area.page_text_area.text)
if data.page_area.page_table_area is not None:
print("Table name: " + data.name)
for cell in data.page_area.page_table_area.page_table_area_cells:
print("Table cell. Row " + str(cell.row_index) + " column " + str(cell.column_index) + ". Text: " + cell.page_area.page_text_area.text);
{
"Fields": [
{
"FieldName": "Address",
"FieldPosition": {
"FieldPositionType": "Regex",
"Regex": "Companyaddress:"
}
},
{
"FieldName": "CompanyAddress",
"FieldPosition": {
"FieldPositionType": "Linked",
"LinkedFieldName": "ADDRESS",
"IsRightLinked": true,
"SearchArea": {
"Height": 10.0,
"Width": 100.0
},
"AutoScale": true
}
},
{
"FieldName": "Company",
"FieldPosition": {
"FieldPositionType": "Regex",
"Regex": "Companyname:"
}
},
{
"FieldName": "CompanyName",
"FieldPosition": {
"FieldPositionType": "Linked",
"LinkedFieldName": "Company",
"IsRightLinked": true,
"SearchArea": {
"Height": 10.0,
"Width": 100.0
},
"AutoScale": true
}
}
],
"Tables": [
{
"TableName": "Companies",
"DetectorParameters": {
"Rectangle": {
"Position": {
"X": 77.0,
"Y": 279.0
},
"Size": {
"Height": 41.0,
"Width": 480.0
}
}
}
}
]
}
def GetTemplate():
field1 = groupdocs_parser_cloud.Field()
field1.field_name = "Address"
fieldPosition1 = groupdocs_parser_cloud.FieldPosition()
fieldPosition1.field_position_type = "Regex"
fieldPosition1.regex = "Company address:"
field1.field_position = fieldPosition1
field2 = groupdocs_parser_cloud.Field()
field2.field_name = "CompanyAddress"
fieldPosition2 = groupdocs_parser_cloud.FieldPosition()
fieldPosition2.field_position_type = "Linked"
fieldPosition2.linked_field_name = "ADDRESS"
fieldPosition2.is_right_linked = True
size2 = groupdocs_parser_cloud.Size()
size2.width = 100
size2.height = 10
fieldPosition2.search_area = size2
fieldPosition2.auto_scale = True
field2.field_position = fieldPosition2
field3 = groupdocs_parser_cloud.Field()
field3.field_name = "Company"
fieldPosition3 = groupdocs_parser_cloud.FieldPosition()
fieldPosition3.field_position_type = "Regex"
fieldPosition3.regex = "Company name:"
field3.field_position = fieldPosition3
field4 = groupdocs_parser_cloud.Field()
field4.field_name = "CompanyName"
fieldPosition4 = groupdocs_parser_cloud.FieldPosition()
fieldPosition4.field_position_type = "Linked"
fieldPosition4.linked_field_name = "Company"
fieldPosition4.is_right_linked = True
size4 = groupdocs_parser_cloud.Size()
size4.width = 100
size4.height = 10
fieldPosition4.search_area = size4
fieldPosition4.auto_scale = True
field4.field_position = fieldPosition4
table = groupdocs_parser_cloud.Table()
table.table_name = "Companies"
detectorparams = groupdocs_parser_cloud.DetectorParameters()
rect = groupdocs_parser_cloud.Rectangle()
size = groupdocs_parser_cloud.Size()
size.height = 60
size.width = 480
position = groupdocs_parser_cloud.Point()
position.x = 77
position.y = 279
rect.size = size
rect.position = position
detectorparams.rectangle = rect
table.detector_parameters = detectorparams
fields = [field1, field2, field3, field4]
tables = [table]
template = groupdocs_parser_cloud.Template()
template.fields = fields
template.tables = tables
return template
# Create instance of the API
file_api = groupdocs_parser_cloud.FileApi.from_config(configuration)
# upload sample files
request = groupdocs_parser_cloud.UploadFileRequest("companies.pdf", "C:\\Files\\companies.pdf", my_storage)
response = file_api.upload_file(request)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment