Last active
April 26, 2021 06:11
-
-
Save groupdocs-cloud-gists/7c4c7349796d5131b0d6d1585a7bcb8a to your computer and use it in GitHub Desktop.
Parse Documents by Template and Extract Data using a REST API in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Parse Documents by Template and Extract Data
1. Programmatically upload a PDF file on the cloud
2. Parse Documents by Template Object using a REST API in Python
3. Parse Documents by Template File using a REST API in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SDK configuration shared by the API calls that take `configuration`.
# The `groupdocs_parser_cloud` package must already be imported.
#
# NOTE(review): the client ID and secret are hard-coded below. Prefer loading
# them from the environment or a secrets store rather than committing them.
client_id = "da0c487d-c1c0-45ae-b7bf-43eaf53c5ad5"
client_secret = "479db2b01dcb93a3d4d20efb16dea971"

# Build the configuration object and set the API base URL.
configuration = groupdocs_parser_cloud.Configuration(client_id, client_secret)
configuration.api_base_url = "https://api.groupdocs.cloud"

# Storage name handed to the upload request; intentionally left empty here.
my_storage = ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse a document using a template referenced by path and print the
# extracted fields and table cells.
# Requires `groupdocs_parser_cloud` to be imported and `configuration` to be
# defined before this snippet runs.

# API initialization
parseApi = groupdocs_parser_cloud.ParseApi.from_config(configuration)

# Define parse options: the document to parse and the template file to apply
options = groupdocs_parser_cloud.ParseOptions()
options.file_info = groupdocs_parser_cloud.FileInfo()
options.file_info.file_path = "companies.pdf"
options.template_path = "template.json"

# Parse request
request = groupdocs_parser_cloud.ParseRequest(options)
result = parseApi.parse(request)

# Show results: each item may carry a text area, a table area, or both,
# so the two checks are independent rather than if/else.
for data in result.fields_data:
    if data.page_area.page_text_area is not None:
        print(f"Field name: {data.name}. Text :{data.page_area.page_text_area.text}")
    if data.page_area.page_table_area is not None:
        print(f"Table name: {data.name}")
        for cell in data.page_area.page_table_area.page_table_area_cells:
            print(
                f"Table cell. Row {cell.row_index} column {cell.column_index}. "
                f"Text: {cell.page_area.page_text_area.text}"
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse a document using a template object built in code and print the
# extracted fields and table cells.
# Requires `groupdocs_parser_cloud` to be imported, and `configuration` and
# `GetTemplate` to be defined before this snippet runs.

# API initialization
parseApi = groupdocs_parser_cloud.ParseApi.from_config(configuration)

# Define parse options: the document to parse
options = groupdocs_parser_cloud.ParseOptions()
options.file_info = groupdocs_parser_cloud.FileInfo()
options.file_info.file_path = "companies.pdf"

# Attach the template object instead of a template file path
options.template = GetTemplate()

# Parse request
request = groupdocs_parser_cloud.ParseRequest(options)
result = parseApi.parse(request)

# Show results: each item may carry a text area, a table area, or both,
# so the two checks are independent rather than if/else.
for data in result.fields_data:
    if data.page_area.page_text_area is not None:
        print(f"Field name: {data.name}. Text :{data.page_area.page_text_area.text}")
    if data.page_area.page_table_area is not None:
        print(f"Table name: {data.name}")
        for cell in data.page_area.page_table_area.page_table_area_cells:
            print(
                f"Table cell. Row {cell.row_index} column {cell.column_index}. "
                f"Text: {cell.page_area.page_text_area.text}"
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "Fields": [
    {
      "FieldName": "Address",
      "FieldPosition": {
        "FieldPositionType": "Regex",
        "Regex": "Companyaddress:"
      }
    },
    {
      "FieldName": "CompanyAddress",
      "FieldPosition": {
        "FieldPositionType": "Linked",
        "LinkedFieldName": "ADDRESS",
        "IsRightLinked": true,
        "SearchArea": {
          "Height": 10.0,
          "Width": 100.0
        },
        "AutoScale": true
      }
    },
    {
      "FieldName": "Company",
      "FieldPosition": {
        "FieldPositionType": "Regex",
        "Regex": "Companyname:"
      }
    },
    {
      "FieldName": "CompanyName",
      "FieldPosition": {
        "FieldPositionType": "Linked",
        "LinkedFieldName": "Company",
        "IsRightLinked": true,
        "SearchArea": {
          "Height": 10.0,
          "Width": 100.0
        },
        "AutoScale": true
      }
    }
  ],
  "Tables": [
    {
      "TableName": "Companies",
      "DetectorParameters": {
        "Rectangle": {
          "Position": {
            "X": 77.0,
            "Y": 279.0
          },
          "Size": {
            "Height": 41.0,
            "Width": 480.0
          }
        }
      }
    }
  ]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _regex_field(name, pattern):
    """Return a Field called *name* whose position is found by regex *pattern*."""
    field = groupdocs_parser_cloud.Field()
    field.field_name = name
    position = groupdocs_parser_cloud.FieldPosition()
    position.field_position_type = "Regex"
    position.regex = pattern
    field.field_position = position
    return field


def _right_linked_field(name, linked_field_name, width, height):
    """Return a Field called *name* that is right-linked to *linked_field_name*.

    The search area is set to *width* x *height* and auto-scaling is enabled.
    """
    field = groupdocs_parser_cloud.Field()
    field.field_name = name
    position = groupdocs_parser_cloud.FieldPosition()
    position.field_position_type = "Linked"
    position.linked_field_name = linked_field_name
    position.is_right_linked = True
    search_area = groupdocs_parser_cloud.Size()
    search_area.width = width
    search_area.height = height
    position.search_area = search_area
    position.auto_scale = True
    field.field_position = position
    return field


def GetTemplate():
    """Build and return the parsing template used for the companies document.

    The template holds four fields (two regex anchors, each followed by a
    field right-linked to it) and one table located by a fixed rectangle.

    Returns:
        A ``groupdocs_parser_cloud.Template`` with ``fields`` and ``tables`` set.
    """
    fields = [
        _regex_field("Address", "Company address:"),
        # NOTE(review): "ADDRESS" differs in case from the "Address" field
        # above; presumably linked names are matched case-insensitively.
        # Confirm against the SDK before changing either string.
        _right_linked_field("CompanyAddress", "ADDRESS", width=100, height=10),
        _regex_field("Company", "Company name:"),
        _right_linked_field("CompanyName", "Company", width=100, height=10),
    ]

    # Table located by a rectangle at (77, 279) that is 480 wide and 60 high.
    table_size = groupdocs_parser_cloud.Size()
    table_size.height = 60
    table_size.width = 480
    table_position = groupdocs_parser_cloud.Point()
    table_position.x = 77
    table_position.y = 279
    rect = groupdocs_parser_cloud.Rectangle()
    rect.size = table_size
    rect.position = table_position
    detector_params = groupdocs_parser_cloud.DetectorParameters()
    detector_params.rectangle = rect
    table = groupdocs_parser_cloud.Table()
    table.table_name = "Companies"
    table.detector_parameters = detector_params

    template = groupdocs_parser_cloud.Template()
    template.fields = fields
    template.tables = [table]
    return template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Upload the sample PDF through the File API.
# Requires `groupdocs_parser_cloud` to be imported, and `configuration` and
# `my_storage` to be defined before this snippet runs.

# Create an instance of the File API
file_api = groupdocs_parser_cloud.FileApi.from_config(configuration)

# Upload the sample file.
# NOTE(review): the arguments are presumably (destination path, local file
# path, storage name); confirm against the SDK's UploadFileRequest signature.
request = groupdocs_parser_cloud.UploadFileRequest("companies.pdf", "C:\\Files\\companies.pdf", my_storage)
response = file_api.upload_file(request)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment