Last active
September 13, 2023 00:05
-
-
Save normj/86e4eceffc14c183c6040a5705e3918b to your computer and use it in GitHub Desktop.
.NET Example for Amazon Textract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using System.Threading.Tasks; | |
using System.Threading; | |
using Amazon; | |
// From the AWSSDK.Textract NuGet package | |
using Amazon.Textract; | |
using Amazon.Textract.Model; | |
// From the AWSSDK.S3 NuGet package | |
using Amazon.S3; | |
using Amazon.S3.Model; | |
namespace AWSTextractSample
{
    /// <summary>
    /// Console sample showing two ways to call Amazon Textract text detection:
    /// a synchronous-style call on an in-memory image, and an asynchronous job
    /// started against a document uploaded to S3.
    /// </summary>
    class Program
    {
        // Delay between successive status polls of the asynchronous detection job.
        private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(1);

        /// <summary>
        /// Entry point. Runs the S3-based job sample followed by the in-memory sample
        /// and prints the message of any exception that escapes either of them.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static async Task Main(string[] args)
        {
            try
            {
                await StartDetectSampleAsync();
                await DetectSampleAsync();
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Reads the local file "example.png", sends its bytes to
        /// DetectDocumentText and prints every returned block.
        /// </summary>
        private static async Task DetectSampleAsync()
        {
            using (var textractClient = new AmazonTextractClient(RegionEndpoint.USEast1))
            {
                var bytes = File.ReadAllBytes("example.png");

                Console.WriteLine("Detect Document Text");

                // The stream only has to live for the duration of the request.
                using (var imageStream = new MemoryStream(bytes))
                {
                    var detectResponse = await textractClient.DetectDocumentTextAsync(new DetectDocumentTextRequest
                    {
                        Document = new Document
                        {
                            Bytes = imageStream
                        }
                    });

                    PrintBlocks(detectResponse.Blocks);
                }
            }
        }

        /// <summary>
        /// Uploads a local PDF to S3, starts a document text detection job for it,
        /// polls until the job leaves the IN_PROGRESS state and, if it succeeded,
        /// prints the blocks from every page of results.
        /// </summary>
        private static async Task StartDetectSampleAsync()
        {
            // Set to a bucket that you actually own
            var s3Bucket = "normj-east1";

            // The file name that will be uploaded to S3 and then sent to Textract
            var localFile = "textract-dg.pdf";

            using (var textractClient = new AmazonTextractClient(RegionEndpoint.USEast1))
            using (var s3Client = new AmazonS3Client(RegionEndpoint.USEast1))
            {
                Console.WriteLine($"Upload {localFile} to {s3Bucket} bucket");
                var putRequest = new PutObjectRequest
                {
                    BucketName = s3Bucket,
                    FilePath = localFile,
                    Key = Path.GetFileName(localFile)
                };
                await s3Client.PutObjectAsync(putRequest);

                Console.WriteLine("Start document detection job");
                var startResponse = await textractClient.StartDocumentTextDetectionAsync(new StartDocumentTextDetectionRequest
                {
                    DocumentLocation = new DocumentLocation
                    {
                        // Fully qualified because Amazon.S3.Model is imported as well.
                        S3Object = new Amazon.Textract.Model.S3Object
                        {
                            Bucket = s3Bucket,
                            Name = putRequest.Key
                        }
                    }
                });
                Console.WriteLine($"Job ID: {startResponse.JobId}");

                var getDetectionRequest = new GetDocumentTextDetectionRequest
                {
                    JobId = startResponse.JobId
                };

                Console.WriteLine("Poll for detect job to complete");

                // Poll till job is no longer in progress. Task.Delay is used instead of
                // Thread.Sleep so the thread is not blocked while waiting.
                GetDocumentTextDetectionResponse getDetectionResponse = null;
                do
                {
                    await Task.Delay(PollInterval);
                    getDetectionResponse = await textractClient.GetDocumentTextDetectionAsync(getDetectionRequest);
                } while (getDetectionResponse.JobStatus == JobStatus.IN_PROGRESS);

                Console.WriteLine("Print out results if the job was successful.");

                // If the job was successful loop through the pages of results and print the detected text
                if (getDetectionResponse.JobStatus == JobStatus.SUCCEEDED)
                {
                    while (true)
                    {
                        // Always print the page we currently hold BEFORE deciding whether to
                        // fetch another one; otherwise the final page (the one without a
                        // NextToken) would be fetched but never printed.
                        PrintBlocks(getDetectionResponse.Blocks);

                        // No token means this was the last page of data.
                        if (string.IsNullOrEmpty(getDetectionResponse.NextToken))
                        {
                            break;
                        }

                        getDetectionRequest.NextToken = getDetectionResponse.NextToken;
                        getDetectionResponse = await textractClient.GetDocumentTextDetectionAsync(getDetectionRequest);
                    }
                }
                else
                {
                    Console.WriteLine($"Job failed with message: {getDetectionResponse.StatusMessage}");
                }
            }
        }

        /// <summary>
        /// Writes the type and text of each block to the console.
        /// </summary>
        /// <param name="blocks">Blocks from a Textract response; a null sequence prints nothing.</param>
        private static void PrintBlocks(System.Collections.Generic.IEnumerable<Amazon.Textract.Model.Block> blocks)
        {
            // Defensive: tolerate a response that carries no block list at all.
            if (blocks == null)
            {
                return;
            }

            foreach (var block in blocks)
            {
                Console.WriteLine($"Type {block.BlockType}, Text: {block.Text}");
            }
        }
    }
}
I don't have a lot of experience with ASHX files, but I believe that to use async/await
in an ASHX file your class needs to implement the IHttpAsyncHandler interface instead of the
IHttpHandler interface.
You have a bug in this example. If there is more than one chunk, it will not read the last chunk of blocks. On line 122, you read in a chunk of blocks. If it's the last chunk, then the NextToken will not be set, so the loop condition fails and you exit without processing that chunk.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have been using Textract since it was launched, but I am a fairly basic programmer (C#). I have never used async processing; my programs loop with delay timers while waiting for Textract to complete. I love the simplicity of your example, but I can't figure out how to use it as a namespace with the await processing. My program runs as an ASHX handler on a web server in C# v5. If I take your example, how would I use the namespace to get the async advantages? The call below just returns immediately, since I am not setting up the call correctly.
AWSTextractSample.Program.StartDetectSampleAsync();