simonw/ocr-pdf.md Secret

## ocr-pdf.md

      
    Raw
  

              ocr-pdf.md
            
          
    2024-03-29T20:50:47    conversation: 01ht5zpx4tdx6xd00y03mxz7wj

Model: claude-3-opus-20240229
Prompt:

This code shows how to open a PDF and turn it into an image per page:
<!DOCTYPE html>
<html>
<head>
  <title>PDF to Images</title>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <style>
    .image-container img {
      margin-bottom: 10px;
    }
    .image-container p {
      margin: 0;
      font-size: 14px;
      color: #888;
    }
  </style>
</head>
<body>
  <input type="file" id="fileInput" accept=".pdf" />
  <div class="image-container"></div>

  <script>
  const desiredWidth = 800;
    const fileInput = document.getElementById('fileInput');
    const imageContainer = document.querySelector('.image-container');

    fileInput.addEventListener('change', handleFileUpload);

    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';

    async function handleFileUpload(event) {
      const file = event.target.files[0];
      const imageIterator = convertPDFToImages(file);

      for await (const { imageURL, size } of imageIterator) {
        const imgElement = document.createElement('img');
        imgElement.src = imageURL;
        imageContainer.appendChild(imgElement);

        const sizeElement = document.createElement('p');
        sizeElement.textContent = `Size: ${formatSize(size)}`;
        imageContainer.appendChild(sizeElement);
      }
    }

    async function* convertPDFToImages(file) {
      try {
        const pdf = await pdfjsLib.getDocument(URL.createObjectURL(file)).promise;
        const numPages = pdf.numPages;

        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const viewport = page.getViewport({ scale: 1 });
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = (desiredWidth / viewport.width) * viewport.height;
          const renderContext = {
            canvasContext: context,
            viewport: page.getViewport({ scale: desiredWidth / viewport.width }),
          };
          await page.render(renderContext).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          const size = calculateSize(imageURL);
          yield { imageURL, size };
        }
      } catch (error) {
        console.error('Error:', error);
      }
    }

    function calculateSize(imageURL) {
      const base64Length = imageURL.length - 'data:image/jpeg;base64,'.length;
      const sizeInBytes = Math.ceil(base64Length * 0.75);
      return sizeInBytes;
    }

    function formatSize(size) {
      const sizeInKB = (size / 1024).toFixed(2);
      return `${sizeInKB} KB`;
    }
  </script>
</body>
</html>
This code shows how to OCR an image:
async function ocrMissingAltText() {
    // Load Tesseract
    var s = document.createElement("script");
    s.src = "https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js";
    document.head.appendChild(s);

    s.onload = async () => {
      const images = document.getElementsByTagName("img");
      const worker = Tesseract.createWorker();
      await worker.load();
      await worker.loadLanguage("eng");
      await worker.initialize("eng");
      ocrButton.innerText = "Running OCR...";

      // Iterate through all the images in the output div
      for (const img of images) {
        const altTextarea = img.parentNode.querySelector(".textarea-alt");
        // Check if the alt textarea is empty
        if (altTextarea.value === "") {
          const imageUrl = img.src;
          var {
            data: { text },
          } = await worker.recognize(imageUrl);
          altTextarea.value = text; // Set the OCR result to the alt textarea
          progressBar.value += 1;
        }
      }

      await worker.terminate();
      ocrButton.innerText = "OCR complete";
    };
  }
Use these examples to put together a single HTML page with embedded HTML and CSS and JavaScript that provides a big square which users can drag and drop a PDF file onto and when they do that the PDF has every page converted to a JPEG and shown below on the page, then OCR is run with tesseract and the results are shown in textarea blocks below each image.
Response:

Here's an HTML page that combines the PDF to image conversion and OCR functionality using the provided code examples:
<!DOCTYPE html>
<html>
<head>
  <title>PDF to Images with OCR</title>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
  <style>
    .dropzone {
      width: 400px;
      height: 400px;
      border: 2px dashed #ccc;
      display: flex;
      justify-content: center;
      align-items: center;
      font-size: 24px;
      cursor: pointer;
    }
    .image-container img {
      margin-bottom: 10px;
    }
    .image-container p {
      margin: 0;
      font-size: 14px;
      color: #888;
    }
    .textarea-alt {
      width: 100%;
      height: 100px;
      margin-bottom: 20px;
    }
  </style>
</head>
<body>
  <div class="dropzone" id="dropzone">
    Drag and drop PDF file here
  </div>
  <div class="image-container"></div>

  <script>
    const desiredWidth = 800;
    const dropzone = document.getElementById('dropzone');
    const imageContainer = document.querySelector('.image-container');

    dropzone.addEventListener('dragover', handleDragOver);
    dropzone.addEventListener('drop', handleDrop);

    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';

    async function handleDragOver(event) {
      event.preventDefault();
    }

    async function handleDrop(event) {
      event.preventDefault();
      const file = event.dataTransfer.files[0];
      const imageIterator = convertPDFToImages(file);

      for await (const { imageURL, size } of imageIterator) {
        const imgElement = document.createElement('img');
        imgElement.src = imageURL;
        imageContainer.appendChild(imgElement);

        const sizeElement = document.createElement('p');
        sizeElement.textContent = `Size: ${formatSize(size)}`;
        imageContainer.appendChild(sizeElement);

        const altTextarea = document.createElement('textarea');
        altTextarea.classList.add('textarea-alt');
        imageContainer.appendChild(altTextarea);
      }

      await ocrImages();
    }

    async function* convertPDFToImages(file) {
      try {
        const pdf = await pdfjsLib.getDocument(URL.createObjectURL(file)).promise;
        const numPages = pdf.numPages;

        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const viewport = page.getViewport({ scale: 1 });
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = (desiredWidth / viewport.width) * viewport.height;
          const renderContext = {
            canvasContext: context,
            viewport: page.getViewport({ scale: desiredWidth / viewport.width }),
          };
          await page.render(renderContext).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          const size = calculateSize(imageURL);
          yield { imageURL, size };
        }
      } catch (error) {
        console.error('Error:', error);
      }
    }

    function calculateSize(imageURL) {
      const base64Length = imageURL.length - 'data:image/jpeg;base64,'.length;
      const sizeInBytes = Math.ceil(base64Length * 0.75);
      return sizeInBytes;
    }

    function formatSize(size) {
      const sizeInKB = (size / 1024).toFixed(2);
      return `${sizeInKB} KB`;
    }

    async function ocrImages() {
      const images = document.getElementsByTagName("img");
      const worker = Tesseract.createWorker();
      await worker.load();
      await worker.loadLanguage("eng");
      await worker.initialize("eng");

      for (const img of images) {
        const altTextarea = img.parentNode.querySelector(".textarea-alt");
        if (altTextarea.value === "") {
          const imageUrl = img.src;
          var {
            data: { text },
          } = await worker.recognize(imageUrl);
          altTextarea.value = text;
        }
      }

      await worker.terminate();
    }
  </script>
</body>
</html>
This code combines the PDF to image conversion and OCR functionality into a single HTML page. Here's how it works:

The user can drag and drop a PDF file onto the designated dropzone.
When a file is dropped, the handleDrop function is triggered, which converts each page of the PDF into a JPEG image using the convertPDFToImages function.
Each converted image is displayed on the page along with its size information.
Below each image, an empty textarea is created to store the OCR result.
After all the images are converted and displayed, the ocrImages function is called, which uses Tesseract.js to perform OCR on each image.
The OCR result for each image is then inserted into the corresponding textarea.

Note: Make sure to include the necessary JavaScript libraries (pdf.js and Tesseract.js) in the code for it to work properly.
This HTML page provides a user-friendly interface for converting a PDF file to images, performing OCR on those images, and displaying the results below each image.
2024-03-29T20:57:39

Prompt:

Modify this to also have a file input that can be used - dropping a file onto the drop area fills that input
make the drop zone 100% wide but have a 2em padding on the body. it should be 10em high. it should turn pink when an image is dragged over it.
Each textarea should be 100% wide and 10em high
At the very bottom of the page add a h2 that says Full document - then a 30em high textarea with all of the page text in it separated by two newlines
Response:

Here's the modified HTML code with the requested changes:
<!DOCTYPE html>
<html>
<head>
  <title>PDF to Images with OCR</title>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
  <style>
    body {
      padding: 2em;
    }
    .dropzone {
      width: 100%;
      height: 10em;
      border: 2px dashed #ccc;
      display: flex;
      justify-content: center;
      align-items: center;
      font-size: 24px;
      cursor: pointer;
    }
    .dropzone.drag-over {
      background-color: pink;
    }
    .image-container img {
      margin-bottom: 10px;
    }
    .image-container p {
      margin: 0;
      font-size: 14px;
      color: #888;
    }
    .textarea-alt {
      width: 100%;
      height: 10em;
      margin-bottom: 20px;
    }
    .full-document {
      width: 100%;
      height: 30em;
      margin-top: 20px;
    }
  </style>
</head>
<body>
  <input type="file" id="fileInput" accept=".pdf" style="display: none;" />
  <div class="dropzone" id="dropzone">
    Drag and drop PDF file here or click to select a file
  </div>
  <div class="image-container"></div>
  <h2>Full document</h2>
  <textarea class="full-document" id="fullDocument"></textarea>

  <script>
    const desiredWidth = 800;
    const dropzone = document.getElementById('dropzone');
    const fileInput = document.getElementById('fileInput');
    const imageContainer = document.querySelector('.image-container');
    const fullDocumentTextarea = document.getElementById('fullDocument');

    dropzone.addEventListener('dragover', handleDragOver);
    dropzone.addEventListener('dragleave', handleDragLeave);
    dropzone.addEventListener('drop', handleDrop);
    dropzone.addEventListener('click', handleClick);

    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';

    async function handleDragOver(event) {
      event.preventDefault();
      dropzone.classList.add('drag-over');
    }

    async function handleDragLeave(event) {
      event.preventDefault();
      dropzone.classList.remove('drag-over');
    }

    async function handleDrop(event) {
      event.preventDefault();
      dropzone.classList.remove('drag-over');
      const file = event.dataTransfer.files[0];
      fileInput.files = event.dataTransfer.files;
      processFile(file);
    }

    async function handleClick() {
      fileInput.click();
    }

    fileInput.addEventListener('change', (event) => {
      const file = event.target.files[0];
      processFile(file);
    });

    async function processFile(file) {
      const imageIterator = convertPDFToImages(file);
      let fullText = '';

      for await (const { imageURL, size } of imageIterator) {
        const imgElement = document.createElement('img');
        imgElement.src = imageURL;
        imageContainer.appendChild(imgElement);

        const sizeElement = document.createElement('p');
        sizeElement.textContent = `Size: ${formatSize(size)}`;
        imageContainer.appendChild(sizeElement);

        const altTextarea = document.createElement('textarea');
        altTextarea.classList.add('textarea-alt');
        imageContainer.appendChild(altTextarea);

        const { text } = await ocrImage(imageURL);
        altTextarea.value = text;
        fullText += text + '\n\n';
      }

      fullDocumentTextarea.value = fullText.trim();
    }

    async function* convertPDFToImages(file) {
      try {
        const pdf = await pdfjsLib.getDocument(URL.createObjectURL(file)).promise;
        const numPages = pdf.numPages;

        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const viewport = page.getViewport({ scale: 1 });
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = (desiredWidth / viewport.width) * viewport.height;
          const renderContext = {
            canvasContext: context,
            viewport: page.getViewport({ scale: desiredWidth / viewport.width }),
          };
          await page.render(renderContext).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          const size = calculateSize(imageURL);
          yield { imageURL, size };
        }
      } catch (error) {
        console.error('Error:', error);
      }
    }

    function calculateSize(imageURL) {
      const base64Length = imageURL.length - 'data:image/jpeg;base64,'.length;
      const sizeInBytes = Math.ceil(base64Length * 0.75);
      return sizeInBytes;
    }

    function formatSize(size) {
      const sizeInKB = (size / 1024).toFixed(2);
      return `${sizeInKB} KB`;
    }

    async function ocrImage(imageUrl) {
      const worker = Tesseract.createWorker();
      await worker.load();
      await worker.loadLanguage("eng");
      await worker.initialize("eng");

      const {
        data: { text },
      } = await worker.recognize(imageUrl);

      await worker.terminate();
      return { text };
    }
  </script>
</body>
</html>
Here are the changes made to the code:

Added a hidden file input element that gets filled when a file is dropped onto the dropzone or clicked.
Modified the dropzone styles to be 100% wide, have a 10em height, and turn pink when an image is dragged over it.
Added event listeners for drag over, drag leave, drop, and click events on the dropzone.
When a file is selected (either by dropping or clicking), the processFile function is called to handle the file.
Each textarea is now 100% wide and 10em high.
At the bottom of the page, added an h2 element that says "Full document" and a 30em high textarea with the ID fullDocument.
After OCR is performed on each image, the text is appended to the fullText variable with two newline characters as separators.
Finally, the fullText is set as the value of the fullDocument textarea.

Now, users can either drag and drop a PDF file onto the dropzone or click on it to select a file using the file input. The selected file will be processed, converted to images, OCR will be performed on each image, and the results will be displayed in the individual textareas as well as in the full document textarea at the bottom of the page.
2024-03-29T20:59:41

Prompt:

get rid of the code that shows image sizes. Set the placeholder on each textarea to be Processing... and clear that placeholder when the job is done.
Response:

Here's the updated HTML code with the requested changes:
<!DOCTYPE html>
<html>
<head>
  <title>PDF to Images with OCR</title>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
  <style>
    body {
      padding: 2em;
    }
    .dropzone {
      width: 100%;
      height: 10em;
      border: 2px dashed #ccc;
      display: flex;
      justify-content: center;
      align-items: center;
      font-size: 24px;
      cursor: pointer;
    }
    .dropzone.drag-over {
      background-color: pink;
    }
    .image-container img {
      margin-bottom: 10px;
    }
    .textarea-alt {
      width: 100%;
      height: 10em;
      margin-bottom: 20px;
    }
    .full-document {
      width: 100%;
      height: 30em;
      margin-top: 20px;
    }
  </style>
</head>
<body>
  <input type="file" id="fileInput" accept=".pdf" style="display: none;" />
  <div class="dropzone" id="dropzone">
    Drag and drop PDF file here or click to select a file
  </div>
  <div class="image-container"></div>
  <h2>Full document</h2>
  <textarea class="full-document" id="fullDocument"></textarea>

  <script>
    const desiredWidth = 800;
    const dropzone = document.getElementById('dropzone');
    const fileInput = document.getElementById('fileInput');
    const imageContainer = document.querySelector('.image-container');
    const fullDocumentTextarea = document.getElementById('fullDocument');

    dropzone.addEventListener('dragover', handleDragOver);
    dropzone.addEventListener('dragleave', handleDragLeave);
    dropzone.addEventListener('drop', handleDrop);
    dropzone.addEventListener('click', handleClick);

    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';

    async function handleDragOver(event) {
      event.preventDefault();
      dropzone.classList.add('drag-over');
    }

    async function handleDragLeave(event) {
      event.preventDefault();
      dropzone.classList.remove('drag-over');
    }

    async function handleDrop(event) {
      event.preventDefault();
      dropzone.classList.remove('drag-over');
      const file = event.dataTransfer.files[0];
      fileInput.files = event.dataTransfer.files;
      processFile(file);
    }

    async function handleClick() {
      fileInput.click();
    }

    fileInput.addEventListener('change', (event) => {
      const file = event.target.files[0];
      processFile(file);
    });

    async function processFile(file) {
      const imageIterator = convertPDFToImages(file);
      let fullText = '';

      for await (const { imageURL } of imageIterator) {
        const imgElement = document.createElement('img');
        imgElement.src = imageURL;
        imageContainer.appendChild(imgElement);

        const altTextarea = document.createElement('textarea');
        altTextarea.classList.add('textarea-alt');
        altTextarea.placeholder = 'Processing...';
        imageContainer.appendChild(altTextarea);

        const { text } = await ocrImage(imageURL);
        altTextarea.value = text;
        altTextarea.placeholder = '';
        fullText += text + '\n\n';
      }

      fullDocumentTextarea.value = fullText.trim();
    }

    async function* convertPDFToImages(file) {
      try {
        const pdf = await pdfjsLib.getDocument(URL.createObjectURL(file)).promise;
        const numPages = pdf.numPages;

        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const viewport = page.getViewport({ scale: 1 });
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = (desiredWidth / viewport.width) * viewport.height;
          const renderContext = {
            canvasContext: context,
            viewport: page.getViewport({ scale: desiredWidth / viewport.width }),
          };
          await page.render(renderContext).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          yield { imageURL };
        }
      } catch (error) {
        console.error('Error:', error);
      }
    }

    async function ocrImage(imageUrl) {
      const worker = Tesseract.createWorker();
      await worker.load();
      await worker.loadLanguage("eng");
      await worker.initialize("eng");

      const {
        data: { text },
      } = await worker.recognize(imageUrl);

      await worker.terminate();
      return { text };
    }
  </script>
</body>
</html>
Here are the changes made to the code:

Removed the code that shows image sizes.
Set the placeholder attribute of each textarea element to "Processing..." initially.
After OCR is performed on each image and the text is set as the value of the corresponding textarea, the placeholder is cleared.

Now, when a PDF file is processed, each textarea will display "Processing..." as a placeholder while OCR is being performed. Once the OCR is complete and the text is set as the value of the textarea, the placeholder will be cleared.