This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pypdf import PdfReader | |
# Path to the PDF file | |
pdf_file = "pdf-to-extract-text/input.pdf" | |
# Create a PDF reader object | |
reader = PdfReader(pdf_file) | |
# Loop through the pages and extract text | |
for page_number, page in enumerate(reader.pages, start=1): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import required libraries | |
from pdf2image import convert_from_path | |
from PIL import Image | |
# Specify the path to the PDF file | |
pdf_path = 'sample.pdf' | |
# Convert PDF to a list of images | |
try: | |
images = convert_from_path(pdf_path) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pypdf import PdfReader | |
# Path to the PDF file | |
pdf_file = "pdf-to-extract-text/input.pdf" | |
# Create a PDF reader object | |
reader = PdfReader(pdf_file) | |
# Loop through the pages and extract text | |
for page_number, page in enumerate(reader.pages, start=1): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Extracted Text using PyMuPDF: | |
This is a sample pdf. Page 1 | |
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been | |
the | |
industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type | |
and scrambled it to make a type specimen book. It has survived not only five centuries, but also the | |
leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with | |
the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop | |
publishing software like Aldus PageMaker including versions of Lorem Ipsum. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fitz # PyMuPDF library | |
# Specify the PDF file path | |
pdf_file_path = "sample.pdf" | |
# Open the PDF file | |
pdf_document = fitz.open(pdf_file_path) | |
# Initialize a variable to store the extracted text | |
extracted_text = "" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Extracted Text using pypdf: | |
This is a sample pdf. Page 1 | |
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the | |
industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type | |
and scrambled it to make a type specimen book. It has survived not only five centuries, but also the | |
leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with | |
the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop | |
publishing software like Aldus PageMaker including versions of Lorem Ipsum. | |
This is sample pdf. Page 2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pypdf import PdfReader | |
# Specify the PDF file path | |
pdf_file_path = "sample.pdf" | |
# Create a PDF reader object | |
reader = PdfReader(pdf_file_path) | |
# Initialize a variable to store the extracted text | |
extracted_text = "" |
NewerOlder