Created
February 5, 2023 04:01
-
-
Save rudda/9d65b44c22b42f812b0366983a6f5175 to your computer and use it in GitHub Desktop.
Code template: read a PDF file from a URL with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import io | |
from io import BytesIO | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
def download_pdf(url, timeout=30):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``. Without a timeout the
            request can block forever on an unresponsive host.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on error statuses so an HTML error page is never handed to
    # the PDF parser as if it were the document.
    response.raise_for_status()
    return response.content
def extract_text_from_pdf(pdf_content):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_content: The complete PDF document as ``bytes``.

    Returns:
        The text that pdfminer's ``TextConverter`` emits for all pages,
        concatenated in page order.

    Raises:
        Any exception raised by pdfminer while parsing or interpreting the
        document propagates to the caller; the converter and the text
        buffer are closed first.
    """
    resource_manager = PDFResourceManager()
    # The context manager guarantees the buffer is closed even when parsing
    # fails part-way through the document.
    with io.StringIO() as string_io:
        converter = TextConverter(
            resource_manager, string_io, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            pages = PDFPage.get_pages(
                BytesIO(pdf_content), caching=True, check_extractable=True
            )
            for page in pages:
                page_interpreter.process_page(page)
            # The converter appends each page to the same buffer, so a single
            # read after the loop yields all pages in order.
            return string_io.getvalue()
        finally:
            converter.close()
if __name__ == '__main__':
    # Demo run: fetch the sample document, extract its text, and print it.
    print(
        extract_text_from_pdf(
            download_pdf('http://www.host.com/assets/file.pdf')
        )
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment