Skip to content

Instantly share code, notes, and snippets.

@rudda
Created February 5, 2023 04:01
Show Gist options
  • Save rudda/9d65b44c22b42f812b0366983a6f5175 to your computer and use it in GitHub Desktop.
Save rudda/9d65b44c22b42f812b0366983a6f5175 to your computer and use it in GitHub Desktop.
Code template: read a PDF file from a URL with Python
import requests
import io
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def download_pdf(url, timeout=30):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error page that
    # would later be handed to the PDF parser as if it were a document.
    response.raise_for_status()
    return response.content
def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_content: The PDF document as ``bytes``.

    Returns:
        The text of all pages, concatenated in page order, as one ``str``.
    """
    resource_manager = PDFResourceManager()
    # Both in-memory streams are closed on exit, even if parsing fails.
    with io.StringIO() as text_buffer, BytesIO(pdf_content) as pdf_io:
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            pages = PDFPage.get_pages(
                pdf_io, caching=True, check_extractable=True
            )
            for page in pages:
                page_interpreter.process_page(page)
            # The buffer accumulates the output of every processed page, so
            # a single read after the loop replaces per-page string
            # concatenation and buffer resets.
            return text_buffer.getvalue()
        finally:
            converter.close()
if __name__ == '__main__':
    # Example run: fetch the PDF at the placeholder address below and
    # print the text extracted from it.
    url = 'http://www.host.com/assets/file.pdf'
    print(extract_text_from_pdf(download_pdf(url)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment