Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created April 6, 2022 21:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save thisismattmiller/40fc028bd7f329cf03e3a6f2bb322951 to your computer and use it in GitHub Desktop.
Save thisismattmiller/40fc028bd7f329cf03e3a6f2bb322951 to your computer and use it in GitHub Desktop.
Code for video demo: https://youtu.be/TDTmlGyeNp8
import requests
import shutil
import camelot.io as camelot
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# For each report year 2011-2016: download the yearly recycling report PDF,
# split it into one PDF per page, and export the first table camelot detects
# on each page to csv/{year}_{page}.csv.

# Both output directories are written to below; create them up front so a
# fresh checkout does not fail on the first open().
os.makedirs("data", exist_ok=True)
os.makedirs("csv", exist_ok=True)

for year in range(2011, 2017):
    url = f'https://files.dep.state.pa.us/Waste/Recycling/RecyclingPortalFiles/Documents/{year}_Recycling_Report.pdf'
    print(url)
    file_name = f"data/{year}.pdf"

    with requests.get(url, stream=True) as r:
        # Stop on 4xx/5xx instead of saving an error page under a .pdf name,
        # which would only surface later as a confusing PDF parse failure.
        r.raise_for_status()
        with open(file_name, 'wb') as f:
            # iter_content streams the body in chunks and undoes any
            # gzip/deflate encoding, which copying r.raw directly does not.
            for chunk in r.iter_content(chunk_size=65536):
                f.write(chunk)

    pdf = PdfFileReader(file_name)
    for page in range(pdf.getNumPages()):
        # A fresh writer per page so each output file holds exactly one page.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))

        output_filename = f"data/{year}_{page}.pdf"
        print(output_filename)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

        # At this point the individual page file exists on disk, so camelot
        # can be pointed at it.
        tables = camelot.read_pdf(output_filename)
        print(len(tables))

        # Pages without a detectable table (covers, prose, charts) yield an
        # empty result; skip them rather than crash on tables[0].
        if len(tables) == 0:
            continue

        tables[0].to_csv(f"csv/{year}_{page}.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment