Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Created December 16, 2023 19:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ehzawad/0bb796629057db5d840fee0ecfb292ce to your computer and use it in GitHub Desktop.
Save ehzawad/0bb796629057db5d840fee0ecfb292ce to your computer and use it in GitHub Desktop.
```python
from PyPDF2 import PdfReader, PdfWriter
def extract_pages(pdf_path: str, start_page: int, end_page: int, output_path: str) -> None:
    """Copy an inclusive range of pages from one PDF into a new PDF file.

    Args:
        pdf_path: Path of the source PDF.
        start_page: First page to copy, counted from 1.
        end_page: Last page to copy, counted from 1 and included in the output.
            If it is smaller than ``start_page`` no pages are copied.
        output_path: Path of the PDF file to write.

    Raises:
        ValueError: If ``start_page`` is less than 1.
        IndexError: If ``end_page`` lies beyond the last page of the source.
    """
    # Reject non-positive pages before touching the file: after the -1 shift
    # below they would become negative indices, which select pages counted
    # from the end of the document instead of failing.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)
    if end_page > page_count:
        raise IndexError(
            f"end_page {end_page} is out of range; the document has {page_count} pages"
        )

    writer = PdfWriter()
    # Page numbers are zero-indexed in PyPDF2, hence the -1
    for page_num in range(start_page - 1, end_page):
        writer.add_page(reader.pages[page_num])

    with open(output_path, 'wb') as output_pdf:
        writer.write(output_pdf)
# Example usage: copy pages 328-371 of the source PDF into consciousness.pdf.
# NOTE(review): the +35 presumably shifts printed page numbers to PDF page
# positions (front matter) - confirm against the document.
extract_pages(
    pdf_path='/Users/ehz/Downloads/psyche.pdf',
    start_page=293 + 35,
    end_page=336 + 35,
    output_path='consciousness.pdf',
)
```
@ehzawad
Copy link
Author

ehzawad commented Dec 18, 2023

import html
import os

import pdfkit
import requests

class Page:
    """Download the plain-text extract of a Wikipedia article and save it as a PDF."""

    # Seconds to wait for the Wikipedia API. Without a timeout, requests.get
    # can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Options passed to pdfkit on every conversion; the commented entries
        # are optional settings left disabled.
        self.pdfOptions = {
            'page-size': 'Letter',
            # 'margin-top': '0.75in',
            # 'margin-right': '0.75in',
            # 'margin-bottom': '0.75in',
            # 'margin-left': '0.75in',
            # 'javascript-delay': 2000,
            # 'minimum-font-size': 12
        }
        # PDFs are written to the directory that contains this script.
        self.targetDir = os.path.dirname(os.path.realpath(__file__))

    def getArticle(self, articleTitle):
        """Fetch the plain-text extract of a Wikipedia article.

        Args:
            articleTitle: Title of the article to look up.

        Returns:
            The extract text, or None when the title is blank, the request
            fails, the response is not the expected JSON, or the page has no
            extract.
        """
        # A blank title cannot match an article; skip the network round trip.
        if not articleTitle or not articleTitle.strip():
            return None

        URL = "https://en.wikipedia.org/w/api.php"
        PARAMS = {
            "action": "query",
            "format": "json",
            "titles": articleTitle,
            "prop": "extracts",
            "explaintext": True
        }

        try:
            response = requests.get(url=URL, params=PARAMS, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"An error occurred: {e}")
            return None

        # The API omits "query"/"pages" for some requests, so look the keys up
        # defensively instead of letting a KeyError escape to the caller.
        pages = data.get('query', {}).get('pages', {})
        page = next(iter(pages.values()), None)
        if page is None:
            return None
        return page.get('extract')

    @staticmethod
    def _toHtml(title, text):
        """Wrap plain text in a minimal UTF-8 HTML document that keeps its line breaks."""
        return (
            '<!DOCTYPE html><html><head><meta charset="utf-8">'
            f'<title>{html.escape(title)}</title></head>'
            '<body><div style="white-space: pre-wrap;">'
            f'{html.escape(text)}</div></body></html>'
        )

    def download(self, articleTitle):
        """Save the article's extract as ``<title>.pdf`` in the target directory.

        Prints the output path on success, or a notice when no content could
        be retrieved.

        Args:
            articleTitle: Title of the article to download.
        """
        content = self.getArticle(articleTitle)
        if not content:
            print("Article not found or an error occurred")
            return

        # '/' would otherwise be read as a directory separator in the file name.
        filename = os.path.join(self.targetDir, articleTitle.replace('/', '_') + '.pdf')
        # pdfkit renders its input as HTML, so the raw text is escaped and
        # wrapped first; otherwise newlines collapse and '<' / '&' in the
        # article are interpreted as markup.
        document = self._toHtml(articleTitle, content)
        pdfkit.from_string(document, filename, options=self.pdfOptions)
        print(f"Downloaded: {filename}")

# Usage example: fetch one article and write it as a PDF into the directory
# that Page uses as its target.
exampleTitle = "Python (programming language)"
pageDownloader = Page()
pageDownloader.download(exampleTitle)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment