Skip to content

Instantly share code, notes, and snippets.

@dizcza
Last active September 21, 2019 20:57
Show Gist options
  • Save dizcza/23f0984e884ab8f5af26f5b66a02ab90 to your computer and use it in GitHub Desktop.
Save dizcza/23f0984e884ab8f5af26f5b66a02ab90 to your computer and use it in GitHub Desktop.
Download best books from ukrlib
import time
import urllib.request
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import trange, tqdm
# Listing page of the ukrlib book rating; `page_id` selects the result page.
url_page_template = "https://ukrlib.com.ua/books/rating.php?so=1&page={page_id}"
# Per-book download endpoint; `tid` is the book id taken from the listing links.
# NOTE(review): `type=3` presumably selects the .docx format (the fallback
# file name below uses a .docx suffix) - confirm against the site.
url_download_template = "https://ukrlib.com.ua/books/getfile.php?tid={tid}&type=3"
# Downloaded files are stored in a "best" folder next to this script.
BEST_BOOKS_DIR = Path(__file__).parent / "best"
# Executed at import time; exist_ok=True keeps repeated runs from failing.
BEST_BOOKS_DIR.mkdir(exist_ok=True)
def _book_filename(headers, tid: str) -> str:
    """Return a safe local file name for a downloaded book.

    Args:
        headers: The response headers object returned by ``response.info()``
            (an ``email.message.Message`` subclass).
        tid: Book id, used to build the ``<tid>.docx`` fallback name.

    Returns:
        A bare file name with no directory components.
    """
    # get_filename() parses Content-Disposition properly (quoted values,
    # extra parameters, other disposition types) instead of assuming the
    # header starts with a fixed "attachment; filename=" prefix.
    name = headers.get_filename() or ""
    # The name is chosen by the server: keep only the last path component so
    # it can never point outside the download directory.
    name = Path(name.replace("\\", "/")).name
    if name in ("", ".", ".."):
        name = Path(f"{tid}.docx".replace("\\", "/")).name
    return name


def download_books_on_page(page_id: int):
    """Download every book linked from one page of the ukrlib rating.

    Fetches the listing page, collects the ``<a class="sr-author">`` links,
    extracts the ``tid`` query value from each link and saves the matching
    file into ``BEST_BOOKS_DIR``. Anchors without a usable ``tid`` are
    skipped. Network errors raised by ``urllib.request.urlopen`` are not
    caught and propagate to the caller.

    Args:
        page_id: Number of the rating page to process.
    """
    url_page = url_page_template.format(page_id=page_id)
    with urllib.request.urlopen(url_page) as response:
        # ISO-8859-1 maps every byte, so decoding cannot fail; only the
        # href attributes are read from the parsed page.
        html = response.read().decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'html.parser')
    book_links = soup.find_all('a', attrs={"class": "sr-author"})
    for a_href in tqdm(book_links, desc=f"Page {page_id:02d}", leave=False):
        href = a_href.get("href") or ""
        # Links look like "printit.php?tid=<id>"; take the value after
        # "tid=" (up to any following parameter) rather than slicing off a
        # fixed number of characters.
        tid = href.partition("tid=")[2].split("&", 1)[0]
        if not tid:
            continue
        url_download = url_download_template.format(tid=tid)
        with urllib.request.urlopen(url_download) as response:
            fname = _book_filename(response.info(), tid)
            content = response.read()
        with open(BEST_BOOKS_DIR / fname, 'wb') as f:
            f.write(content)
        # Short pause between downloads to avoid hammering the server.
        time.sleep(0.01)
# Script entry point: walk rating pages 1 through 20 with a progress bar.
if __name__ == "__main__":
    page_progress = trange(1, 21, desc=f"Downloading best books to {BEST_BOOKS_DIR}")
    for page_id in page_progress:
        download_books_on_page(page_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment