Skip to content

Instantly share code, notes, and snippets.

@Tokariew
Last active April 29, 2023 14:33
Show Gist options
  • Save Tokariew/b9ad5501b6a11ac3619ec045e4202d6f to your computer and use it in GitHub Desktop.
Save Tokariew/b9ad5501b6a11ac3619ec045e4202d6f to your computer and use it in GitHub Desktop.
#!/bin/python3
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from slugify import slugify as restring
from termcolor import colored
# script to download books from wolnelektury.pl site
base_url = 'https://wolnelektury.pl'
# Fetch the catalogue page once at start-up.  A timeout keeps the script from
# hanging forever on a stalled connection, and raise_for_status() stops early
# with a clear HTTP error instead of trying to parse an error page.
katalog_response = requests.get(f'{base_url}/katalog', timeout=60)
katalog_response.raise_for_status()
katalog_page = katalog_response.content
soup = BeautifulSoup(katalog_page, 'lxml')
# The book anchors are looked up inside the <div class="plain-list"> element.
katalog = soup.find("div", {"class": "plain-list"})
if katalog is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise SystemExit(
        f'Could not find the book list on {base_url}/katalog '
        '(no <div class="plain-list"> in the page)'
    )
# Every anchor carrying an href inside the list is treated as one book.
books_links = katalog.find_all('a', href=True)
def get_book(book, timeout=30):
    """Download the media files linked from a single book page.

    The book page is fetched and parsed for the author, the title and the
    download links.  Files are written to ``<author>/<title>/`` (both parts
    slugified) relative to the current working directory.  A file that
    already exists locally with the size announced by the server is skipped.

    Args:
        book: Anchor element of the catalogue; ``book["href"]`` is the path
            of the book page relative to ``base_url``.
        timeout: Seconds to wait for the server on each request before the
            request is given up.

    Returns:
        None.  Failures are printed in red and the affected book (or the
        affected single file) is skipped instead of raising.
    """
    page_url = f'{base_url}{book["href"]}'
    try:
        book_page = requests.get(page_url, timeout=timeout)
    except requests.RequestException:
        print(colored(f'Error when getting: {page_url}', 'red'))
        return
    if book_page.status_code != 200:
        print(colored(f'Error when getting: {page_url}', 'red'))
        return

    book_soup = BeautifulSoup(book_page.content, 'lxml')
    header = book_soup.find("div", {"class": "l-header__content"})
    down = book_soup.find("div", {'class': 'c-media__popup__box__items'})
    title_tag = header.find("h1") if header is not None else None
    if title_tag is None or down is None:
        # Without a title or a download box there is nothing usable here.
        print(colored(f'Error when getting: {page_url}', 'red'))
        return

    author_tag = header.find("p")
    author_text = author_tag.text.rstrip() if author_tag is not None else ''
    # An empty slug would turn "<author>/<title>" into the absolute path
    # "/<title>", so fall back to the placeholder in that case as well.
    author = restring(author_text) or 'NoAuthor'
    # get_text() also copes with an <h1> that has nested markup, where
    # .string would be None.
    title = restring(title_tag.get_text()) or 'NoTitle'

    path = Path(author) / title
    if not path.exists():
        print(colored(f'New book in {path}', 'green'))
    path.mkdir(parents=True, exist_ok=True)

    for anchor in down.find_all('a', href=True):
        href = anchor['href']
        # Only media files are wanted; DAISY archives and audio EPUBs are
        # deliberately left out.
        if '/media/' not in href or any(
            elem in href for elem in ('daisy.zip', 'audio.epub')
        ):
            continue
        # restring presumably replaces the '.' before the extension with '-';
        # the last '-' of the slug is turned back into '.' here.
        name = restring(href[href.rfind('/') + 1:])
        name = '.'.join(name.rsplit('-', 1))
        target = path / name
        file_url = f"{base_url}{href}"
        try:
            with requests.get(file_url, stream=True, timeout=timeout) as r:
                if r.status_code != 200:
                    print(colored(f'Error when getting: {file_url}', 'red'))
                    continue
                # The header may be absent; then the size check is skipped
                # and the file is downloaded unconditionally.
                content_length = r.headers.get('content-length')
                if (
                    content_length is not None
                    and content_length.isdigit()
                    and target.exists()
                    and int(content_length) == target.stat().st_size
                ):
                    # Already downloaded: go on with the next format rather
                    # than abandoning the remaining files of this book.
                    continue
                print(target)
                with open(target, 'wb') as file:
                    for chunk in r.iter_content(chunk_size=1048576):
                        file.write(chunk)
        except requests.RequestException:
            print(colored(f'Error when getting: {file_url}', 'red'))
# Download all books concurrently.  Each book is submitted on its own so that
# an exception in one worker is reported for that book only; iterating
# executor.map() would re-raise the first failure and cancel every book that
# has not started yet.
with ThreadPoolExecutor(max_workers=72) as executor:
    jobs = [(book, executor.submit(get_book, book)) for book in books_links]
    for book, job in jobs:
        # exception() blocks until the job has finished and returns None on
        # success.
        error = job.exception()
        if error is not None:
            print(
                colored(
                    f'Error when getting: {base_url}{book["href"]} ({error!r})',
                    'red',
                )
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment