Skip to content

Instantly share code, notes, and snippets.

@chand1012
Last active January 26, 2021 16:56
Show Gist options
  • Save chand1012/158acd0c5307ac9deb501cfb71387d3c to your computer and use it in GitHub Desktop.
Save chand1012/158acd0c5307ac9deb501cfb71387d3c to your computer and use it in GitHub Desktop.
Gets a lot of text from http://www.textfiles.com/etext/
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Download every file linked from the category listing pages of the
# textfiles.com e-text archive into ./files (relative to the current
# working directory).

# Root of the archive; each entry in `endings` is a category sub-directory
# whose listing page is scanned for links.
base_url = "http://www.textfiles.com/etext/"
endings = ['MODERN', 'FICTION', 'NON-FICTION', 'REFERENCE']

# Seconds to wait on a single HTTP request, so one stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# exist_ok avoids the check-then-create race of a separate exists() test.
output_dir = os.path.join(os.getcwd(), 'files')
os.makedirs(output_dir, exist_ok=True)

# Gather into a set so a link that appears more than once is fetched once.
found = set()
for ending in endings:
    url = f'{base_url}{ending}/'
    try:
        content = requests.get(url, timeout=REQUEST_TIMEOUT)
        content.raise_for_status()
    except requests.RequestException as err:
        # One unreachable category should not abort the others.
        print(f'Skipping listing {url}: {err}')
        continue
    soup = BeautifulSoup(content.text, features="html.parser")
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchors without an href (e.g. named anchors) have no target.
            continue
        # urljoin resolves relative, root-relative and absolute hrefs alike,
        # where plain string concatenation only works for bare file names.
        target = urljoin(url, href)
        if not target.startswith(url):
            # Ignore navigation links that lead outside this category.
            continue
        found.add(target)

# Sorted so the download order is reproducible from run to run.
urls = sorted(found)

for link in tqdm(urls):
    # Take the name from the URL path only, so query strings and fragments
    # never end up in the file name.
    filename = urlsplit(link).path.rsplit('/', 1)[-1]
    if not filename:
        # The link points at a directory, not a file; nothing to save.
        continue
    # Fetch before opening the output file so a failed download does not
    # leave an empty or error-page file behind.
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        # tqdm.write prints without corrupting the progress bar.
        tqdm.write(f'Skipping {link}: {err}')
        continue
    with open(os.path.join(output_dir, filename), 'wb') as f:
        f.write(response.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment