# chand1012/get_all_text.py

## get_all_text.py
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Download every file linked from the textfiles.com e-text section indexes
# into a local ``files`` directory (created in the current working directory).
#
# Steps:
#   1. Fetch the index page of each section listed in ``endings``.
#   2. Collect the target of every <a href=...> on those pages.
#   3. Download each unique target and store it under ``files/``.
#
# NOTE: only the final path segment of a URL is used as the local file name,
# so two links with the same file name overwrite one another.

base_url = "http://www.textfiles.com/etext/"

endings = ['MODERN', 'FICTION', 'NON-FICTION', 'REFERENCE']

# Seconds to wait for a connection / for the next chunk of a response.
# Without a timeout, requests may block indefinitely on a stalled server.
request_timeout = 30

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs('files', exist_ok=True)

urls = []

for ending in endings:
    url = f'{base_url}{ending}/'
    try:
        content = requests.get(url, timeout=request_timeout)
        content.raise_for_status()
    except requests.RequestException as err:
        # Best effort: one unreachable section should not abort the others.
        tqdm.write(f'Skipping index {url}: {err}')
        continue
    soup = BeautifulSoup(content.text, features="html.parser")
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchors without an href (e.g. named anchors) have no target.
            continue
        # urljoin resolves relative, root-relative and absolute hrefs alike.
        urls.append(urljoin(url, href))

# De-duplicate; sorting makes the download order reproducible between runs.
urls = sorted(set(urls))

for link in tqdm(urls):
    parts = urlsplit(link)
    # Take the name from the path only, so query strings and fragments never
    # end up in the local file name.
    filename = parts.path.split("/")[-1]
    if parts.scheme not in ('http', 'https') or not filename:
        # Not downloadable as a file: mailto:/ftp: links, or a directory URL
        # whose path ends in "/" and therefore has no file name.
        continue
    try:
        content = requests.get(link, timeout=request_timeout)
        content.raise_for_status()
    except requests.RequestException as err:
        tqdm.write(f'Skipping {link}: {err}')
        continue
    # Open the output file only after a successful download, so a failed
    # request never leaves an empty or error-page file behind.
    with open(os.path.join(os.getcwd(), 'files', filename), 'wb') as f:
        f.write(content.content)