Skip to content

Instantly share code, notes, and snippets.

@cacharle
Created June 27, 2020 07:24
Show Gist options
  • Save cacharle/629a688a5e183b8ea6a8429ce8c89dec to your computer and use it in GitHub Desktop.
Save cacharle/629a688a5e183b8ea6a8429ce8c89dec to your computer and use it in GitHub Desktop.
Script to scrape http://tldp.org/HOWTO/NCURSES-Programming-HOWTO/ because I have bad internet
import os
import requests
from bs4 import BeautifulSoup
url_base = "http://tldp.org/HOWTO/NCURSES-Programming-HOWTO"
dir_name = "ncurses_howto"

# Fetch the table-of-contents page. raise_for_status() raises
# requests.HTTPError (a subclass of IOError, so existing handlers still
# match) and, unlike the old bare `raise IOError`, includes the URL and
# status code in the message.
respond = requests.get(url_base)
respond.raise_for_status()
content = respond.content

# exist_ok=True replaces the old bare `except: pass`, which silently
# swallowed *every* error (permission denied, read-only FS, ...), not
# just "directory already exists".
os.makedirs(dir_name, exist_ok=True)

# Keep a local copy of the index page itself.
with open(os.path.join(dir_name, "index.html"), "wb") as f:
    f.write(content)

# Pull every chapter link out of the table of contents.  Hrefs containing
# '#' are in-page anchors, not separate files, so skip them.
soup = BeautifulSoup(content, "html.parser")
toc = soup.find_all(class_="TOC")[0].dl
links = [
    os.path.join(url_base, a["href"])
    for a in toc.find_all("a")
    if "#" not in a["href"]
]

# Download each chapter next to index.html.
for link in links:
    print(f"requesting {link}")
    respond = requests.get(link)
    respond.raise_for_status()
    file_name = os.path.join(dir_name, os.path.basename(link))
    with open(file_name, "wb") as f:
        f.write(respond.content)
    print(f"{link} saved to {file_name}")
@cacharle
Copy link
Author

Still a fun exercise though

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment