Skip to content

Instantly share code, notes, and snippets.

@cacharle
Created June 27, 2020 07:24
Show Gist options
  • Save cacharle/629a688a5e183b8ea6a8429ce8c89dec to your computer and use it in GitHub Desktop.
Save cacharle/629a688a5e183b8ea6a8429ce8c89dec to your computer and use it in GitHub Desktop.
Script to scrape http://tldp.org/HOWTO/NCURSES-Programming-HOWTO/ because I have bad internet
import os
import requests
from bs4 import BeautifulSoup
url_base = "http://tldp.org/HOWTO/NCURSES-Programming-HOWTO"
dir_name = "ncurses_howto"

# Fetch the table-of-contents page. raise_for_status() raises
# requests.HTTPError (a subclass of IOError, so existing handlers still
# match) and, unlike the old bare `raise IOError`, includes the URL and
# status code in the message.
respond = requests.get(url_base)
respond.raise_for_status()
content = respond.content

# exist_ok=True replaces the old bare `except: pass`, which silently
# swallowed *every* error (permission denied, read-only FS, ...), not
# just "directory already exists".
os.makedirs(dir_name, exist_ok=True)

# Keep a local copy of the index page itself.
with open(os.path.join(dir_name, "index.html"), "wb") as f:
    f.write(content)

# Pull every chapter link out of the table of contents.  Hrefs containing
# '#' are in-page anchors, not separate files, so skip them.
soup = BeautifulSoup(content, "html.parser")
toc = soup.find_all(class_="TOC")[0].dl
links = [
    os.path.join(url_base, a["href"])
    for a in toc.find_all("a")
    if "#" not in a["href"]
]

# Download each chapter next to index.html.
for link in links:
    print(f"requesting {link}")
    respond = requests.get(link)
    respond.raise_for_status()
    file_name = os.path.join(dir_name, os.path.basename(link))
    with open(file_name, "wb") as f:
        f.write(respond.content)
    print(f"{link} saved to {file_name}")
@cacharle
Copy link
Author

Still a fun exercise though

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment