Skip to content

Instantly share code, notes, and snippets.

@xuzheng465
Created March 7, 2020 03:54
Show Gist options
  • Save xuzheng465/8407251b22639027fe4191d5e30870a1 to your computer and use it in GitHub Desktop.
Save xuzheng465/8407251b22639027fe4191d5e30870a1 to your computer and use it in GitHub Desktop.
仓鼠症爆发
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Page that is scanned for links to PDF files.
url = "https://cs186berkeley.net/"
# Directory the downloaded PDFs are written into (created if missing).
folder = Path("/Users/xuzheng/Desktop/self/cs186-database/test/")
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def find_pdf_links(hrefs, base_url):
    """Return the absolute URLs of all PDF links found in *hrefs*.

    Args:
        hrefs: Iterable of raw ``href`` attribute values; ``None`` and
            empty strings are ignored.
        base_url: URL of the page the hrefs came from, used to resolve
            relative links.

    Returns:
        A list of absolute URLs whose path ends in ``.pdf`` (compared
        case-insensitively), in first-seen order and without duplicates.
    """
    links = []
    seen = set()
    for href in hrefs:
        if not href:
            continue
        # urljoin handles relative, root-relative and already-absolute
        # hrefs alike, unlike plain string concatenation.
        absolute = urljoin(base_url, href)
        # Test the path component only, so a query string or fragment
        # after the file name does not hide the extension.
        if not urlsplit(absolute).path.lower().endswith(".pdf"):
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        links.append(absolute)
    return links


def filename_from_url(file_url):
    """Return the last path segment of *file_url*, without query or fragment."""
    return urlsplit(file_url).path.rsplit("/", 1)[-1]


def download_file(file_url, destination):
    """Stream *file_url* to the file at *destination*.

    Raises:
        requests.RequestException: if the request fails or the server
            answers with an HTTP error status. The status is checked
            before the destination is opened, so an error page is never
            saved under a PDF name.
    """
    with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(destination, "wb") as out:
            # Write in chunks so a large PDF is never held in memory whole.
            for chunk in response.iter_content(chunk_size=65536):
                out.write(chunk)


def main():
    """Download every PDF linked from ``url`` into ``folder``."""
    folder.mkdir(parents=True, exist_ok=True)

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Without the index page there is nothing to do, so fail loudly here.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    print(soup.h1)

    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    for link in find_pdf_links(hrefs, url):
        try:
            download_file(link, folder / filename_from_url(link))
        except requests.RequestException as err:
            # One broken link should not abort the remaining downloads.
            print("Skipping", link, "-", err)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment