Skip to content

Instantly share code, notes, and snippets.

Created April 21, 2018 20:09
Show Gist options
  • Save lhsfcboy/38155a81e063a8015bd6e666a42cb58c to your computer and use it in GitHub Desktop.
Save lhsfcboy/38155a81e063a8015bd6e666a42cb58c to your computer and use it in GitHub Desktop.
#! python
import os

import requests
from bs4 import BeautifulSoup
# Maps an output-file prefix to the URL of the article page to convert.
# NOTE(review): this literal was left unclosed and its entries are not present
# in this copy of the script; add the real `prefix: url` pairs before running.
target_urls = {}

# Browser-like User-Agent sent with every request (article pages and images).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

# For each article: fetch the page, turn the direct children of its rich-text
# container into Markdown, download every figure image into images/, and
# write the result to "<prefix>.md" in the current directory.
for prefix, url in target_urls.items():
    img_index = 1  # per-article counter used to number the saved figures
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html5lib")
    main_div = soup.find("div", class_="RichText Post-RichText")
    if main_div is None:
        # find() yields None when the container is absent; skip this article
        # rather than failing on `.children` below.
        print(f"{prefix}: article container not found at {url}, skipping")
        continue

    # Markdown fragments are collected here and joined once at the end.
    parts = []
    for child in main_div.children:
        if child.name == "h2":
            # Section headings become second-level Markdown headings.
            parts.append("## " + child.get_text() + "\n\n")
        elif child.name == "figure":
            img = child.find("img")
            img_url = img.get("data-actualsrc") if img is not None else None
            if not img_url:
                # A figure without a usable image URL has nothing to embed.
                continue
            img_name = f"lec{prefix}_fg{img_index:02}.jpg"
            img_index += 1
            img_path = f"images/{img_name}"
            parts.append(f"![pass]({img_path})" + "\n\n")
            response = requests.get(img_url, headers=headers)
            # The target directory may not exist on the first run.
            os.makedirs("images", exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(response.content)
        else:
            # Every other child contributes its plain text as a paragraph.
            parts.append(child.get_text() + "\n\n")

    md_file_name = prefix + ".md"
    with open(md_file_name, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment