Skip to content

Instantly share code, notes, and snippets.

@onlurking
Last active March 4, 2018 18:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save onlurking/a553b3241c1e956cbf3a6ef90367174a to your computer and use it in GitHub Desktop.
Save onlurking/a553b3241c1e956cbf3a6ef90367174a to your computer and use it in GitHub Desktop.
Crawl Elder Scrolls books and save them as markdown (WIP)
from splinter import Browser
from bs4 import BeautifulSoup
import tomd
from html_sanitizer import Sanitizer
# pip install tomd splinter html-sanitizer beautifulsoup4
# Module-level splinter browser shared by all crawl functions below.
# NOTE(review): spoofs an iPhone user agent — presumably to get the site's
# simpler mobile markup; confirm against the target site's behavior.
browser = Browser(
'chrome', user_agent="Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)")
def get_books_list(url):
    """Collect absolute URLs of book pages from the index page at *url*.

    Visits *url* with the module-level splinter ``browser``, parses the
    rendered HTML, and returns a list of absolute links for every anchor
    found inside an ``<li class="views-row">`` whose href contains
    ``"/content"`` (i.e. an actual book page).
    """
    base = "https://www.imperial-library.info"
    browser.visit(url)
    soup = BeautifulSoup(browser.html, "html5lib")
    # Each book entry lives in an <li class="views-row">; gather its anchors.
    # (The original also ran an unused find_all over the wrapping view div;
    # that dead lookup has been removed.)
    anchors = []
    for row in soup.find_all('li', {"class": "views-row"}):
        anchors.extend(row.find_all('a'))
    # Keep only real book pages (hrefs under /content) and absolutize them.
    return ["{}{}".format(base, a['href'])
            for a in anchors if "/content" in a['href']]
def get_book(url):
    """Download one book page and return its metadata plus markdown body.

    Returns a dict with keys ``title``, ``author``, ``tags`` and
    ``content`` (the sanitized book HTML converted to markdown via tomd).
    Raises ``IndexError`` if the expected content div is missing, and
    ``AttributeError`` if the title/author/tags elements are not found.
    """
    browser.visit(url)
    soup = BeautifulSoup(browser.html, "html5lib")
    body = soup.find_all(
        "div", {"class": "views-row views-row-1 views-row-odd views-row-first"})
    sanitizer = Sanitizer()
    # Sanitize first so tomd only ever sees a clean, whitelisted tag subset.
    html = sanitizer.sanitize(body[0].prettify())
    # The author field's last line carries the name; strip() alone suffices
    # (the original's .rstrip().strip() was redundant).
    author = soup.find(
        'div', {"class": "field-item odd"}).text.split('\n')[-1].strip()
    # Tags arrive newline-separated; drop empty entries (falsy strings).
    tags = [tag for tag in
            soup.find('ul', {"class": "links inline"}).text.split('\n')
            if tag]
    return {"title": soup.find('h1', {"class": "page-title"}).text,
            "author": author,
            "tags": tags,
            "content": tomd.convert(html)}
def write_book(book):
    """Write *book* to ``"<title> - <author>.md"`` in the current directory.

    *book* is a dict with at least ``title``, ``author`` and ``content``
    keys, as produced by ``get_book``. Overwrites any existing file of the
    same name.
    """
    filename = "{} - {}.md".format(book['title'], book['author'])
    # Context manager guarantees the handle is closed even if write() raises;
    # explicit UTF-8 keeps output platform-independent.
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write(book['content'])
# Driver: crawl the category index, then fetch and save only the first book
# — a smoke test for the WIP crawler (loop over `links` to save them all).
links = get_books_list(
"https://www.imperial-library.info/books/all/by-category")
test = get_book(links[0])
write_book(test)
@onlurking
Copy link
Author

@terremoth, tu me fez descobrir um bug xP

completo aqui, mas o crawler só pegou o v1.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment