Skip to content

Instantly share code, notes, and snippets.

@meatyite
Last active May 8, 2019 13:21
Show Gist options
  • Save meatyite/7258808d30185a1c8008fb1622ad6fcc to your computer and use it in GitHub Desktop.
Save meatyite/7258808d30185a1c8008fb1622ad6fcc to your computer and use it in GitHub Desktop.
wikia (fandom.com) Downloader
#!/usr/bin/python3
# requires requests and beautifulsoup4; install them with pip if you don't have them already
# usage: ./wikia-dl.py [wikia article url]
import re
import requests
from sys import argv
from bs4 import BeautifulSoup as bs
def slugify(value):
    """
    Turn an arbitrary string into a lowercase, ASCII-only, filename-safe slug.

    The value is NFKD-normalized and every character that cannot be encoded
    as ASCII is dropped. Anything that is not a word character (letter,
    digit, underscore), whitespace or a hyphen is then removed, the result is
    stripped and lowercased, and each run of whitespace and/or hyphens is
    collapsed into a single hyphen, so the result can be used as a Windows
    filename.

    Returns an empty string when nothing survives the filtering.
    """
    import unicodedata

    # Decompose accented characters so the base letter survives the ASCII
    # round-trip (e.g. "é" -> "e" plus a combining accent that is dropped).
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    # Raw strings: "\w" and "\s" are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def download(url):
    """
    Download a wikia (fandom.com) article and save it as a standalone HTML file.

    Fetches `url`, takes the article title from the `h1` element with class
    `page-header__title` and the article body from the `div` element with id
    `mw-content-text`, and writes a minimal UTF-8 HTML document to
    `<slugified title>.html` in the current working directory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no title or content element.
    """
    from html import escape

    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    title_tag = soup.find('h1', {'class': 'page-header__title'})
    content = soup.find('div', {'id': 'mw-content-text'})
    if title_tag is None or content is None:
        raise ValueError("could not find the article title/content in " + url)

    # get_text() also works when the title contains nested markup (where
    # .string would be None); split/join collapses surrounding whitespace.
    title = " ".join(title_tag.get_text().split())
    # The title is plain text here, so it must be escaped before being
    # embedded in markup again.
    safe_title = escape(title)

    parts = [
        "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"utf-8\">",
        "<title>" + safe_title + "</title>\n</head>\n<body>\n<h1>" + safe_title + "</h1>",
    ]
    # Only direct children: str() of a tag already renders its whole subtree,
    # so walking all descendants would emit nested content several times.
    parts.extend(str(child) for child in content.find_all(recursive=False))
    parts.append("</body>\n</html>")
    total_html = "\n".join(parts)

    # A title without any ASCII characters slugifies to '', which would
    # produce the hidden file name '.html'; fall back to a fixed name.
    filename = (slugify(title) or 'untitled') + '.html'
    with open(filename, 'wb') as out_file:
        out_file.write(total_html.encode('utf-8'))
# Only run when executed as a script, so importing this module for its
# helpers does not trigger a download.
if __name__ == '__main__':
    if len(argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        raise SystemExit("usage: " + argv[0] + " [wikia article url]")
    download(argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment