meatyite/wikia-dl.py

## wikia-dl.py
#!/usr/bin/python3

# requires requests and beautifulsoup4; install them with pip if you don't have them already
# usage: ./wikia-dl.py [wikia article url]

import re
import requests
from sys import argv
from bs4 import BeautifulSoup as bs

def slugify(value):
    """
    Turn an arbitrary string into a lowercase, ASCII-only slug.

    The value is NFKD-normalized and every character without an ASCII
    equivalent is dropped. Anything that is not a word character (letter,
    digit, underscore), whitespace or a hyphen is then removed, the result is
    stripped and lowercased, and each run of whitespace and/or hyphens is
    collapsed into a single hyphen. The slug is intended for use as a
    filename, including on Windows.

    Returns an empty string when nothing in ``value`` survives the filtering.
    """
    import unicodedata
    # Decompose accented characters so the base letter survives the ASCII round-trip.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    # Raw strings: '\w' and '\s' are invalid escape sequences in a plain literal.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)

def download(url):
    """
    Save the wikia article at ``url`` as a standalone HTML file.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    title is read from ``<h1 class="page-header__title">`` and the article
    body from ``<div id="mw-content-text">``. The result is written, UTF-8
    encoded, to ``<slugified title>.html`` in the current working directory.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page lacks the expected title or content element.
    """
    from html import escape

    # requests has no default timeout; without one a stalled server hangs forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    title_tag = soup.find('h1', {'class': 'page-header__title'})
    content = soup.find('div', {'id': 'mw-content-text'})
    if title_tag is None or content is None:
        raise ValueError("no wikia article title/content found at " + url)

    # .string is None when the heading contains nested tags; get_text() always
    # returns the visible text.
    title = title_tag.get_text(strip=True)
    # The parser hands back unescaped text, so escape it before re-embedding in HTML.
    safe_title = escape(title)

    total_html = "".join([
        "<!DOCTYPE html>\n<html>\n<head>\n",
        # The file is written as UTF-8 below; declare it so browsers decode it correctly.
        "<meta charset=\"utf-8\">\n",
        "<title>", safe_title, "</title>\n</head>\n<body>\n<h1>", safe_title, "</h1>\n",
        # Serialize the inner markup exactly once. Iterating over every
        # descendant would emit nested elements once per nesting level.
        content.decode_contents(),
        "\n</body>\n</html>",
    ])

    # slugify() returns '' for titles with no ASCII word characters; fall back
    # to a fixed name rather than writing a file called just ".html".
    filename = (slugify(title) or 'article') + '.html'
    with open(filename, 'wb') as out_file:
        out_file.write(total_html.encode('utf-8'))

if __name__ == "__main__":
    # Only run when executed as a script, and fail with a usage hint instead
    # of an IndexError when the URL argument is missing.
    if len(argv) < 2:
        raise SystemExit("usage: ./wikia-dl.py [wikia article url]")
    download(argv[1])
	#!/usr/bin/python3

	# requires requests and beautifulsoup4; install them with pip if you don't have them already
	# usage: ./wikia-dl.py [wikia article url]

	import re
	import requests
	from sys import argv
	from bs4 import BeautifulSoup as bs

	def slugify(value):
	    """
	    Turn an arbitrary string into a lowercase, ASCII-only slug.

	    The value is NFKD-normalized and every character without an ASCII
	    equivalent is dropped. Anything that is not a word character (letter,
	    digit, underscore), whitespace or a hyphen is then removed, the result is
	    stripped and lowercased, and each run of whitespace and/or hyphens is
	    collapsed into a single hyphen. The slug is intended for use as a
	    filename, including on Windows.

	    Returns an empty string when nothing in ``value`` survives the filtering.
	    """
	    import unicodedata
	    # Decompose accented characters so the base letter survives the ASCII round-trip.
	    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
	    # Raw strings: '\w' and '\s' are invalid escape sequences in a plain literal.
	    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
	    return re.sub(r'[-\s]+', '-', value)

	def download(url):
	    """
	    Save the wikia article at ``url`` as a standalone HTML file.

	    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
	    title is read from ``<h1 class="page-header__title">`` and the article
	    body from ``<div id="mw-content-text">``. The result is written, UTF-8
	    encoded, to ``<slugified title>.html`` in the current working directory.

	    Raises:
	        requests.RequestException: if the request fails, times out, or the
	            server answers with an HTTP error status.
	        ValueError: if the page lacks the expected title or content element.
	    """
	    from html import escape

	    # requests has no default timeout; without one a stalled server hangs forever.
	    response = requests.get(url, timeout=30)
	    response.raise_for_status()
	    soup = bs(response.text, 'html.parser')

	    title_tag = soup.find('h1', {'class': 'page-header__title'})
	    content = soup.find('div', {'id': 'mw-content-text'})
	    if title_tag is None or content is None:
	        raise ValueError("no wikia article title/content found at " + url)

	    # .string is None when the heading contains nested tags; get_text() always
	    # returns the visible text.
	    title = title_tag.get_text(strip=True)
	    # The parser hands back unescaped text, so escape it before re-embedding in HTML.
	    safe_title = escape(title)

	    total_html = "".join([
	        "<!DOCTYPE html>\n<html>\n<head>\n",
	        # The file is written as UTF-8 below; declare it so browsers decode it correctly.
	        "<meta charset=\"utf-8\">\n",
	        "<title>", safe_title, "</title>\n</head>\n<body>\n<h1>", safe_title, "</h1>\n",
	        # Serialize the inner markup exactly once. Iterating over every
	        # descendant would emit nested elements once per nesting level.
	        content.decode_contents(),
	        "\n</body>\n</html>",
	    ])

	    # slugify() returns '' for titles with no ASCII word characters; fall back
	    # to a fixed name rather than writing a file called just ".html".
	    filename = (slugify(title) or 'article') + '.html'
	    with open(filename, 'wb') as out_file:
	        out_file.write(total_html.encode('utf-8'))

	if __name__ == "__main__":
	    # Only run when executed as a script, and fail with a usage hint instead
	    # of an IndexError when the URL argument is missing.
	    if len(argv) < 2:
	        raise SystemExit("usage: ./wikia-dl.py [wikia article url]")
	    download(argv[1])