# PandaWhoCodes/url_to_txt.py

## url_to_txt.py
"""
Gets the webpage
Converts the HTML to a readable HTML using readability
Extracts the text and saves it to a text file.

usage - python url_to_txt.py http://example.com
"""
from readability import Document
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import sys


def get_website_html(url, timeout=30):
    """
    Download the page at ``url`` and return the raw response body.

    url     - address handed unchanged to ``requests.get``.
    timeout - seconds passed to ``requests.get`` as ``timeout``. requests
              applies no timeout by default, so without this a server that
              stops responding would block the script indefinitely.

    Returns the ``content`` attribute of the response. The HTTP status is
    not inspected here, so the body of an error page is returned like any
    other body.
    """
    return requests.get(url, timeout=timeout).content


def get_clean_html(html):
    """
    Wrap ``html`` in a readability ``Document`` and return the value of
    its ``summary()`` call (the "readable" HTML used by the rest of the
    script).
    """
    return Document(html).summary()


def get_html_text(html):
    """
    Parse ``html`` with BeautifulSoup using the "lxml" parser and return
    the ``text`` attribute of the parsed document.
    """
    return BeautifulSoup(html, "lxml").text


def save_text(text, filename):
    """
    Write ``text`` to ``filename`` as UTF-8.

    The file is opened in "w" mode, so an existing file at that path is
    overwritten. Nothing is returned.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)


def get_filename_from(url):
    """
    Build the output file name for ``url``: its network location + ".txt".

    For "http://example.com/page" this returns "example.com.txt".

    urlparse only recognises a network location when it follows "//", so a
    bare "example.com/page" parses with an empty netloc and every such
    input used to collapse to ".txt". When the input has neither a scheme
    nor a netloc it is re-parsed with "//" in front so the leading host
    part is picked up. Inputs that already have a scheme are unaffected.

    Returns ".txt" when no network location can be found at all
    (e.g. an empty string).
    """
    parts = urlparse(url)
    host = parts.netloc
    if not host and not parts.scheme:
        # Bare "host/path" input: treat it as a network-path reference.
        host = urlparse("//" + url).netloc
    return host + ".txt"


if __name__ == '__main__':
    # The URL is the first command-line argument. Without it sys.argv[1]
    # would raise a bare IndexError, so print the usage line instead
    # (sys.exit with a string writes it to stderr and exits with status 1).
    if len(sys.argv) < 2:
        sys.exit("usage - python url_to_txt.py http://example.com")
    url = sys.argv[1]
    file_name = get_filename_from(url)
    # Pipeline: download -> readability clean-up -> plain text.
    text = get_html_text(get_clean_html(get_website_html(url)))
    save_text(text, file_name)
	"""
	Gets the webpage
	Converts the HTML to a readable HTML using readability
	Extracts the text and saves it to a text file.

	usage - python url_to_txt.py http://example.com
	"""
	from readability import Document
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse
	import requests
	import sys


	def get_website_html(url, timeout=30):
		"""
		Download the page at ``url`` and return the raw response body.

		url     - address handed unchanged to ``requests.get``.
		timeout - seconds passed to ``requests.get`` as ``timeout``. requests
		          applies no timeout by default, so without this a server
		          that stops responding would block the script indefinitely.

		Returns the ``content`` attribute of the response. The HTTP status
		is not inspected here.
		"""
		return requests.get(url, timeout=timeout).content


	def get_clean_html(html):
		"""
		Wrap ``html`` in a readability ``Document`` and return the value of
		its ``summary()`` call.
		"""
		doc = Document(html)
		return doc.summary()


	def get_html_text(html):
		"""
		Parse ``html`` with BeautifulSoup using the "lxml" parser and return
		the ``text`` attribute of the parsed document.
		"""
		soup = BeautifulSoup(html, "lxml")
		return soup.text


	def save_text(text, filename):
		"""
		Write ``text`` to ``filename`` as UTF-8.

		The file is opened in "w" mode, so an existing file at that path is
		overwritten. Nothing is returned.
		"""
		with open(filename, "w", encoding='utf-8') as f:
			f.write(text)


	def get_filename_from(url):
		"""
		Build the output file name for ``url``: its network location + ".txt".

		For "http://example.com/page" this returns "example.com.txt".

		urlparse only recognises a network location when it follows "//",
		so a bare "example.com/page" parses with an empty netloc. When the
		input has neither a scheme nor a netloc it is re-parsed with "//"
		in front so the leading host part is picked up. Inputs that already
		have a scheme are unaffected.

		Returns ".txt" when no network location can be found at all.
		"""
		url_obj = urlparse(url)
		host = url_obj.netloc
		if not host and not url_obj.scheme:
			# Bare "host/path" input: treat it as a network-path reference.
			host = urlparse("//" + url).netloc
		return host + ".txt"


	if __name__ == '__main__':
		# The URL is the first command-line argument. Without it sys.argv[1]
		# would raise a bare IndexError, so print the usage line instead
		# (sys.exit with a string writes it to stderr and exits with status 1).
		if len(sys.argv) < 2:
			sys.exit("usage - python url_to_txt.py http://example.com")
		url = sys.argv[1]
		file_name = get_filename_from(url)
		# Pipeline: download -> readability clean-up -> plain text.
		text = get_html_text(get_clean_html(get_website_html(url)))
		save_text(text, file_name)