Skip to content

Instantly share code, notes, and snippets.

@PandaWhoCodes
Created July 16, 2020 07:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save PandaWhoCodes/2e7aa52a1f49e9f0cfc153dd0be579ab to your computer and use it in GitHub Desktop.
Save PandaWhoCodes/2e7aa52a1f49e9f0cfc153dd0be579ab to your computer and use it in GitHub Desktop.
Extracts the text from a webpage and saves it to a text file
"""
Fetch a webpage, reduce its HTML to the readable content using readability,
then extract the text and save it to a text file.

Usage: python url_to_txt.py http://example.com
"""
from readability import Document
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import sys
def get_website_html(url, timeout=30):
    """Download *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of extracting text from an
    # error page and saving it as if it were the requested content.
    response.raise_for_status()
    return response.content
def get_clean_html(html):
    """Return the readability summary of *html*.

    The markup is wrapped in a readability ``Document`` and the result of
    its ``summary()`` call is handed back unchanged.
    """
    return Document(html).summary()
def get_html_text(html):
    """Return the text content of *html*, parsed by BeautifulSoup with lxml."""
    parsed = BeautifulSoup(html, features="lxml")
    return parsed.get_text()
def save_text(text, filename):
    """Write *text* to *filename* as UTF-8, replacing any existing content."""
    with open(file=filename, mode="w", encoding="utf-8") as sink:
        sink.write(text)
def get_filename_from(url):
    """Derive a filesystem-safe ``<host>.txt`` file name from *url*.

    Args:
        url: The page address, with or without a scheme.

    Returns:
        The URL's network location with ``.txt`` appended. Any
        ``user:password@`` prefix is dropped, and characters other than
        letters, digits, ``.``, ``_`` and ``-`` are replaced by ``_``.
        ``"output.txt"`` is returned when no host can be found.
    """
    netloc = urlparse(url).netloc
    if not netloc:
        # A scheme-less input such as "example.com/page" is parsed entirely
        # as a path; prefixing "//" makes urlparse treat the start as netloc.
        netloc = urlparse("//" + url).netloc
    # Never let credentials embedded in the URL leak into a file name.
    host = netloc.rpartition("@")[2]
    # ":" (port separator) and similar characters are invalid or have
    # special meaning in file names on some platforms.
    safe = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in host)
    return (safe or "output") + ".txt"
if __name__ == '__main__':
    # Script entry point: fetch the URL given on the command line, extract
    # its readable text, and save it to the file named by get_filename_from.
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit("usage - python url_to_txt.py http://example.com")
    url = sys.argv[1]
    file_name = get_filename_from(url)
    text = get_html_text(get_clean_html(get_website_html(url)))
    save_text(text, file_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment