Created
July 16, 2020 07:37
-
-
Save PandaWhoCodes/2e7aa52a1f49e9f0cfc153dd0be579ab to your computer and use it in GitHub Desktop.
Extracts the text from a webpage and saves it to a text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Gets the webpage.
Converts the HTML to readable HTML using readability.
Extracts the text and saves it to a text file.
Usage: python url_to_txt.py http://example.com
""" | |
from readability import Document | |
from bs4 import BeautifulSoup | |
from urllib.parse import urlparse | |
import requests | |
import sys | |
def get_website_html(url, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of passing an error page on
    # as if it were the requested content.
    response.raise_for_status()
    return response.content
def get_clean_html(html):
    """Return the readable version of *html* produced by readability.

    Wraps *html* in a readability ``Document`` and returns the result of its
    ``summary()`` call.
    """
    return Document(html).summary()
def get_html_text(html):
    """Return the text of *html* as exposed by BeautifulSoup's ``.text``.

    The markup is parsed with the "lxml" parser.
    """
    return BeautifulSoup(html, "lxml").text
def save_text(text, filename):
    """Write *text* to *filename* as UTF-8, replacing any existing content."""
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)
def get_filename_from(url):
    """Build the output file name for *url*: its network location plus ".txt".

    Args:
        url: The page address, e.g. "http://example.com/some/page".

    Returns:
        A name such as "example.com.txt". Any ":" in the network location
        is written as "_" ("localhost:8000" -> "localhost_8000.txt").
    """
    netloc = urlparse(url).netloc
    if not netloc:
        # Without a scheme ("example.com/page") urlparse leaves netloc empty,
        # which would yield a bare ".txt". Re-parse with a leading "//" so
        # the host is recognised as the network location.
        netloc = urlparse("//" + url).netloc
    # ":" (port or userinfo separator) is not allowed in Windows file names.
    return netloc.replace(":", "_") + ".txt"
if __name__ == '__main__':
    # Exit with a usage hint rather than an IndexError traceback when the
    # URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage - python url_to_txt.py http://example.com")
    url = sys.argv[1]
    file_name = get_filename_from(url)
    # Pipeline: download -> reduce to readable HTML -> strip markup -> save.
    text = get_html_text(get_clean_html(get_website_html(url)))
    save_text(text, file_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment