Skip to content

Instantly share code, notes, and snippets.

@brandonko
Last active September 2, 2020 19:00
Show Gist options
  • Save brandonko/dec39b12600352144f58165bab4ab22a to your computer and use it in GitHub Desktop.
Save brandonko/dec39b12600352144f58165bab4ab22a to your computer and use it in GitHub Desktop.
Extracts the text from HTML files
import os
import re
from boilerpy3 import extractors
def condense_newline(text):
    """Collapse every run of line breaks in *text* into a single newline.

    The text is split on carriage returns and line feeds, empty pieces are
    discarded, and the remaining lines are joined with ``'\\n'``. As a
    result, leading and trailing line breaks are removed and ``'\\r\\n'`` /
    ``'\\r'`` separators are normalised to ``'\\n'``. Lines that contain
    only spaces or tabs are kept, since they are not empty strings.

    Args:
        text: The string to clean up.

    Returns:
        The text with no empty lines, using ``'\\n'`` as the only separator.
    """
    # Splitting on a whole run of break characters at once still leaves an
    # empty piece at the start/end of the text, hence the truthiness filter.
    return '\n'.join(line for line in re.split(r'[\r\n]+', text) if line)
def parse_html(html_path):
    """Return the text extracted from the HTML file at *html_path*.

    Extraction is delegated to boilerpy3's ``ArticleExtractor``; the text it
    returns is then passed through ``condense_newline`` so the result
    contains no empty lines.

    Args:
        html_path: Path of the HTML file to read.

    Returns:
        The extracted text with repeated line breaks condensed.
    """
    # Text extraction with boilerpy3
    html_extractor = extractors.ArticleExtractor()
    extracted_text = html_extractor.get_content_from_file(html_path)
    return condense_newline(extracted_text)
def html_to_text(folder):
    """Extract the text from every ``.html`` file directly inside *folder*.

    Only the top level of *folder* is scanned (no recursion). Entries are
    processed in sorted name order so the returned list is reproducible
    across runs and platforms.

    Args:
        folder: Path of the directory holding the HTML files.

    Returns:
        A list with one extracted-text string per ``.html`` file, in sorted
        filename order. The list is empty when no such file exists.
    """
    parsed_texts = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder)):
        filepath_full = os.path.join(folder, filename)
        if not filepath_full.endswith(".html"):
            continue
        # A directory may also be named "*.html"; only real files can be parsed.
        if not os.path.isfile(filepath_full):
            continue
        parsed_texts.append(parse_html(filepath_full))
    return parsed_texts
# Folder that holds the scraped web pages; change this to your own location
scraped_dir = './scraped_pages'
# One extracted-text string for every ".html" file found in that folder
parsed_texts = html_to_text(folder=scraped_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment