Skip to content

Instantly share code, notes, and snippets.

@iodbh
Last active May 15, 2018 17:24
Show Gist options
  • Save iodbh/246bb58e459577dd0e4bd39a1e56e997 to your computer and use it in GitHub Desktop.
Save iodbh/246bb58e459577dd0e4bd39a1e56e997 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import os
from sys import argv
# CSS selectors handed to ``soup.select``; every element they match is
# removed from the parsed document before its text is extracted.
DECOMPOSE_TAGS = (
    "div.menu-panel",
    "head",
    "li",
    "p.month-name",
    "table.days-of-month",
    "div.comment",
    "p.source",
    "div.notes",
    "span",
    "script",
)
def extract_text(input_file, output_file):
    """Append the text content of one HTML file to ``output_file``.

    The document is parsed with BeautifulSoup's ``html.parser`` backend,
    every element matching a selector in ``DECOMPOSE_TAGS`` is removed, and
    the remaining text is appended to ``output_file`` followed by a newline.

    Args:
        input_file: Path of the HTML file to read.
        output_file: Path of the text file to append to.
    """
    # Parse inside a ``with`` block so the input handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_file) as html:
        soup = BeautifulSoup(html, "html.parser")

    for selector in DECOMPOSE_TAGS:
        for tag in soup.select(selector):
            tag.decompose()

    # Append mode: successive calls accumulate into the same output file.
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(output_file, 'a') as f:
        print(soup.get_text(), file=f)
if __name__ == '__main__':
    # Command line: <script> INPUT_DIRECTORY OUTPUT_FILE
    # Fail with a usage message rather than a bare IndexError when
    # arguments are missing; extra arguments are ignored as before.
    if len(argv) < 3:
        raise SystemExit(
            "usage: {} INPUT_DIRECTORY OUTPUT_FILE".format(argv[0])
        )
    input_directory = argv[1]
    output_file = argv[2]
    # Only entries directly inside input_directory are considered;
    # subdirectories are not walked.
    for filename in os.listdir(input_directory):
        if filename.endswith('.html'):
            file_path = os.path.join(input_directory, filename)
            # Pass the joined path; the bare name would only resolve
            # relative to the current working directory.
            extract_text(file_path, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment