Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get Latin Library Texts
from glob2 import glob
from bs4 import BeautifulSoup
from os.path import basename
import codecs, os
# Host that is mirrored with wget; wget stores the mirror in a directory of
# the same name under the current working directory.
target = 'www.thelatinlibrary.com'

# Directory (relative to the current working directory) that receives the
# extracted plain-text files.
output_dir = 'plaintext'


def output_name(path):
    """Return the flat ``.txt`` file name used for the page at *path*.

    The name is the page's base name without its extension.  When *path*
    has a parent directory component, that directory's name is prepended
    (without a separator) so pages with the same base name in different
    directories do not overwrite each other.

    *path* is expected to use ``/`` as separator, as in the original script.
    """
    stem = os.path.splitext(basename(path))[0]
    parts = path.split('/')
    if len(parts) > 1:
        stem = parts[-2] + stem
    return stem + '.txt'


def find_pages():
    """Return the sorted, de-duplicated list of .html/.shtml files to convert.

    The search is relative to the current working directory.
    """
    # NOTE(review): the original used '*/**.html' next to '**/*.shtml';
    # presumably both were meant to be recursive, so the same form is used
    # for both extensions here.  The non-recursive patterns are kept so
    # top-level files are found regardless of how '**' treats depth zero.
    patterns = ('*.html', '**/*.html', '*.shtml', '**/*.shtml')
    found = set()
    for pattern in patterns:
        found.update(glob(pattern))
    return sorted(found)


def extract_text(markup):
    """Return the text of an HTML document with tables and extra whitespace removed.

    Every ``<table>`` element is dropped before the text is collected, and
    all runs of whitespace are collapsed to single spaces.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(markup, 'html.parser')
    for table in soup.find_all('table'):
        table.extract()
    return ' '.join(soup.get_text().split())


def main():
    """Mirror the site, delete small HTML files, and write one .txt per page."""
    os.system('wget -r -l 4 {0}'.format(target))
    # Delete small HTML files from the mirror.  The output directory is no
    # longer created in this shell chain: a failing ``mkdir`` (e.g. on a
    # second run) used to skip the cleanup entirely.
    os.system('cd {0} && find . -name "*.html" -size -3k -delete'.format(target))

    # The text files are written relative to the current working directory,
    # so the directory has to exist here (not inside the mirror).
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for page in find_pages():
        with codecs.open(page, 'r', 'latin1') as src:
            text = extract_text(src.read())
        destination = os.path.join(output_dir, output_name(page))
        # Write with an explicit encoding: the text may contain non-ASCII
        # characters and the platform default may not be able to encode them.
        with codecs.open(destination, 'w', 'utf-8') as out:
            out.write(text)


if __name__ == '__main__':
    main()
@sroertgen

This comment has been minimized.

Copy link

sroertgen commented Feb 27, 2019

for MacOS users:
install 'wget' via Homebrew: brew install wget

@infamicstudios

This comment has been minimized.

Copy link

infamicstudios commented Mar 15, 2019

Should be with open('plaintext/' + f, 'w+') as out:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.