tomasonjo/gutenberg_blog_preprocess

## gutenberg_blog_preprocess
# Source: https://www.gutenberg.org/ebooks/95 (The Prisoner of Zenda)
import re
import urllib.request

# Fetch the data
target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
# The context manager closes the HTTP response as soon as the body is read;
# `data` stays bound (to the closed response) afterwards.
with urllib.request.urlopen(target_url) as data:
    raw_data = data.read().decode('utf8').strip()

# Preprocess text into chapters
# Replace every character that is not an ASCII letter, digit, space or hyphen
# with a space (this includes newlines). The class spells out A-Za-z on
# purpose: a single A-z range also covers the punctuation that sits between
# 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those characters would survive.
cleaned_text = re.sub(r'[^A-Za-z0-9 -]', ' ', raw_data)
# Split on the literal 'CHAPTER' marker; the text preceding the first marker
# is dropped.
chapters = cleaned_text.split('CHAPTER')[1:]
# Keep only the part of the last chapter that precedes the end-of-book marker.
chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]
	# Source: https://www.gutenberg.org/ebooks/95 (The Prisoner of Zenda)
	import re
	import urllib.request

	# Fetch the data
	target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
	# The context manager closes the HTTP response as soon as the body is read;
	# `data` stays bound (to the closed response) afterwards.
	with urllib.request.urlopen(target_url) as data:
		raw_data = data.read().decode('utf8').strip()

	# Preprocess text into chapters
	# Replace every character that is not an ASCII letter, digit, space or hyphen
	# with a space (this includes newlines). The class spells out A-Za-z on
	# purpose: a single A-z range also covers the punctuation that sits between
	# 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those characters would survive.
	cleaned_text = re.sub(r'[^A-Za-z0-9 -]', ' ', raw_data)
	# Split on the literal 'CHAPTER' marker; the text preceding the first marker
	# is dropped.
	chapters = cleaned_text.split('CHAPTER')[1:]
	# Keep only the part of the last chapter that precedes the end-of-book marker.
	chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]