The following code is for scraping content from websites and extracting just the words. This is useful for being able to feed web content into other processes.
This can be accomplished in a three-step process.
- Get the raw html content using the
requests
library - Feed the
.text
results of step one into the BeautifulSoup constructor
and extract the text with .get_text()
. This will strip all the html from the content and return an unstructured string. - The string that is returned will need some heavy sanitization.
- Strip the blank lines with a python filter
lines = filter(lambda x: x.strip(), text.splitlines())
- Remove all the leading and trailing white spaces
words = map(lambda x: x.strip(), lines)
- Remove any special characters that could mess up future models
words = map(lambda x: x.replace(".", "") .replace(",", "") .replace("'", "") .replace(":", "") .replace("?", "") .replace('“', "") .replace('”', "") .replace("!", "") .replace(";", "") .replace("——", "") .replace("_", "") .replace("(", "") .replace(")", "") .replace("|", "") .replace("<", ""), words)
- Split the words from sentences into tokens
words = map(lambda x: x.split(), words)
- Lower case all the words to ensure that mixed cases do not mess up models down the road.
words = map(lambda x: [y.lower() for y in x], words)
- The above will return a list of lists which will need to be flattened
words = [item for sublist in words for item in sublist]
def tokenize_word_from_url(url: str) -> list:
    """
    Fetch a web page and return its visible text as a list of
    lower-cased, punctuation-stripped word tokens.

    :param url: url to scrape
    :return: list of words; an empty list if the request fails
    """
    # One-pass character removal replaces the original 15-call .replace()
    # chain. Includes "—" so single em-dashes are removed too (the old
    # code only removed "——" pairs, leaving stray single dashes behind).
    strip_table = str.maketrans("", "", ".,':?“”!;—_()|<")
    try:
        # timeout prevents hanging forever on an unresponsive host;
        # keep the try body minimal — only the network call can fail here.
        content = requests.get(url, timeout=10).text
    except Exception as e:
        # Best-effort scrape: report the failure but honour the documented
        # list return type (the original implicitly returned None here).
        print(e)
        return []
    soup = BeautifulSoup(content, "html.parser")
    text = soup.get_text()
    words = []
    for line in text.splitlines():
        # strip() drops leading/trailing whitespace and makes blank
        # lines fall out naturally (empty string splits to nothing)
        cleaned = line.strip().translate(strip_table)
        # split() tokenizes on whitespace; lower-case so mixed casing
        # does not fragment the vocabulary for downstream models
        words.extend(token.lower() for token in cleaned.split())
    return words