Skip to content

Instantly share code, notes, and snippets.

@noveoko
Created August 27, 2017 11:19
Show Gist options
  • Save noveoko/61dae3759574140416a8ce8140e30325 to your computer and use it in GitHub Desktop.
Save noveoko/61dae3759574140416a8ce8140e30325 to your computer and use it in GitHub Desktop.
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup as bs
# Page whose paragraph text is mined for noun phrases.
url = 'https://en.wikipedia.org/wiki/Human'

# A noun phrase containing any of these characters is not written to the
# output file.
EXCLUDED_CHARS = frozenset('[]:\'"')


def topic_from_url(page_url):
    """Return the last path segment of *page_url*.

    A trailing slash is ignored so that ``.../wiki/Human/`` still yields
    ``'Human'`` instead of an empty string.
    """
    return page_url.rstrip('/').split('/')[-1]


def fetch_paragraph_text(page_url):
    """Download *page_url* and return the text of all its ``<p>`` elements.

    The text nodes of every paragraph are concatenated with no separator.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never parsed as if it were the article.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    soup = bs(response.content, 'lxml')
    # One join over a generator instead of repeated ``+=`` on a string.
    return ''.join(
        ''.join(node.find_all(string=True)) for node in soup.find_all('p')
    )


def filter_phrases(phrases):
    """Return the unique items of *phrases*, sorted, minus excluded ones.

    A phrase is excluded when it contains any character listed in
    ``EXCLUDED_CHARS``.
    """
    return sorted(p for p in set(phrases) if EXCLUDED_CHARS.isdisjoint(p))


def main(page_url=url):
    """Write the noun phrases found on *page_url* to ``'<topic> words'``.

    The output file is created in the current working directory, one
    phrase per line, where ``<topic>`` is the last path segment of the URL.
    """
    topic = topic_from_url(page_url)
    blob = TextBlob(fetch_paragraph_text(page_url))
    phrases = filter_phrases(blob.noun_phrases)
    # Explicit UTF-8 so non-ASCII phrases do not depend on the locale default.
    with open('{} words'.format(topic), 'w', encoding='utf-8') as out:
        out.writelines('{}\n'.format(p) for p in phrases)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment