@jonathan-nwosu · Last active December 30, 2019
Python web scraping and text summarisation tool. Scrapes an article's paragraphs, scores each sentence by normalised word frequency (a simple extractive-summarisation technique), and prints the top-scoring sentences as a summary.
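The pipeline has four steps: fetch the page and collect the text of every <p> tag; clean the text by removing citation markers and punctuation; count how often each non-stopword occurs and normalise the counts against the most frequent word; then score each sentence by the summed frequencies of its words and keep the 15 highest-scoring sentences as the summary.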
from urllib import request
from bs4 import BeautifulSoup as bs
import re
import nltk
import heapq

url = 'https://en.wikipedia.org/wiki/facebook'

# Fetch the page and collect the text of every <p> tag.
allParagraphContent = ""
htmlDoc = request.urlopen(url)
soupObject = bs(htmlDoc, 'html.parser')
paragraphContents = soupObject.find_all('p')
for paragraphContent in paragraphContents:
    allParagraphContent += paragraphContent.text
# Strip citation markers like [12], then collapse runs of whitespace.
allParagraphContent_cleanerData = re.sub(r'\[[0-9]*\]', ' ', allParagraphContent)
allParagraphContent_cleanedData = re.sub(r'\s+', ' ', allParagraphContent_cleanerData)

# Split into sentences before stripping punctuation, while sentence
# boundaries are still detectable (requires the NLTK 'punkt' data).
sentences_tokens = nltk.sent_tokenize(allParagraphContent_cleanedData)

# For word counting, keep letters only and lower-case everything so the
# counts match the lower-cased sentence tokens used for scoring below
# (and so capitalised stopwords such as 'The' are filtered correctly).
allParagraphContent_cleanedData = re.sub(r'[^a-zA-Z]', ' ', allParagraphContent_cleanedData)
allParagraphContent_cleanedData = re.sub(r'\s+', ' ', allParagraphContent_cleanedData)
words_tokens = nltk.word_tokenize(allParagraphContent_cleanedData.lower())

stopwords = nltk.corpus.stopwords.words('english')  # requires the 'stopwords' data
# Count how often each non-stopword appears.
word_frequencies = {}
for word in words_tokens:
    if word not in stopwords:
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

# Normalise the counts to [0, 1] relative to the most frequent word.
maximum_frequency_word = max(word_frequencies.values())
for word in word_frequencies:
    word_frequencies[word] = word_frequencies[word] / maximum_frequency_word
# Score each sentence shorter than 100 words as the sum of the
# normalised frequencies of the words it contains.
sentences_scores = {}
for sentence in sentences_tokens:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word_frequencies:
            if len(sentence.split(' ')) < 100:
                if sentence not in sentences_scores:
                    sentences_scores[sentence] = word_frequencies[word]
                else:
                    sentences_scores[sentence] += word_frequencies[word]

# The summary is the 15 highest-scoring sentences.
summary_MachineLearning = heapq.nlargest(15, sentences_scores, key=sentences_scores.get)
print(' '.join(summary_MachineLearning))
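To run the script, the third-party packages and the NLTK data it relies on have to be in place first. A minimal one-off setup sketch, assuming pip and the standard NLTK downloader:

# Install the third-party packages once:
#   pip install beautifulsoup4 nltk
# Then fetch the tokenizer models and the stopword list:
import nltk
nltk.download('punkt')      # sentence/word tokenizer models
nltk.download('stopwords')  # English stopword list

Any article-style page with its main text in <p> tags should work: point the url variable at a different address and, if needed, change the 15 passed to heapq.nlargest to adjust the summary length.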