Python web scraping and text summarisation tool, using a simple word-frequency (extractive) scoring technique. Summarises articles from web pages.
# Fetch an article, score its sentences by word frequency, and print an extractive summary.
from urllib import request
from bs4 import BeautifulSoup as bs
import re
import nltk
import heapq

# The NLTK resources used below ('punkt' tokenizer and 'stopwords') must be available,
# e.g. via nltk.download('punkt') and nltk.download('stopwords').

url = 'https://en.wikipedia.org/wiki/facebook'
allParagraphContent = ""

# Download the page and collect the text of every <p> element.
htmlDoc = request.urlopen(url)
soupObject = bs(htmlDoc, 'html.parser')
paragraphContents = soupObject.find_all('p')
for paragraphContent in paragraphContents:
    allParagraphContent += paragraphContent.text

# Strip citation markers like [12] and collapse runs of whitespace.
allParagraphContent_cleanerData = re.sub(r'\[[0-9]*\]', ' ', allParagraphContent)
allParagraphContent_cleanedData = re.sub(r'\s+', ' ', allParagraphContent_cleanerData)

# Sentence-tokenise the readable text, then keep letters only for the word-frequency step.
sentences_tokens = nltk.sent_tokenize(allParagraphContent_cleanedData)
allParagraphContent_cleanedData = re.sub(r'[^a-zA-Z]', ' ', allParagraphContent_cleanedData)
allParagraphContent_cleanedData = re.sub(r'\s+', ' ', allParagraphContent_cleanedData)
words_tokens = nltk.word_tokenize(allParagraphContent_cleanedData)

# Count how often each non-stopword appears. Words are lower-cased so the counts
# match the lower-cased sentence words scored further down.
stopwords = nltk.corpus.stopwords.words('english')
word_frequencies = {}
for word in words_tokens:
    word = word.lower()
    if word not in stopwords:
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

# Normalise counts so the most frequent word scores 1.0.
maximum_frequency_word = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency_word

# Score each sentence shorter than 100 words as the sum of its word frequencies.
sentences_scores = {}
for sentence in sentences_tokens:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word_frequencies:
            if len(sentence.split(' ')) < 100:
                if sentence not in sentences_scores:
                    sentences_scores[sentence] = word_frequencies[word]
                else:
                    sentences_scores[sentence] += word_frequencies[word]

# Take the 15 highest-scoring sentences as the summary.
summary_MachineLearning = heapq.nlargest(15, sentences_scores, key=sentences_scores.get)
print(summary_MachineLearning)
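
The script prints the chosen sentences as a Python list, ordered by score. As a small optional follow-up (not part of the original gist), the sentences can be re-ordered by their position in the article and joined into one paragraph so the summary reads in document order:

# Optional extra step: present the summary as readable text in document order.
ordered_summary = sorted(summary_MachineLearning, key=sentences_tokens.index)
print(' '.join(ordered_summary))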