webPageAnalyzer
# Main script: PySimpleGUI front end for the web page analyzer
import PySimpleGUI as sg

from utils import get_statistics

# Layout: URL input and "Get Data" button on the first row, output area below
layout = [
    [sg.Text("Enter the URL: ", font=('Arial', '16')),
     sg.Input("", font=('Arial', '16'), size=(40, 1), key='url'),
     sg.Button("Get Data", font=('Arial', '16'), key='get')],
    [sg.Multiline("", font=('Arial', '16'), size=(70, 15), key='output')]
]

def display_analytics():
    url = values['url']
    # get_statistics returns a dict of the form:
    # {'lines': ..., 'words': ..., 'unique_words': ..., 'most_common_words': ...}
    statistics = get_statistics(url)
    display_values(statistics)

def display_values(statistics):
    window['output'].print("The web page contains the following information\n")
    window['output'].print(statistics['lines'], "sentences")
    window['output'].print(statistics['words'], "words")
    window['output'].print(statistics['unique_words'], "unique words")
    window['output'].print("The most common words are:")
    for word, count in statistics['most_common_words']:
        window['output'].print(word, "-", count)

if __name__ == '__main__':
    window = sg.Window("WebPageAnalyzer", layout)
    while True:
        # Block until the user clicks a button or closes the window
        button, values = window.read()
        if button == sg.WINDOW_CLOSED:
            break
        elif button == 'get':
            display_analytics()
    window.close()

# utils.py
import re
import string
from collections import Counter

from webutils import get_html_content, parse_html_page


def get_statistics(url):
    # Fetch the page, pull out its paragraph text, and build the statistics dict
    content = get_html_content(url)
    data = parse_html_page(content)
    lines = get_lines(data)
    words = get_words(lines)
    unique_words = set(words)
    most_common_words = get_most_common_words(words)
    return {'lines': len(lines), 'words': len(words),
            'unique_words': len(unique_words),
            'most_common_words': most_common_words}


def get_lines(data):
    # Split each paragraph into sentences on ., ! and ?
    lines = []
    for para in data:
        para_lines = re.split(r'[.!?]+', para)
        for line in para_lines:
            if line != '':
                lines.append(line)
    return lines


def get_words(lines):
    words = []
    for line in lines:
        cleaned_line = remove_punctuation(line)
        words.extend(cleaned_line.split())
    return words


def remove_punctuation(line):
    # Strip all punctuation characters, then surrounding whitespace
    st = str.maketrans("", "", string.punctuation)
    cleaned_line = line.translate(st).strip()
    return cleaned_line


def get_most_common_words(words):
    # Drop stop words, then return the five most frequent remaining words
    # as (word, count) pairs
    cleaned_words = []
    stopwords = get_stop_words()
    for word in words:
        if word not in stopwords:
            cleaned_words.append(word)
    return Counter(cleaned_words).most_common(5)


def get_stop_words():
    # stopwords.txt is expected next to this script, one stop word per line
    with open('stopwords.txt', 'r') as fp:
        words = fp.readlines()
    stopwords = [word.strip('\n') for word in words]
    return stopwords


if __name__ == "__main__":
    url = "https://realpython.com/beautiful-soup-web-scraper-python/"
    print(get_statistics(url))
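
get_stop_words reads a stopwords.txt that is not part of the gist as shown, so running utils.py on its own needs that file in place. A minimal sketch for creating a placeholder file; the word list below is illustrative, not the author's:

# Write a tiny placeholder stopwords.txt (one word per line, as get_stop_words expects).
# The words here are illustrative only, not the author's stop word list.
sample_stopwords = ["the", "a", "an", "and", "or", "of", "to", "in", "is", "it"]
with open('stopwords.txt', 'w') as fp:
    fp.write('\n'.join(sample_stopwords) + '\n')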

# webutils.py
import requests
from bs4 import BeautifulSoup


def get_html_content(url):
    # Download the raw HTML for the given URL
    page = requests.get(url)
    return page.content


def parse_html_page(content):
    # Collect the text of every <p> tag on the page
    data = []
    soup = BeautifulSoup(content, 'html.parser')
    para_data = soup.find_all('p')
    for para in para_data:
        data.append(para.text)
    return data


if __name__ == "__main__":
    url = "https://realpython.com/beautiful-soup-web-scraper-python/"
    content = get_html_content(url)
    print(parse_html_page(content))
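
parse_html_page can also be exercised without a network request by handing it a small HTML string, since BeautifulSoup accepts either bytes or str. A minimal sketch; the sample HTML is made up for illustration:

from webutils import parse_html_page

# Only the <p> text should come back from this hand-written page.
sample_html = "<html><body><h1>Title</h1><p>First paragraph.</p><p>Second one.</p></body></html>"
print(parse_html_page(sample_html))  # ['First paragraph.', 'Second one.']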