Created
August 12, 2021 15:23
-
-
Save voltek62/19c36395cc501c578cc3bb3ad792ce8a to your computer and use it in GitHub Desktop.
OpenAI Codex : """ 1. get all relative links from this web page in a list called links - https://www.oncrawl.com 2. go through links list 3. prefix "https://www.oncrawl.com" to each link 4. visit each link 5. extract text from the page 6. save the page text to a new list called pages 7. implement error handling. 8. implement unit tests import re…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
1. get all relative links from this web page in a list called links - https://www.oncrawl.com | |
2. go through links list | |
3. prefix "https://www.oncrawl.com" to each link | |
4. visit each link | |
5. extract text from the page | |
6. save the page text to a new list called pages | |
7. implement error handling. | |
8. implement unit tests | |
import requests | |
""" | |
import random
import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Return the href of every anchor tag on the page at *url*.

    Fixes over the original:
    - Anchors without an ``href`` attribute yield ``None`` from
      ``link.get('href')``; the original appended those Nones, which later
      crashed ``link.startswith('/')`` in ``get_text``. They are skipped here.
    - Network failures (DNS errors, timeouts, HTTP error statuses) return an
      empty list instead of raising, per the script's "implement error
      handling" requirement.

    :param url: page to fetch (e.g. ``https://www.oncrawl.com``)
    :return: list of href strings (relative or absolute, as found in the HTML)
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a.get('href') for a in soup.find_all('a') if a.get('href') is not None]
def get_text(url):
    """Crawl every link found on *url* and collect the visible text nodes.

    Relative links (starting with ``/``) are prefixed with
    ``https://www.oncrawl.com``; other links are fetched as-is, matching the
    original behavior. Text whose parent tag is non-visible (document root,
    head, script, style, meta) is discarded.

    Fixes over the original:
    - Falsy links (``None``/empty href) are skipped instead of crashing on
      ``startswith``.
    - A page that fails to download is skipped instead of aborting the whole
      crawl, per the script's "implement error handling" requirement.

    :param url: start page whose links will be visited
    :return: flat list of text fragments gathered from all visited pages
    """
    pages = []
    # Parents whose text is never rendered to the user.
    invisible_parents = {'[document]', 'head', 'script', 'style', 'meta'}
    for link in get_links(url):
        if not link:
            # Defensive: get_links may yield None or empty hrefs.
            continue
        if link.startswith('/'):
            link = 'https://www.oncrawl.com' + link
        try:
            response = requests.get(link)
        except requests.RequestException:
            continue  # skip unreachable pages, keep crawling the rest
        soup = BeautifulSoup(response.text, 'html.parser')
        for fragment in soup.find_all(text=True):
            if fragment.parent.name not in invisible_parents:
                pages.append(fragment)
    return pages
def get_words(pages):
    """Tokenize every text fragment in *pages* into a flat list of words.

    A "word" is any maximal run of word characters (``\\w+``), so
    punctuation and whitespace act as separators.

    :param pages: iterable of text strings (as produced by ``get_text``)
    :return: list of word tokens, in document order
    """
    return [token for text in pages for token in re.findall(r'\w+', text)]
def get_word_count(words):
    """Count case-insensitive occurrences of each word in *words*.

    Words are lowercased before counting, so ``"SEO"`` and ``"seo"`` share
    one entry. Uses ``collections.Counter`` instead of the hand-rolled
    if/else accumulation — same result, idiomatic and C-accelerated.

    :param words: iterable of word strings
    :return: dict mapping lowercased word -> occurrence count
    """
    # dict() keeps the declared return type identical to the original.
    return dict(Counter(word.lower() for word in words))
def get_top_words(word_count, n):
    """Return the *n* most frequent (word, count) pairs, highest first.

    Ties keep the dictionary's iteration order because ``sorted`` is stable.

    :param word_count: dict mapping word -> count (see ``get_word_count``)
    :param n: how many top entries to return
    :return: list of ``(word, count)`` tuples, at most *n* long
    """
    ranked = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
def main():
    """Crawl https://www.oncrawl.com, count words across the crawled pages,
    and print the 10 most frequent (word, count) pairs."""
    start_url = 'https://www.oncrawl.com'
    # Pipeline: crawl -> tokenize -> count -> rank.
    ranking = get_top_words(get_word_count(get_words(get_text(start_url))), 10)
    print(ranking)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment