Skip to content

Instantly share code, notes, and snippets.

@voltek62
Created August 12, 2021 15:23
Show Gist options
  • Save voltek62/19c36395cc501c578cc3bb3ad792ce8a to your computer and use it in GitHub Desktop.
OpenAI Codex : """ 1. get all relative links from this web page in a list called links - https://www.oncrawl.com 2. go through links list 3. prefix "https://www.oncrawl.com" to each link 4. visit each link 5. extract text from the page 6. save the page text to a new list called pages 7. implement error handling. 8. implement unit tests import re…
"""
1. get all relative links from this web page in a list called links - https://www.oncrawl.com
2. go through links list
3. prefix "https://www.oncrawl.com" to each link
4. visit each link
5. extract text from the page
6. save the page text to a new list called pages
7. implement error handling.
8. implement unit tests
import requests
"""
import requests
from bs4 import BeautifulSoup
import re
import time
import random
def get_links(url):
    """Fetch *url* and return the href value of every <a> tag on the page.

    Hrefs are returned exactly as they appear in the HTML (relative or
    absolute). Anchors without an href attribute are skipped — the original
    appended None for them, which crashed callers that call .startswith()
    on each link.

    Raises requests.RequestException on network failure or HTTP error.
    """
    # Timeout so a hung server cannot stall the crawl forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # link.get('href') is None when the <a> tag has no href; filter those out.
    return [a.get('href') for a in soup.find_all('a') if a.get('href') is not None]
def get_text(url):
    """Crawl every site-relative link on *url* and collect visible text.

    For each link returned by get_links() that starts with '/', the link is
    prefixed with 'https://www.oncrawl.com', fetched, and the text nodes of
    the page (excluding <head>, <script>, <style>, <meta> and the document
    wrapper) are appended to the result.

    Returns a flat list of text fragments. Pages that fail to download are
    skipped (best-effort crawl) rather than aborting the whole run.
    """
    pages = []
    for link in get_links(url):
        # Only follow site-relative links; fragments ('#top'), mailto: and
        # absolute external URLs are outside the crawl scope and would make
        # requests.get raise MissingSchema or leave the site.
        if not link.startswith('/'):
            continue
        full_url = 'https://www.oncrawl.com' + link
        try:
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue  # spec item 7: error handling — skip unreachable pages
        soup = BeautifulSoup(response.text, 'html.parser')
        # string=True is the modern spelling of the deprecated text=True.
        for fragment in soup.find_all(string=True):
            if fragment.parent.name not in ('[document]', 'head', 'script', 'style', 'meta'):
                pages.append(fragment)
    return pages
def get_words(pages):
    """Tokenize each text fragment in *pages* into words.

    A word is a maximal run of ``\\w`` characters (letters, digits,
    underscore), as matched by ``re.findall(r'\\w+', ...)``.

    Returns a single flat list of all words in order of appearance.
    (The previous docstring was copy-pasted from another function and
    described fetching pages, which this function does not do.)
    """
    return [word for page in pages for word in re.findall(r'\w+', page)]
def get_word_count(words):
    """Count case-insensitive occurrences of each word in *words*.

    Returns a dict mapping the lowercased word to the number of times it
    appears (so 'SEO' and 'seo' share one entry).
    """
    word_count = {}
    for word in words:
        # Lowercase once per word (the original recomputed .lower() up to
        # three times) and use dict.get instead of an LBYL membership test.
        key = word.lower()
        word_count[key] = word_count.get(key, 0) + 1
    return word_count
def get_top_words(word_count, n):
    """Return the *n* most frequent (word, count) pairs, highest count first.

    Ties keep the order produced by the stable sort over the dict's items.
    Returns fewer than *n* pairs when the dict has fewer entries.
    """
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]
def main():
    """Crawl https://www.oncrawl.com and print its ten most frequent words."""
    base_url = 'https://www.oncrawl.com'
    page_texts = get_text(base_url)
    all_words = get_words(page_texts)
    counts = get_word_count(all_words)
    print(get_top_words(counts, 10))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment