Created
August 12, 2021 15:23
-
-
Save voltek62/19c36395cc501c578cc3bb3ad792ce8a to your computer and use it in GitHub Desktop.
OpenAI Codex : """ 1. get all relative links from this web page in a list called links - https://www.oncrawl.com 2. go through links list 3. prefix "https://www.oncrawl.com" to each link 4. visit each link 5. extract text from the page 6. save the page text to a new list called pages 7. implement error handling. 8. implement unit tests import re…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
1. get all relative links from this web page in a list called links - https://www.oncrawl.com | |
2. go through links list | |
3. prefix "https://www.oncrawl.com" to each link | |
4. visit each link | |
5. extract text from the page | |
6. save the page text to a new list called pages | |
7. implement error handling. | |
8. implement unit tests | |
import requests | |
""" | |
import random
import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Return the href of every anchor tag on the page at *url*.

    Fixes over the original:
    - Anchors without an ``href`` attribute yield ``None`` from
      ``link.get('href')``; the original appended those Nones, which later
      crashed ``link.startswith('/')`` in ``get_text``. They are skipped here.
    - Network failures (DNS errors, timeouts, HTTP error statuses) return an
      empty list instead of raising, per the script's "implement error
      handling" requirement.

    :param url: page to fetch (e.g. ``https://www.oncrawl.com``)
    :return: list of href strings (relative or absolute, as found in the HTML)
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a.get('href') for a in soup.find_all('a') if a.get('href') is not None]
def get_text(url):
    """Crawl every link found on *url* and collect the visible text nodes.

    Relative links (starting with ``/``) are prefixed with
    ``https://www.oncrawl.com``; other links are fetched as-is, matching the
    original behavior. Text whose parent tag is non-visible (document root,
    head, script, style, meta) is discarded.

    Fixes over the original:
    - Falsy links (``None``/empty href) are skipped instead of crashing on
      ``startswith``.
    - A page that fails to download is skipped instead of aborting the whole
      crawl, per the script's "implement error handling" requirement.

    :param url: start page whose links will be visited
    :return: flat list of text fragments gathered from all visited pages
    """
    pages = []
    # Parents whose text is never rendered to the user.
    invisible_parents = {'[document]', 'head', 'script', 'style', 'meta'}
    for link in get_links(url):
        if not link:
            # Defensive: get_links may yield None or empty hrefs.
            continue
        if link.startswith('/'):
            link = 'https://www.oncrawl.com' + link
        try:
            response = requests.get(link)
        except requests.RequestException:
            continue  # skip unreachable pages, keep crawling the rest
        soup = BeautifulSoup(response.text, 'html.parser')
        for fragment in soup.find_all(text=True):
            if fragment.parent.name not in invisible_parents:
                pages.append(fragment)
    return pages
def get_words(pages):
    """Tokenize every text fragment in *pages* into a flat list of words.

    A "word" is any maximal run of word characters (``\\w+``), so
    punctuation and whitespace act as separators.

    :param pages: iterable of text strings (as produced by ``get_text``)
    :return: list of word tokens, in document order
    """
    return [token for text in pages for token in re.findall(r'\w+', text)]
def get_word_count(words):
    """Count case-insensitive occurrences of each word in *words*.

    Words are lowercased before counting, so ``"SEO"`` and ``"seo"`` share
    one entry. Uses ``collections.Counter`` instead of the hand-rolled
    if/else accumulation — same result, idiomatic and C-accelerated.

    :param words: iterable of word strings
    :return: dict mapping lowercased word -> occurrence count
    """
    # dict() keeps the declared return type identical to the original.
    return dict(Counter(word.lower() for word in words))
def get_top_words(word_count, n):
    """Return the *n* most frequent (word, count) pairs, highest first.

    Ties keep the dictionary's iteration order because ``sorted`` is stable.

    :param word_count: dict mapping word -> count (see ``get_word_count``)
    :param n: how many top entries to return
    :return: list of ``(word, count)`` tuples, at most *n* long
    """
    ranked = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
def main():
    """Crawl https://www.oncrawl.com, count words across the crawled pages,
    and print the 10 most frequent (word, count) pairs."""
    start_url = 'https://www.oncrawl.com'
    # Pipeline: crawl -> tokenize -> count -> rank.
    ranking = get_top_words(get_word_count(get_words(get_text(start_url))), 10)
    print(ranking)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment