from bs4 import BeautifulSoup
import requests
import regex  # third-party "regex" package (pip install regex); the stdlib "re" module would work the same here

def get(url):
    # download the page at `url` and parse its HTML into a BeautifulSoup tree
    mycontent = requests.get(url)
    soup = BeautifulSoup(mycontent.text, "html.parser")
    return soup

# the links we want to visit
my_list_of_links = ["https://www.nytimes.com/2019/07/02/dining/ice-cream-shops.html"]
# the links that we don't want to visit twice
already_visited = []
# while we still have links to visit, continue
while len(my_list_of_links) > 0:
    # get the first element of the list and remove it from the list
    current_link = my_list_of_links.pop(0)
    # add the link to the "already_visited" list, to avoid visiting it twice
    already_visited.append(current_link)
    # get the html content from the link (URL)
    html = get(current_link)
    # for tomorrow, find a way to fill all these variables from the "html" object
    # I recommend that you check out the BeautifulSoup documentation to find what you need
    # you also need to read the html content to find the right html tags to extract
    # (the placeholders are set to None so that the script still runs)
    url = None
    title = None
    content = None
    writing_time = None
    author = None
    crawling_time = None
    links = None
    keywords = None
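    # A possible sketch for filling these in, left commented out so the exercise
    # stays open. The tag and attribute names below are guesses about the article
    # markup, not verified against the live page; inspect the HTML and adapt them:
    # url = current_link
    # crawling_time = datetime.datetime.now().isoformat()  # needs "import datetime" at the top
    # title_tag = html.find('h1')
    # title = title_tag.get_text(strip=True) if title_tag else None
    # content = " ".join(p.get_text(strip=True) for p in html.find_all('p'))
    # meta_author = html.find('meta', attrs={'name': 'author'})
    # author = meta_author['content'] if meta_author else None
    # meta_time = html.find('meta', attrs={'property': 'article:published_time'})
    # writing_time = meta_time['content'] if meta_time else None
    # meta_keywords = html.find('meta', attrs={'name': 'keywords'})
    # keywords = meta_keywords['content'] if meta_keywords else None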
    # base_path: everything up to (and including) the last '/' of the current URL
    base_path = regex.match(r'^(.*/)[^/]*', current_link)[1]
    # domain_name: protocol + domain, e.g. "https://www.nytimes.com"
    domain_name = regex.match(r'^[a-z]+://[^/]*', current_link)[0]
    print("base_path", base_path)
    print("domain_name", domain_name)
    # to put everything in a database, we need to:
    # 1) connect to the db at the top of the script
    # 2) insert the values in this loop each time
    # 3) don't forget to commit after the execute
    # (see the sqlite3 sketch just below)
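    # A minimal sketch with the standard-library sqlite3 module, commented out
    # because the table name and columns are made up for illustration:
    # import sqlite3                         # at the top of the script
    # conn = sqlite3.connect('articles.db')  # at the top of the script
    # conn.execute("""CREATE TABLE IF NOT EXISTS articles
    #                 (url TEXT, title TEXT, content TEXT, writing_time TEXT,
    #                  author TEXT, crawling_time TEXT, keywords TEXT)""")
    # conn.execute("INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?)",
    #              (url, title, content, writing_time, author, crawling_time, keywords))
    # conn.commit()  # don't forget to commit after the execute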
    # retrieve all the <a> tags to get new links
    all_a_tags = html.find_all('a')
    for tag in all_a_tags:
        link = tag.get('href')  # should look like https://www.nytimes.com/...
        if link is None:
            # <a> tags without an href attribute would crash the regex match below
            continue
        # only add the link if it's not empty and if it's from the same website;
        # that check alone doesn't work for relative links, so we first turn
        # relative links into absolute ones => use regular expressions
        # 1) match the link against the regular expression r'(([a-z]+)://([^/]*))?(.*)'
        match = regex.match(r'(([a-z]+)://([^/]*))?(.*)', link)
        # recall that match[0] is the whole string, match[2] is the protocol,
        # match[3] is the domain name and match[4] is the path
        # 2) if everything is None except the path (match[4]), it's a relative link
        # 3) if it's a relative link, resolve it against domain_name or base_path
        if match and not match[1] and match[4]:  # only the path matched
            # it is a relative link
            if match[4][0] == '/':  # it's relative to the root of the webserver
                link = domain_name + link
            else:  # it's relative to our current path
                # we need to prepend the base path of our current URL
                link = base_path + link
            print("link", link)
        if "https://www.nytimes.com" in link:
            # only add the link if we didn't already visit it or queue it
            if link not in already_visited and link not in my_list_of_links:
                my_list_of_links.append(link)
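    # note: the standard library can replace the regex handling above;
    # urllib.parse.urljoin resolves root-relative and path-relative links alike:
    # from urllib.parse import urljoin  # at the top of the script
    # link = urljoin(current_link, tag.get('href') or "")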
print("number of visited links", len(already_visited))
print("number of links", len(my_list_of_links))