from bs4 import BeautifulSoup
import requests
import regex  # third-party "regex" package (pip install regex); the stdlib "re" module would work the same here

def get(url):
    # download the page at `url` and parse its HTML into a BeautifulSoup tree
    mycontent = requests.get(url)
    soup = BeautifulSoup(mycontent.text, "html.parser")
    return soup

# the links we want to visit
my_list_of_links = ["https://www.nytimes.com/2019/07/02/dining/ice-cream-shops.html"]
# the links that we don't want to visit twice
already_visited = []
# while we still have links to visit, continue
while len(my_list_of_links) > 0:
    # get the first element of the list and remove it from the list
    current_link = my_list_of_links.pop(0)
    # add the link to the "already_visited" list, to avoid visiting it twice
    already_visited.append(current_link)
    # get the html content from the link (URL)
    html = get(current_link)
    # for tomorrow, find a way to fill all these variables from the "html" object
    # I recommend that you check out the BeautifulSoup documentation to find what you need
    # you also need to read the html content to find the right html tags to extract
    # (the placeholders are set to None so that the script still runs)
    url = None
    title = None
    content = None
    writing_time = None
    author = None
    crawling_time = None
    links = None
    keywords = None
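    # A possible sketch for filling these in, left commented out so the exercise
    # stays open. The tag and attribute names below are guesses about the article
    # markup, not verified against the live page; inspect the HTML and adapt them:
    # url = current_link
    # crawling_time = datetime.datetime.now().isoformat()  # needs "import datetime" at the top
    # title_tag = html.find('h1')
    # title = title_tag.get_text(strip=True) if title_tag else None
    # content = " ".join(p.get_text(strip=True) for p in html.find_all('p'))
    # meta_author = html.find('meta', attrs={'name': 'author'})
    # author = meta_author['content'] if meta_author else None
    # meta_time = html.find('meta', attrs={'property': 'article:published_time'})
    # writing_time = meta_time['content'] if meta_time else None
    # meta_keywords = html.find('meta', attrs={'name': 'keywords'})
    # keywords = meta_keywords['content'] if meta_keywords else None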
    # base_path: everything up to (and including) the last '/' of the current URL
    base_path = regex.match(r'^(.*/)[^/]*', current_link)[1]
    # domain_name: protocol + domain, e.g. "https://www.nytimes.com"
    domain_name = regex.match(r'^[a-z]+://[^/]*', current_link)[0]
    print("base_path", base_path)
    print("domain_name", domain_name)
    # to put everything in a database, we need to:
    # 1) connect to the db at the top of the script
    # 2) insert the values in this loop each time
    # 3) don't forget to commit after the execute
    # (see the sqlite3 sketch just below)
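    # A minimal sketch with the standard-library sqlite3 module, commented out
    # because the table name and columns are made up for illustration:
    # import sqlite3                         # at the top of the script
    # conn = sqlite3.connect('articles.db')  # at the top of the script
    # conn.execute("""CREATE TABLE IF NOT EXISTS articles
    #                 (url TEXT, title TEXT, content TEXT, writing_time TEXT,
    #                  author TEXT, crawling_time TEXT, keywords TEXT)""")
    # conn.execute("INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?)",
    #              (url, title, content, writing_time, author, crawling_time, keywords))
    # conn.commit()  # don't forget to commit after the execute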
    # retrieve all the <a> tags to get new links
    all_a_tags = html.find_all('a')
    for tag in all_a_tags:
        link = tag.get('href')  # should look like https://www.nytimes.com/...
        if link is None:
            # <a> tags without an href attribute would crash the regex match below
            continue
        # only add the link if it's not empty and if it's from the same website;
        # that check alone doesn't work for relative links, so we first turn
        # relative links into absolute ones => use regular expressions
        # 1) match the link against the regular expression r'(([a-z]+)://([^/]*))?(.*)'
        match = regex.match(r'(([a-z]+)://([^/]*))?(.*)', link)
        # recall that match[0] is the whole string, match[2] is the protocol,
        # match[3] is the domain name and match[4] is the path
        # 2) if everything is None except the path (match[4]), it's a relative link
        # 3) if it's a relative link, resolve it against domain_name or base_path
        if match and not match[1] and match[4]:  # only the path matched
            # it is a relative link
            if match[4][0] == '/':  # it's relative to the root of the webserver
                link = domain_name + link
            else:  # it's relative to our current path
                # we need to prepend the base path of our current URL
                link = base_path + link
            print("link", link)
        if "https://www.nytimes.com" in link:
            # only add the link if we didn't already visit it or queue it
            if link not in already_visited and link not in my_list_of_links:
                my_list_of_links.append(link)
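    # note: the standard library can replace the regex handling above;
    # urllib.parse.urljoin resolves root-relative and path-relative links alike:
    # from urllib.parse import urljoin  # at the top of the script
    # link = urljoin(current_link, tag.get('href') or "")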
print("number of visited links", len(already_visited))
print("number of links", len(my_list_of_links))