wsricardo/main.html

## main.html
<!DOCTYPE html>
<html>
<head>
<title></title>
</head>

<body>

    <a href="sas.svg">BannerImage</a>
    <a href="http://www.google.com">Google</a>
    <a href="https://www.dimensaoalfa.com.br">Notícias Dimensão Alfa</a>
    <a href="www.wsricardo.blogspot.com">Meu Blog</a>
    <a href="#">Banner</a>
    <a href="noticias/n1.html">Notícias</a>
    <a href="http://youtube.com">Youtube</a>
    <a href="https://wikipedia.org">Wikipédia</a>
</body>
</html>

## spider.py
"""
    Author: WSRicardo
    Youtube: https://www.youtube.com/@dimensaoalfa
    Blog: https://wsricardo.blogspot.com
    Site: www.dimensaoalfa.com.br
    Github: www.github.com/wsricardo
"""

from bs4 import BeautifulSoup

def get_links(webcontent, *, min_text_length=4):
    """
        Extract the absolute HTTP(S) links found in an HTML document.

        Arguments
            webcontent - HTML text to parse.
            min_text_length - keyword-only. Minimum number of characters the
                anchor text must have, after stripping surrounding
                whitespace, for the link to be kept. The default of 4 is
                equivalent to the previous hard-coded `len(text) > 3` rule.

        Return
            list[ {'url': '', 'text': ''} ] - one dict per kept <a> tag, in
            the order find_all() yields them. 'url' is the href value and
            'text' is the anchor text, both with surrounding whitespace
            removed.

        An <a> tag is kept only when its href starts with "http://" or
        "https://" (case-insensitive). Tags with a missing or empty href,
        relative paths, fragments ("#") and scheme-less hosts are skipped.
    """
    accepted_schemes = ('http://', 'https://')

    soup = BeautifulSoup(webcontent, 'html.parser')
    links = []

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # No href attribute (get() returned None) or an empty one.
            continue

        url = href.strip()
        # Anchor on the scheme prefix: a plain substring test would also
        # accept hrefs such as "docs/http-guide.html" or "mailto:http@x".
        if not url.lower().startswith(accepted_schemes):
            continue

        # Measure the same stripped text that is stored, so padded or
        # whitespace-only anchors cannot slip past the length filter.
        text = anchor.text.strip()
        if len(text) < min_text_length:
            continue

        links.append({'url': url, 'text': text})

    return links

if __name__ == "__main__":
    # Demo run: load the sample page (path is relative to the current
    # working directory), echo its raw HTML, then the links found in it.
    with open("main.html", "r", encoding='utf-8') as source_file:
        html = source_file.read()

    print(html)
    print(get_links(html))
	<!DOCTYPE html>
	<html>
	<head>
	<title></title>
	</head>

	<body>

	<a href="sas.svg">BannerImage</a>
	<a href="http://www.google.com">Google</a>
	<a href="https://www.dimensaoalfa.com.br">Notícias Dimensão Alfa</a>
	<a href="www.wsricardo.blogspot.com">Meu Blog</a>
	<a href="#">Banner</a>
	<a href="noticias/n1.html">Notícias</a>
	<a href="http://youtube.com">Youtube</a>
	<a href="https://wikipedia.org">Wikipédia</a>
	</body>
	</html>
	"""
	Author: WSRicardo
	Youtube: https://www.youtube.com/@dimensaoalfa
	Blog: https://wsricardo.blogspot.com
	Site: www.dimensaoalfa.com.br
	Github: www.github.com/wsricardo
	"""

	from bs4 import BeautifulSoup

	def get_links(webcontent, *, min_text_length=4):
	    """
	        Extract the absolute HTTP(S) links found in an HTML document.

	        Arguments
	            webcontent - HTML text to parse.
	            min_text_length - keyword-only. Minimum number of characters
	                the anchor text must have, after stripping surrounding
	                whitespace, for the link to be kept. The default of 4 is
	                equivalent to the previous hard-coded `len(text) > 3` rule.

	        Return
	            list[ {'url': '', 'text': ''} ] - one dict per kept <a> tag, in
	            the order find_all() yields them. 'url' is the href value and
	            'text' is the anchor text, both with surrounding whitespace
	            removed.

	        An <a> tag is kept only when its href starts with "http://" or
	        "https://" (case-insensitive). Tags with a missing or empty href,
	        relative paths, fragments ("#") and scheme-less hosts are skipped.
	    """
	    accepted_schemes = ('http://', 'https://')

	    soup = BeautifulSoup(webcontent, 'html.parser')
	    links = []

	    for anchor in soup.find_all('a'):
	        href = anchor.get('href')
	        if not href:
	            # No href attribute (get() returned None) or an empty one.
	            continue

	        url = href.strip()
	        # Anchor on the scheme prefix: a plain substring test would also
	        # accept hrefs such as "docs/http-guide.html" or "mailto:http@x".
	        if not url.lower().startswith(accepted_schemes):
	            continue

	        # Measure the same stripped text that is stored, so padded or
	        # whitespace-only anchors cannot slip past the length filter.
	        text = anchor.text.strip()
	        if len(text) < min_text_length:
	            continue

	        links.append({'url': url, 'text': text})

	    return links

	if __name__ == "__main__":
	    # Demo run: load the sample page (path is relative to the current
	    # working directory), echo its raw HTML, then the links found in it.
	    with open("main.html", "r", encoding='utf-8') as fl:
	        html = fl.read()

	    print(html)
	    print(get_links(html))