# otavio-s-s/crawler.py

## crawler.py
from bs4 import BeautifulSoup
import requests

# hrefs (strings beginning with '/wiki') that crawler() has already recorded.
# crawler() checks membership here and appends to it, so the list is shared
# by every recursive call.
pages_crawled = []


def crawler(url, timeout=10):
    """Crawl Wikipedia article links depth-first, starting from *url*.

    The page at *url* is fetched and parsed. Every ``<a>`` whose ``href``
    starts with ``/wiki``, contains no ``:`` and is not yet in the
    module-level ``pages_crawled`` list is recorded there, logged as one
    ``<page title>; <page h1>; <href>`` line appended to ``data.csv``, and
    then crawled recursively under ``https://en.wikipedia.org``.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up on a
            request; passed to ``requests.get`` and on to recursive calls.

    Raises:
        requests.RequestException: If fetching *url* itself fails. Failures
            on linked pages are skipped instead of propagated.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Resolve the page's own title and heading once; they are identical for
    # every row written below. A page lacking <title> or <h1> yields '' so
    # its links are still written and followed instead of being dropped.
    title = soup.title.text if soup.title is not None else ''
    heading = soup.h1.text if soup.h1 is not None else ''

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or not href.startswith('/wiki') or ':' in href:
            continue
        if href in pages_crawled:
            continue

        # Mark before recursing so cycles between pages terminate.
        pages_crawled.append(href)
        try:
            # Explicit UTF-8: page titles routinely contain non-ASCII text.
            with open('data.csv', 'a', encoding='utf-8') as file:
                file.write(f'{title}; {heading}; {href}\n')
            crawler(f'https://en.wikipedia.org{href}', timeout)
        except (requests.RequestException, OSError, RecursionError):
            # Best effort: a linked page that cannot be fetched or written,
            # or a branch that reaches the interpreter's recursion limit, is
            # skipped. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the crawl can be stopped.
            continue


# Start the crawl at the English Wikipedia root URL.
crawler(url='https://en.wikipedia.org')
	from bs4 import BeautifulSoup
	import requests

	# hrefs (strings beginning with '/wiki') that crawler() has already recorded.
	# crawler() checks membership here and appends to it, so the list is shared
	# by every recursive call.
	pages_crawled = []


	def crawler(url, timeout=10):
	    """Crawl Wikipedia article links depth-first, starting from *url*.

	    The page at *url* is fetched and parsed. Every ``<a>`` whose ``href``
	    starts with ``/wiki``, contains no ``:`` and is not yet in the
	    module-level ``pages_crawled`` list is recorded there, logged as one
	    ``<page title>; <page h1>; <href>`` line appended to ``data.csv``, and
	    then crawled recursively under ``https://en.wikipedia.org``.

	    Args:
	        url: Absolute URL of the page to fetch.
	        timeout: Seconds to wait for the server before giving up on a
	            request; passed to ``requests.get`` and on to recursive calls.

	    Raises:
	        requests.RequestException: If fetching *url* itself fails. Failures
	            on linked pages are skipped instead of propagated.
	    """
	    page = requests.get(url, timeout=timeout)
	    soup = BeautifulSoup(page.text, 'html.parser')

	    # Resolve the page's own title and heading once; they are identical for
	    # every row written below. A page lacking <title> or <h1> yields '' so
	    # its links are still written and followed instead of being dropped.
	    title = soup.title.text if soup.title is not None else ''
	    heading = soup.h1.text if soup.h1 is not None else ''

	    for link in soup.find_all('a'):
	        href = link.get('href')
	        if not href or not href.startswith('/wiki') or ':' in href:
	            continue
	        if href in pages_crawled:
	            continue

	        # Mark before recursing so cycles between pages terminate.
	        pages_crawled.append(href)
	        try:
	            # Explicit UTF-8: page titles routinely contain non-ASCII text.
	            with open('data.csv', 'a', encoding='utf-8') as file:
	                file.write(f'{title}; {heading}; {href}\n')
	            crawler(f'https://en.wikipedia.org{href}', timeout)
	        except (requests.RequestException, OSError, RecursionError):
	            # Best effort: a linked page that cannot be fetched or written,
	            # or a branch that reaches the interpreter's recursion limit, is
	            # skipped. KeyboardInterrupt/SystemExit are deliberately not
	            # caught so the crawl can be stopped.
	            continue


	# Start the crawl at the English Wikipedia root URL.
	crawler(url='https://en.wikipedia.org')