Skip to content

Instantly share code, notes, and snippets.

@otavio-s-s
Last active December 3, 2020 20:56
Show Gist options
  • Save otavio-s-s/8fb5502d653e5f583b5b35593f17a5b1 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import time

# Seed NumPy's RNG from wall-clock time so each run takes a different walk.
np.random.seed(int(time.time()))

# Hrefs of articles already visited in this crawl. Storing the href string
# (not the bs4 Tag) makes the membership test a plain string comparison.
pages_crawled = []

# Internal article links look like "/wiki/Python"; namespaced pages such as
# "/wiki/Special:Random" contain a colon and are excluded by the lookahead.
# Raw string: '/' needs no escaping, and non-raw '\/' is a bad escape.
ARTICLE_HREF = re.compile(r'^/wiki/((?!:).)*$')


def random_crawler(url):
    """Follow a random chain of Wikipedia article links starting at *url*.

    Fetches the page, picks one random internal article link, appends
    "title; h1; href" to data.csv, then recurses into the chosen article.
    The chain stops at an already-visited link, a page with no candidate
    links, or a network/file/recursion error (best-effort crawl).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.find_all('a', {'href': ARTICLE_HREF})
    if not links:
        return  # dead end: no internal article links on this page

    # randint's upper bound is exclusive, so valid indices are
    # 0 .. len(links)-1. (The original randint(1, len(links) + 1) never
    # chose index 0 and could raise IndexError at index len(links).)
    link = links[np.random.randint(0, len(links))]
    href = link['href']
    if href in pages_crawled:
        return  # already visited: end this chain
    pages_crawled.append(href)

    new_link = f"https://en.wikipedia.org{href}"
    try:
        with open('data.csv', 'a') as file:
            file.write(f'{soup.title.text}; {soup.h1.text}; {href}\n')
        random_crawler(new_link)
    except (requests.RequestException, OSError, RecursionError):
        # Best-effort: stop the chain on network or file errors, or when
        # Python's recursion limit is exhausted. (The original used a bare
        # `except:` with `continue`, which is a SyntaxError outside a loop
        # and would also have swallowed KeyboardInterrupt.)
        return


random_crawler('https://en.wikipedia.org')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment