Skip to content

Instantly share code, notes, and snippets.

@otavio-s-s
Last active December 3, 2020 20:56
Show Gist options
  • Save otavio-s-s/8fb5502d653e5f583b5b35593f17a5b1 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import time

# Seed NumPy's RNG from wall-clock time so each run takes a different walk.
np.random.seed(int(time.time()))

# Hrefs of articles already visited in this crawl. Storing the href string
# (not the bs4 Tag) makes the membership test a plain string comparison.
pages_crawled = []

# Internal article links look like "/wiki/Python"; namespaced pages such as
# "/wiki/Special:Random" contain a colon and are excluded by the lookahead.
# Raw string: '/' needs no escaping, and non-raw '\/' is a bad escape.
ARTICLE_HREF = re.compile(r'^/wiki/((?!:).)*$')


def random_crawler(url):
    """Follow a random chain of Wikipedia article links starting at *url*.

    Fetches the page, picks one random internal article link, appends
    "title; h1; href" to data.csv, then recurses into the chosen article.
    The chain stops at an already-visited link, a page with no candidate
    links, or a network/file/recursion error (best-effort crawl).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.find_all('a', {'href': ARTICLE_HREF})
    if not links:
        return  # dead end: no internal article links on this page

    # randint's upper bound is exclusive, so valid indices are
    # 0 .. len(links)-1. (The original randint(1, len(links) + 1) never
    # chose index 0 and could raise IndexError at index len(links).)
    link = links[np.random.randint(0, len(links))]
    href = link['href']
    if href in pages_crawled:
        return  # already visited: end this chain
    pages_crawled.append(href)

    new_link = f"https://en.wikipedia.org{href}"
    try:
        with open('data.csv', 'a') as file:
            file.write(f'{soup.title.text}; {soup.h1.text}; {href}\n')
        random_crawler(new_link)
    except (requests.RequestException, OSError, RecursionError):
        # Best-effort: stop the chain on network or file errors, or when
        # Python's recursion limit is exhausted. (The original used a bare
        # `except:` with `continue`, which is a SyntaxError outside a loop
        # and would also have swallowed KeyboardInterrupt.)
        return


random_crawler('https://en.wikipedia.org')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment