Skip to content

Instantly share code, notes, and snippets.

@otavio-s-s
Last active December 3, 2020 20:56
Show Gist options
  • Save otavio-s-s/5a4cea87c2b8702af9fe31da0fa9d03e to your computer and use it in GitHub Desktop.
Save otavio-s-s/5a4cea87c2b8702af9fe31da0fa9d03e to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
# Site-relative hrefs ("/wiki..." paths) that crawler() has already
# discovered. Module-level state shared across the whole crawl so that a
# link is recorded and followed at most once.
pages_crawled = []
def _scrape_page(url):
    """Fetch *url* and return ``(title, h1, hrefs)``, or ``None`` if unusable.

    ``hrefs`` is an iterator over the page's site-relative links that start
    with ``/wiki`` and contain no ``:``, in document order.  Only the
    extracted values are returned, so the parsed tree is not kept around
    for the rest of the crawl.

    ``None`` is returned when the request fails or when the page has no
    ``<title>`` or no ``<h1>`` (both are needed for the output row).
    """
    try:
        # Without a timeout a single stalled connection would hang the crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.title is None or soup.h1 is None:
        return None

    hrefs = [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/wiki') and ':' not in anchor['href']
    ]
    return soup.title.text, soup.h1.text, iter(hrefs)


def crawler(url):
    """Crawl Wikipedia article links depth-first, starting at *url*.

    Each newly discovered link is appended to the module-level
    ``pages_crawled`` list and written to ``data.csv`` as one line of the
    form ``<title>; <h1>; <href>``, where ``<title>`` and ``<h1>`` belong to
    the page on which the link was found.  The linked page is then crawled
    before the remaining links of the current page are examined.

    Pages that cannot be fetched, or that lack a ``<title>`` or ``<h1>``,
    are skipped instead of aborting the crawl.

    Args:
        url: Absolute URL of the page to start from.

    Raises:
        OSError: If ``data.csv`` cannot be opened or written.
    """
    start = _scrape_page(url)
    if start is None:
        return

    # Set mirror of pages_crawled: O(1) membership instead of scanning the
    # ever-growing list for every link.
    seen = set(pages_crawled)

    # Explicit stack (deepest page last) instead of recursion, so long link
    # chains cannot run into the interpreter's recursion limit.
    pending = [start]

    # Explicit UTF-8: titles routinely contain non-ASCII characters, which a
    # platform-default encoding may be unable to write.
    with open('data.csv', 'a', encoding='utf-8') as file:
        while pending:
            title, heading, hrefs = pending[-1]

            # Advance this page's iterator to its next undiscovered link.
            # The check is made now (not when the page was parsed) so links
            # found deeper in the crawl are taken into account.
            for href in hrefs:
                if href not in seen:
                    break
            else:
                # Page exhausted: resume with the page that linked to it.
                pending.pop()
                continue

            seen.add(href)
            pages_crawled.append(href)
            file.write(f'{title}; {heading}; {href}\n')
            # Keep rows visible on disk as the (very long) crawl progresses.
            file.flush()

            child = _scrape_page(f"https://en.wikipedia.org{href}")
            if child is not None:
                pending.append(child)
# Start the crawl at the English Wikipedia site root. This call is not
# guarded by `if __name__ == "__main__"`, so it runs whenever the module is
# executed or imported.
crawler('https://en.wikipedia.org')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment