Skip to content

Instantly share code, notes, and snippets.

@lewis-carson
Created November 6, 2017 17:16
Show Gist options
  • Save lewis-carson/90c81f7fa5dbd11985a680dc182dbfef to your computer and use it in GitHub Desktop.
lightweight web crawler
from bs4 import BeautifulSoup
import random
import requests
import re
class crawler():
    """Lightweight breadth-first web crawler.

    NOTE(review): the lowercase class name is kept to preserve the public
    interface; PEP 8 would prefer ``Crawler``.
    """

    def findurls(self, limit, seed):
        """Crawl outward from *seed* and collect the URLs actually fetched.

        Parameters:
            limit: number of pages to fetch before stopping.
            seed:  iterable of starting URLs (not mutated).

        Returns:
            List of fetched URLs. Length is *limit* when enough pages were
            reachable, shorter if the crawl frontier ran dry first (the
            original implicitly returned None in that case).
        """
        # Copy the seed: the original aliased it and appended every
        # discovered link into the caller's list.
        cache = list(seed)
        history = []
        # Appending to `cache` while iterating it is deliberate — it is
        # how the frontier grows.
        for currenturl in cache:
            try:
                # Timeout so one dead host cannot hang the whole crawl.
                response = requests.get(currenturl, timeout=10)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                for anchor in soup.find_all('a', href=True):
                    link = anchor['href']
                    # Skip links already fetched AND already queued — the
                    # original only checked `history`, so a link seen twice
                    # before being fetched was queued (and fetched) twice.
                    if link not in history and link not in cache:
                        cache.append(link)
                history.append(currenturl)
                if len(history) == limit:
                    return history
            except Exception:
                # Best-effort crawl: skip unreachable or unparsable pages.
                # (Original used a bare `except:` with a dead assignment.)
                continue
        return history

    def searchwithseed(self, limit, seed, word):
        """Crawl from *seed* and return URLs whose page text contains *word*.

        Parameters:
            limit: number of pages to fetch before stopping.
            seed:  iterable of starting URLs (not mutated).
            word:  substring to look for in each page's raw text.

        Returns:
            List of matching URLs found within the first *limit* fetches
            (possibly empty; shorter crawls return partial results).
        """
        cache = list(seed)
        history = []
        found = []
        for currenturl in cache:
            try:
                response = requests.get(currenturl, timeout=10)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                for anchor in soup.find_all('a', href=True):
                    link = anchor['href']
                    if link not in history and link not in cache:
                        cache.append(link)
                # Substring match against the raw page text, as in the
                # original (matches markup as well as visible text).
                if word in page:
                    found.append(currenturl)
                history.append(currenturl)
                if len(history) == limit:
                    return found
            except Exception:
                # Best-effort crawl: skip pages that fail to fetch/parse.
                continue
        return found
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment