@KTibow · Last active June 20, 2020 17:18
Web crawler in Python with Regex!
# Libs
import re
from urllib.request import urlopen
from urllib.parse import urlparse
yoursite = "https://www.google.com/"
crawllist = [{"url": yoursite, "crawled": False}]
undecodable = []
depth = 5
for i in range(depth):
    # Drop duplicate entries from the crawl list
    nlist = []
    for url in crawllist:
        if url not in nlist:
            nlist.append(url)
    crawllist = nlist
    # Message + copy list
    print("Depth", i, "/", depth)
    cwr = crawllist.copy()
    # Go through sites
    for k, url in enumerate(cwr):
        # Check if not crawled and message
        if not url["crawled"]:
            print("Crawling " + url["url"])
            try:
                # Download and parse
                resy = urlopen(url["url"]).read().decode()
                links = list(set(re.findall(r"(?i)(?::|src|href|background)(?: ?=)? ?\"(?:http|https)?(?::\/\/" +
                                            urlparse(yoursite).netloc.replace(".", r"\.") +
                                            r")?(\/?[\w\d\/\.]{2}[\w\d\/\.]{0,50})\"",
                                            resy)))
                # Go through links and test
                for j, l in enumerate(links):
                    try:
                        # First try the link appended directly to the page URL
                        ressy = urlopen(url["url"] + l)
                        if {"url": url["url"] + l, "crawled": False} not in crawllist:
                            toCrawl = False
                            try:
                                ressy.read().decode()
                                toCrawl = True
                            except UnicodeDecodeError:
                                undecodable.append(url["url"] + l)
                            crawllist.append({"url": url["url"] + l, "crawled": not toCrawl})
                            print("======== Found " + url["url"] + l + " in " + url["url"])
                    except Exception:
                        try:
                            # Then try with a slash between the page URL and the link
                            ressy = urlopen(url["url"] + "/" + l)
                            if {"url": url["url"] + "/" + l, "crawled": False} not in crawllist:
                                toCrawl = False
                                try:
                                    ressy.read().decode()
                                    toCrawl = True
                                except UnicodeDecodeError:
                                    undecodable.append(url["url"] + "/" + l)
                                crawllist.append({"url": url["url"] + "/" + l, "crawled": not toCrawl})
                                print("======== Found " + url["url"] + "/" + l + " in " + url["url"] + "/")
                        except Exception:
                            try:
                                # Finally try the link relative to the page's parent directory
                                baseurl = "/".join(url["url"].split("/")[:-1])
                                ressy = urlopen(baseurl + "/" + l)
                                if {"url": baseurl + "/" + l, "crawled": False} not in crawllist:
                                    toCrawl = False
                                    try:
                                        ressy.read().decode()
                                        toCrawl = True
                                    except UnicodeDecodeError:
                                        undecodable.append(baseurl + "/" + l)
                                    crawllist.append({"url": baseurl + "/" + l, "crawled": not toCrawl})
                                    print("======== Found " + baseurl + "/" + l + " in " + baseurl + "/")
                            except Exception:
                                print("======== :/", l)
                    # Per-link progress bar
                    print("========[" + ((j + 1) * "#") + ((len(links) - j) * " ") + "]")
            except UnicodeDecodeError:
                print("UnicodeDecodeError")
            except Exception as e:
                print(e)
            # Per-page progress bar, then mark this page as crawled
            print("[" + ((k + 1) * "#") + ((len(cwr) - k) * " ") + "]")
            url["crawled"] = True
# Print the unique URLs that were found, then the ones that could not be decoded
urllist = []
for url in crawllist:
    if url["url"] not in urllist:
        urllist.append(url["url"])
print("URLs:")
for url in urllist:
    print(url)
print("Undecodable:")
for url in list(set(undecodable)):
    print(url)
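For reference, here is a minimal, self-contained sketch of what the link-extraction regex above captures. The sample HTML, the pattern variable name, and the expected output are illustrative only and are not part of the crawler itself.

import re
from urllib.parse import urlparse

# Build the same pattern the crawler uses, for "https://www.google.com/"
pattern = (r"(?i)(?::|src|href|background)(?: ?=)? ?\"(?:http|https)?(?::\/\/" +
           urlparse("https://www.google.com/").netloc.replace(".", r"\.") +
           r")?(\/?[\w\d\/\.]{2}[\w\d\/\.]{0,50})\"")

# Made-up HTML snippet: one absolute same-site link and one relative image path
html = '<a href="https://www.google.com/maps">Maps</a> <img src="logo.png">'
print(re.findall(pattern, html))  # expected: ['/maps', 'logo.png']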