Web crawler in Python with regex! Starting from yoursite, it repeatedly fetches pages, extracts same-host links with a single regular expression, and follows them for depth rounds, then reports every URL found plus any responses that weren't decodable text.
# Libs
import re
from urllib.request import urlopen
from urllib.parse import urlparse

yoursite = "https://www.google.com/"
crawllist = [{"url": yoursite, "crawled": False}]
undecodable = []
depth = 5


def record(found, source, ressy):
    # Queue a discovered URL unless it is already known; URLs whose
    # response body is not decodable text are logged but never crawled
    if any(entry["url"] == found for entry in crawllist):
        return
    to_crawl = False
    try:
        ressy.read().decode()
        to_crawl = True
    except UnicodeDecodeError:
        undecodable.append(found)
    crawllist.append({"url": found, "crawled": not to_crawl})
    print("======== Found " + found + " in " + source)


for i in range(depth):
    # Drop exact duplicates (dicts are unhashable, so no set() here)
    nlist = []
    for url in crawllist:
        if url not in nlist:
            nlist.append(url)
    crawllist = nlist
    # Message + copy list
    print("Depth", i, "/", depth)
    cwr = crawllist.copy()
    # Go through sites
    for n, url in enumerate(cwr):
        # Check if not crawled and message
        if not url["crawled"]:
            print("Crawling " + url["url"])
            try:
                # Download the page and pull out links with a regex: matches
                # src=/href=/background= (and ":"-style) attribute values,
                # optionally prefixed by the scheme and this site's host
                resy = urlopen(url["url"]).read().decode()
                links = list(set(re.findall(
                    r"(?i)(?::|src|href|background)(?: ?=)? ?\"(?:http|https)?(?::\/\/"
                    + re.escape(urlparse(yoursite).netloc)
                    + r")?(\/?[\w\d\/\.]{2}[\w\d\/\.]{0,50})\"",
                    resy)))
                for j, l in enumerate(links):
                    # Try the link appended as-is, then with a "/" separator,
                    # then relative to the page's parent directory
                    try:
                        record(url["url"] + l, url["url"], urlopen(url["url"] + l))
                    except Exception:
                        try:
                            record(url["url"] + "/" + l, url["url"] + "/",
                                   urlopen(url["url"] + "/" + l))
                        except Exception:
                            try:
                                baseurl = "/".join(url["url"].split("/")[:-1])
                                record(baseurl + "/" + l, baseurl + "/",
                                       urlopen(baseurl + "/" + l))
                            except Exception:
                                print("======== :/", l)
                    print("========[" + ((j + 1) * "#") + ((len(links) - j - 1) * " ") + "]")
            except UnicodeDecodeError:
                print("UnicodeDecodeError")
            except Exception as e:
                print(e)
            print("[" + ((n + 1) * "#") + ((len(cwr) - n - 1) * " ") + "]")
        url["crawled"] = True

# Report the unique URLs found, then anything that was not decodable text
urllist = []
for url in crawllist:
    if url["url"] not in urllist:
        urllist.append(url["url"])
print("URLs:")
for url in urllist:
    print(url)
print("Undecodable:")
for url in list(set(undecodable)):
    print(url)
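For a quick sanity check of the link-extraction pattern on its own, the sketch below runs the same regex against a hand-written HTML fragment; the www.example.com host and the fragment are made up for illustration and are not part of the gist:

    import re
    from urllib.parse import urlparse

    yoursite = "https://www.example.com/"  # hypothetical site for the demo
    html = ('<a href="https://www.example.com/about.html">About</a>'
            '<img src="/img/logo.png">')
    # Same pattern the crawler builds: attribute, optional scheme and host,
    # then a captured path of 2-52 word/slash/dot characters
    pattern = (r"(?i)(?::|src|href|background)(?: ?=)? ?\"(?:http|https)?(?::\/\/"
               + re.escape(urlparse(yoursite).netloc)
               + r")?(\/?[\w\d\/\.]{2}[\w\d\/\.]{0,50})\"")
    print(re.findall(pattern, html))  # ['/about.html', '/img/logo.png']

Because the host part of the pattern is optional, both absolute same-host URLs and relative paths come back as bare path captures, which is why the crawler needs its three fallback joins; urllib.parse.urljoin would be the more standard way to resolve relative links.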