Python web crawler I wrote to learn Python
# This crawler grabs literally every link it finds and loops over all of them.
# Crashes after a while, no idea why.
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re
from urllib.parse import urlparse

website = "https://paranormalthoughtspodcast.wordpress.com/"
crawl = [website]   # frontier of hosts still to visit; grows while the loop below iterates it
crawled = []        # hosts already visited

def getLinks(url):
    try:
        html_page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}))
    except Exception as exc:
        print("oops:", exc)
        return
    soup = BeautifulSoup(html_page, "html5lib")
    # Reduce every absolute http(s) link on the page to its hostname.
    for link in soup.find_all('a', attrs={'href': re.compile("^(http|https)://")}):
        obj = urlparse(link.get('href'))
        linkformat = 'https://{0}'.format(obj.hostname)
        if linkformat not in crawl:
            crawl.append(linkformat)

for i in crawl:
    print(i)
    if i not in crawled:
        getLinks(i)
        crawled.append(i)
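The "crashes after a while" is hard to pin down from the code alone; two plausible culprits are requests that hang or fail mid-read (urlopen is called without a timeout) and the ever-growing lists making the `not in` checks slower as the crawl goes on. The sketch below is not the original code: it assumes the same seed URL and adds a socket timeout, a set for O(1) visited checks, and an arbitrary page cap so the crawl always terminates.

# A minimal, more defensive variant (a sketch, not the gist author's code).
from collections import deque
from urllib.request import urlopen, Request
from urllib.parse import urlparse
import re

from bs4 import BeautifulSoup

START = "https://paranormalthoughtspodcast.wordpress.com/"  # same seed as above
MAX_PAGES = 100          # arbitrary cap so the crawl always terminates
frontier = deque([START])
visited = set()          # set membership is O(1), unlike the lists above

while frontier and len(visited) < MAX_PAGES:
    url = frontier.popleft()
    if url in visited:
        continue
    visited.add(url)
    print(url)
    try:
        # timeout keeps a single dead server from stalling the whole crawl
        page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}), timeout=10)
    except Exception as exc:
        print("skipping", url, "->", exc)
        continue
    soup = BeautifulSoup(page, "html5lib")
    for a in soup.find_all('a', attrs={'href': re.compile(r"^https?://")}):
        host = urlparse(a.get('href')).hostname
        if host:
            frontier.append('https://{0}'.format(host))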
# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     links = []
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         links.append(link.get('href'))
#     return links
# for i in links:
#     print(i)
#     newlinks = getLinks(i)
#     lijst[i] = False
#     for j in newlinks:
#         lijst[j] = False
#         print(j)
#     for k in links
#         toparse = urlparse(k)
#         parsed = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
#         print(k)

# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# links = {website: False}
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         key = link.get('href')
#         if key not in links:
#             links[key] = False
#     return links
# for key, crawled in links.items():
#     if not crawled:
#         print("Crawling: " + key)
#         getLinks(key)
#         links[key] = True
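# Note on the dict attempt above: in Python 3 it raises
# "RuntimeError: dictionary changed size during iteration", because
# getLinks() adds new keys to `links` while the for-loop is still walking
# links.items(). Appending to a list during iteration is allowed (the
# version below depends on that); growing a dict during iteration is not.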
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

website = "http://arstechnica.com"
links = [website]   # iterating a list while appending to it is legal and keeps the loop going
crawled = []        # URLs already visited

def getLinks(url):
    html_page = urlopen(url)
    soup = BeautifulSoup(html_page, "html.parser")
    for link in soup.find_all('a', attrs={'href': re.compile("^(http|https)://")}):
        links.append(link.get('href'))

for i in links:
    print(i)
    if i not in crawled:
        getLinks(i)    # extends the global links list in place
        crawled.append(i)