Python webcrawler I wrote to learn Python
# this crawler literally grabs every link and loops over them
# crashed after a while, no idea why
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re
from urllib.parse import urlparse

website = "https://paranormalthoughtspodcast.wordpress.com/"
crawl = [website]
crawled = []


def getLinks(url):
    # fetch the page; skip it quietly if the request fails
    try:
        html_page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}))
    except Exception:
        print("oops")
        return
    soup = BeautifulSoup(html_page, "html5lib")
    # reduce every absolute http(s) link to its host and queue unseen hosts
    for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
        obj = urlparse(link.get('href'))
        linkformat = 'https://{0}'.format(obj.hostname)
        if linkformat not in crawl:
            crawl.append(linkformat)


for i in crawl:
    print(i)
    if i not in crawled:
        getLinks(i)
        crawled.append(i)
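
# The crawler above does run, but as the comment says it eventually crashes.
# Two plausible culprits (guesses, not confirmed by the gist): the response is
# read and parsed outside the try block, so a network error during parsing is
# not caught, and the crawl queue grows without bound. The sketch below is one
# way to harden the same host-by-host crawl; the crawl_hosts name, the
# max_pages limit and the timeout are assumptions, not part of the original.
import re
from collections import deque
from urllib.parse import urlparse
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup


def crawl_hosts(start_url, max_pages=50):
    queue = deque([start_url])    # URLs still to visit
    seen = {start_url}            # every URL ever queued, to avoid repeats
    while queue and len(seen) <= max_pages:
        url = queue.popleft()
        print(url)
        try:
            page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}), timeout=10)
            soup = BeautifulSoup(page, "html5lib")
        except Exception as err:  # broad on purpose, like the bare except above
            print("skipping", url, "-", err)
            continue
        # reduce every absolute http(s) link to its host, as the crawler above does
        for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
            host = urlparse(link.get('href')).hostname
            if host is None:
                continue
            root = 'https://{0}'.format(host)
            if root not in seen:
                seen.add(root)
                queue.append(root)


# crawl_hosts(website)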
# earlier attempt, kept commented out (links and lijst are used at module
# level but never defined there):
# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     links = []
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         links.append(link.get('href'))
#     return links
# for i in links:
#     print(i)
#     newlinks = getLinks(i)
#     lijst[i] = False
#     for j in newlinks:
#         lijst[j] = False
#         print(j)
# for k in links:
#     toparse = urlparse(k)
#     parsed = '{uri.scheme}://{uri.netloc}/'.format(uri=toparse)
#     print(k)
# second earlier attempt, also kept commented out:
# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# links = {website: False}
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         key = link.get('href')
#         if key not in links:
#             links[key] = False
#     return links
# for key, crawled in links.items():
#     if not crawled:
#         print("Crawling: " + key)
#         getLinks(key)
#         links[key] = True
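
# The dict-based attempt above adds new keys to links inside getLinks while
# the for loop is iterating over links.items(); in Python 3 that raises
# "RuntimeError: dictionary changed size during iteration" as soon as a new
# key appears. A minimal sketch of the same crawled-flag idea that avoids the
# problem by picking an uncrawled key each round instead of iterating the dict
# directly; the fetch_hrefs helper and the max_pages limit are assumptions,
# not part of the original gist.
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup


def fetch_hrefs(url):
    # return every absolute http(s) href on the page
    soup = BeautifulSoup(urlopen(url), "html5lib")
    return [a.get('href') for a in
            soup.findAll('a', attrs={'href': re.compile("^(http|https)://")})]


def crawl_with_flags(start_url, max_pages=20):
    links = {start_url: False}          # href -> already crawled?
    while len(links) <= max_pages:
        todo = [url for url, done in links.items() if not done]
        if not todo:                    # nothing left to crawl
            break
        url = todo[0]
        print("Crawling:", url)
        try:
            for href in fetch_hrefs(url):
                links.setdefault(href, False)
        except Exception as err:        # keep going past bad URLs
            print("skipping", url, "-", err)
        links[url] = True               # mark crawled even if the fetch failed
    return links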
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from urllib.parse import urlparse

website = "http://arstechnica.com"
links = [website]
crawled = []


def getLinks(url):
    html_page = urlopen(url)
    soup = BeautifulSoup(html_page, "html5lib")
    # queue every absolute http(s) link that has not been seen before
    for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
        href = link.get('href')
        if href not in links:
            links.append(href)


for i in links:
    print(i)
    if i not in crawled:
        getLinks(i)
        crawled.append(i)
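
# This version imports urlparse but never uses it, so the crawl follows every
# external link it finds. A minimal sketch of one way to put urlparse to work
# and stay on the starting host; the same_host helper is an assumption, not
# something from the original gist.
from urllib.parse import urlparse


def same_host(href, start=website):
    # compare hostnames, so http://arstechnica.com/gadgets/ is kept while
    # links to other domains are skipped
    return urlparse(href).hostname == urlparse(start).hostname


# usage inside getLinks above, guarding the append:
#     if href not in links and same_host(href):
#         links.append(href)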