Python webcrawler I wrote to learn Python
# this crawler literally grabs every link and loops over them
# crashed after a while, no idea why
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re
from urllib.parse import urlparse

website = "https://paranormalthoughtspodcast.wordpress.com/"
crawl = [website]
crawled = []


def getLinks(url):
    # fetch the page; skip it quietly if the request fails
    try:
        html_page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}))
    except Exception:
        print("oops")
        return
    soup = BeautifulSoup(html_page, "html5lib")
    # reduce every absolute http(s) link to its host and queue unseen hosts
    for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
        obj = urlparse(link.get('href'))
        linkformat = 'https://{0}'.format(obj.hostname)
        if linkformat not in crawl:
            crawl.append(linkformat)


for i in crawl:
    print(i)
    if i not in crawled:
        getLinks(i)
        crawled.append(i)
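
# The crawler above does run, but as the comment says it eventually crashes.
# Two plausible culprits (guesses, not confirmed by the gist): the response is
# read and parsed outside the try block, so a network error during parsing is
# not caught, and the crawl queue grows without bound. The sketch below is one
# way to harden the same host-by-host crawl; the crawl_hosts name, the
# max_pages limit and the timeout are assumptions, not part of the original.
import re
from collections import deque
from urllib.parse import urlparse
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup


def crawl_hosts(start_url, max_pages=50):
    queue = deque([start_url])    # URLs still to visit
    seen = {start_url}            # every URL ever queued, to avoid repeats
    while queue and len(seen) <= max_pages:
        url = queue.popleft()
        print(url)
        try:
            page = urlopen(Request(url, headers={'User-Agent': 'Mozilla'}), timeout=10)
            soup = BeautifulSoup(page, "html5lib")
        except Exception as err:  # broad on purpose, like the bare except above
            print("skipping", url, "-", err)
            continue
        # reduce every absolute http(s) link to its host, as the crawler above does
        for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
            host = urlparse(link.get('href')).hostname
            if host is None:
                continue
            root = 'https://{0}'.format(host)
            if root not in seen:
                seen.add(root)
                queue.append(root)


# crawl_hosts(website)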
# earlier attempt, kept commented out (links and lijst are used at module
# level but never defined there):
# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     links = []
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         links.append(link.get('href'))
#     return links
# for i in links:
#     print(i)
#     newlinks = getLinks(i)
#     lijst[i] = False
#     for j in newlinks:
#         lijst[j] = False
#         print(j)
# for k in links:
#     toparse = urlparse(k)
#     parsed = '{uri.scheme}://{uri.netloc}/'.format(uri=toparse)
#     print(k)
# second earlier attempt, also kept commented out:
# from bs4 import BeautifulSoup
# from urllib.request import urlopen
# import re
# from urllib.parse import urlparse
# website = "http://arstechnica.com"
# links = {website: False}
# def getLinks(url):
#     html_page = urlopen(url)
#     soup = BeautifulSoup(html_page)
#     for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
#         key = link.get('href')
#         if key not in links:
#             links[key] = False
#     return links
# for key, crawled in links.items():
#     if not crawled:
#         print("Crawling: " + key)
#         getLinks(key)
#         links[key] = True
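
# The dict-based attempt above adds new keys to links inside getLinks while
# the for loop is iterating over links.items(); in Python 3 that raises
# "RuntimeError: dictionary changed size during iteration" as soon as a new
# key appears. A minimal sketch of the same crawled-flag idea that avoids the
# problem by picking an uncrawled key each round instead of iterating the dict
# directly; the fetch_hrefs helper and the max_pages limit are assumptions,
# not part of the original gist.
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup


def fetch_hrefs(url):
    # return every absolute http(s) href on the page
    soup = BeautifulSoup(urlopen(url), "html5lib")
    return [a.get('href') for a in
            soup.findAll('a', attrs={'href': re.compile("^(http|https)://")})]


def crawl_with_flags(start_url, max_pages=20):
    links = {start_url: False}          # href -> already crawled?
    while len(links) <= max_pages:
        todo = [url for url, done in links.items() if not done]
        if not todo:                    # nothing left to crawl
            break
        url = todo[0]
        print("Crawling:", url)
        try:
            for href in fetch_hrefs(url):
                links.setdefault(href, False)
        except Exception as err:        # keep going past bad URLs
            print("skipping", url, "-", err)
        links[url] = True               # mark crawled even if the fetch failed
    return links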
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from urllib.parse import urlparse

website = "http://arstechnica.com"
links = [website]
crawled = []


def getLinks(url):
    html_page = urlopen(url)
    soup = BeautifulSoup(html_page, "html5lib")
    # queue every absolute http(s) link that has not been seen before
    for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
        href = link.get('href')
        if href not in links:
            links.append(href)


for i in links:
    print(i)
    if i not in crawled:
        getLinks(i)
        crawled.append(i)
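
# This version imports urlparse but never uses it, so the crawl follows every
# external link it finds. A minimal sketch of one way to put urlparse to work
# and stay on the starting host; the same_host helper is an assumption, not
# something from the original gist.
from urllib.parse import urlparse


def same_host(href, start=website):
    # compare hostnames, so http://arstechnica.com/gadgets/ is kept while
    # links to other domains are skipped
    return urlparse(href).hostname == urlparse(start).hostname


# usage inside getLinks above, guarding the append:
#     if href not in links and same_host(href):
#         links.append(href)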