Last active February 4, 2018 07:14
Functions to index the web. To use, just modify seedPage and n: the seed page and the depth of the index you want. You can view the output in a text file; it is a Python dictionary dumped as JSON.
from collections import OrderedDict
from operator import itemgetter
import pprint
import requests
import bs4
import json
import time
import re
def makeSoup(pageUrl):
    """Fetch ``pageUrl`` and parse the response body with BeautifulSoup.

    Args:
        pageUrl: URL of the page to download.

    Returns:
        A ``(soup, errorTrack)`` tuple on success, where ``soup`` is the
        parsed page and ``errorTrack`` is a dict of error counters (all
        zero on success). ``None`` when the request fails with one of the
        handled ``requests`` exceptions; callers in this file depend on
        that ``None`` to detect a failed fetch.
    """
    connTimeout = 10.0
    readTimeout = 10.0
    errorTrack = dict.fromkeys(
        ('CECount', 'TECount', 'HECount', 'MSECount',
         'IUECount', 'RTECount', 'TMRECount'),
        0,
    )
    try:
        r = requests.get(pageUrl, timeout=(connTimeout, readTimeout))
    except requests.exceptions.ConnectTimeout:
        # Must come before ConnectionError: ConnectTimeout is a subclass of
        # it, so the other order would make this handler unreachable.
        errorTrack['TECount'] += 1
    except requests.exceptions.ConnectionError:
        errorTrack['CECount'] += 1
    except requests.exceptions.HTTPError:
        errorTrack['HECount'] += 1
    except requests.exceptions.MissingSchema:
        errorTrack['MSECount'] += 1
    except requests.exceptions.InvalidURL:
        errorTrack['IUECount'] += 1
    except requests.exceptions.ReadTimeout:
        errorTrack['RTECount'] += 1
    except requests.exceptions.TooManyRedirects:
        errorTrack['TMRECount'] += 1
    else:
        return bs4.BeautifulSoup(r.text, 'html.parser'), errorTrack
    # NOTE: the counters incremented above are local to this call and are
    # dropped here; the None return is kept for backward compatibility.
    return None
def allLinks(pageUrl):
    """Return every ``<a>`` tag carrying an ``href`` attribute on a page.

    Args:
        pageUrl: URL of the page to inspect.

    Returns:
        The result of ``findAll('a', href=True)`` on the parsed page, or
        ``None`` when ``makeSoup`` could not fetch the page.
    """
    fetched = makeSoup(pageUrl)
    if fetched is None:
        # makeSoup signals a failed request with None.
        return None
    pageSoup, _errorTrack = fetched
    return pageSoup.findAll('a', href=True)
def crawlableLinks(pageUrl):
    """Return the ``href`` targets on a page that start with ``'http'``.

    Args:
        pageUrl: URL of the page to inspect.

    Returns:
        A list of link destinations (strings) in document order, possibly
        containing duplicates, or ``None`` when ``allLinks`` returned
        ``None`` for the page.
    """
    links = allLinks(pageUrl)
    if links is None:
        return None
    destinations = (link.get('href') for link in links)
    return [dest for dest in destinations if dest.startswith('http')]
def crawlableLinksWithCont(pageUrl):
    """Return the ``'http'`` links on a page together with their link text.

    Args:
        pageUrl: URL of the page to inspect.

    Returns:
        A ``(goodLinks, goodLinksText)`` pair of parallel lists:
        ``goodLinks[i]`` is an ``href`` starting with ``'http'`` and
        ``goodLinksText[i]`` is the text of the same ``<a>`` tag. Both
        lists are empty when the page could not be fetched.
    """
    links = allLinks(pageUrl)
    goodLinks = []
    goodLinksText = []
    # allLinks returns None for a page that could not be fetched; treat that
    # as "no links" so the two-list return shape is always honoured.
    for link in links or []:
        dest = link.get('href')
        if dest.startswith('http'):
            goodLinks.append(dest)
            # NOTE(review): assumes the associated content is the tag's
            # text (matching the list's name) - confirm against callers.
            goodLinksText.append(link.text)
    return goodLinks, goodLinksText
def findWord(someList, word):
    """Return the index of the first occurrence of ``word`` in ``someList``.

    A second definition of ``findWord`` appears later in this file and
    replaces this one when the module is loaded.

    Args:
        someList: Sequence to search.
        word: Item to look for.

    Returns:
        The zero-based index of ``word``, or ``-1`` when it is absent.
    """
    try:
        return someList.index(word)
    except ValueError:
        # list.index raises ValueError for a missing item.
        return -1
def findWord(someList, word):
    """Return the index of the first occurrence of ``word`` in ``someList``.

    Args:
        someList: Sequence to search.
        word: Item to look for.

    Returns:
        The zero-based index of ``word``, or ``-1`` when it is absent.
    """
    # One scan via index() instead of a membership test followed by index().
    try:
        return someList.index(word)
    except ValueError:
        return -1
def Union(list1, list2):
    """Append to ``list1`` every element of ``list2`` it does not yet hold.

    ``list1`` is modified in place and also returned.

    Args:
        list1: List to extend.
        list2: Iterable of candidate elements. A non-iterable value such
            as ``None`` is tolerated and leaves ``list1`` unchanged.

    Returns:
        ``list1`` (the same object), with the new elements appended in the
        order they appear in ``list2``.
    """
    try:
        for elem in list2:
            if elem not in list1:
                list1.append(elem)
    except TypeError:
        # list2 was not iterable (for example None for a page that could
        # not be crawled); keep whatever list1 already contains.
        pass
    return list1
def wordList(pageUrl):
    """Return the words found inside the ``<p>`` tags of a page.

    Paragraph text is split on ``;``, ``,``, ``*``, ``.``, spaces and
    newlines. Empty fragments produced by adjacent delimiters are dropped
    so they are never indexed as keywords.

    Args:
        pageUrl: URL of the page to read.

    Returns:
        A list of words in document order (duplicates kept), or ``None``
        when ``makeSoup`` could not fetch the page.
    """
    fetched = makeSoup(pageUrl)
    if fetched is None:
        return None
    pageSoup, _errorTrack = fetched
    # Raw string: same character class as before, without the invalid
    # "\*" and "\." string escapes.
    delimiters = re.compile(r'[;,*\n. ]')
    words = []
    for paragraph in pageSoup.findAll('p'):
        words.extend(word for word in delimiters.split(paragraph.text) if word)
    return words
def addToIndex(webIndex, keyword, pageUrl):
    """Record that ``keyword`` occurs on ``pageUrl``.

    The index is a dict mapping each keyword to a list of URLs; a URL is
    stored at most once per keyword.

    Args:
        webIndex: Index dict, modified in place.
        keyword: Word to index.
        pageUrl: URL of the page containing the word.
    """
    urls = webIndex.setdefault(keyword, [])
    if pageUrl not in urls:
        urls.append(pageUrl)
def addPageToIndex(webIndex, pageUrl):
    """Add every word of the page at ``pageUrl`` to ``webIndex``.

    Args:
        webIndex: Index dict, modified in place.
        pageUrl: URL of the page to index.

    Returns:
        Always ``None``. A page for which ``wordList`` returns ``None``
        leaves the index untouched.
    """
    allWords = wordList(pageUrl)
    if allWords is None:
        return None
    for word in allWords:
        addToIndex(webIndex, word, pageUrl)
    return None
def buildIndex(pageUrl, n):
    """Crawl outward from ``pageUrl`` for ``n`` rounds and index each page.

    Each round visits every URL queued by the previous round, records its
    outgoing links, indexes its words and queues the newly found links for
    the next round. Every URL is attempted at most once.

    Args:
        pageUrl: Seed URL where the crawl starts.
        n: Number of crawl rounds (depth).

    Returns:
        A ``(webIndex, linkGraph, badLinks, crawled)`` tuple:
        ``webIndex`` maps keyword -> list of URLs, ``linkGraph`` maps
        URL -> list of outgoing links, ``badLinks`` lists URLs whose links
        could not be retrieved, and ``crawled`` lists the URLs that were
        indexed, in visiting order.
    """
    toCrawl = [pageUrl]
    nextToCrawl = []
    crawled = []
    badLinks = []
    attempted = set()  # O(1) membership test for URLs already tried
    webIndex = {}
    linkGraph = {}
    for j in range(n):
        print(len(toCrawl), ' links being indexed')
        for i, url in enumerate(toCrawl):
            print('step: ', i + 1, ' of run ', j+1)
            if url in attempted:
                continue
            attempted.add(url)
            outLinks = crawlableLinks(url)
            if outLinks is None:
                # crawlableLinks returns None when the page is unreachable.
                badLinks.append(url)
                continue
            Union(nextToCrawl, outLinks)
            linkGraph[url] = outLinks
            addPageToIndex(webIndex, url)
            crawled.append(url)
        print(len(nextToCrawl), 'links available for next crawl')
        # Start the next round with a fresh queue so the list being
        # iterated is never the one being appended to.
        toCrawl, nextToCrawl = nextToCrawl, []
    return webIndex, linkGraph, badLinks, crawled
def computeRanks(linkGraph, d=0.8, numLoops=10):
    """Compute an iterative page rank for every page in ``linkGraph``.

    Every page starts at ``1 / N``. On each iteration a page's new rank is
    ``(1 - d) / N`` plus, for every node linking to it,
    ``d * rank(node) / len(linkGraph[node])``.

    Args:
        linkGraph: Dict mapping each page to the list of pages it links
            to. Link targets must be hashable; targets that are not keys
            of ``linkGraph`` receive no rank.
        d: Damping constant.
        numLoops: Number of iterations to run.

    Returns:
        Dict mapping each page in ``linkGraph`` to its rank (empty for an
        empty graph).
    """
    nPages = len(linkGraph)
    if nPages == 0:
        return {}
    ranks = dict.fromkeys(linkGraph, 1.0 / nPages)
    baseRank = (1 - d) / nPages
    for _ in range(numLoops):
        newRanks = dict.fromkeys(linkGraph, baseRank)
        # Push each node's share along its edges instead of scanning every
        # node for every page.
        for node, outLinks in linkGraph.items():
            if not outLinks:
                continue
            share = d * (ranks[node] / len(outLinks))
            # set(): a target listed several times is credited once, while
            # the divisor above still counts every listed link.
            for target in set(outLinks):
                if target in newRanks:
                    newRanks[target] = newRanks[target] + share
        ranks = newRanks
    return ranks
def lookUpBest(webIndex, ranks, keyword):
    """Return the URLs indexed under ``keyword``, ordered by rank.

    Args:
        webIndex: Dict mapping keyword -> list of URLs.
        ranks: Dict mapping URL -> rank.
        keyword: Search term to look up.

    Returns:
        The list produced by ``dictSortByValue`` for the matching URLs, or
        ``None`` when ``keyword`` is not in the index.
    """
    if keyword not in webIndex:
        return None
    # A URL without a computed rank is treated as rank 0.0 rather than
    # aborting the whole lookup with a KeyError.
    matches = {url: ranks.get(url, 0.0) for url in webIndex[keyword]}
    return dictSortByValue(matches)
def dictSortByValue(someDict):
    """Return the keys of ``someDict`` ordered by ascending value.

    Args:
        someDict: Dict whose values are mutually comparable, or ``None``.

    Returns:
        A list of keys sorted by their values, smallest first (ties keep
        the dict's own order), or ``None`` when ``someDict`` is ``None``.
    """
    if someDict is None:
        return None
    # NOTE(review): the order is ascending, as in the original sort call;
    # confirm whether callers expect the highest value first.
    return [key for key, _value in sorted(someDict.items(), key=itemgetter(1))]
# Script body: crawl from the seed page, rank the pages, dump the index and
# link graph to disk, then answer search queries until an empty line is given.
# time.perf_counter() is used for timing because time.clock() was removed in
# Python 3.8.
t1 = time.perf_counter()
seedPage = ''  # URL where the crawl starts; set this before running
n = 2  # crawl depth
Index, Graph, BadLink, crawledUrls = buildIndex(seedPage, n)
print('Done Indexing')
print('Indexing time: ', time.perf_counter() - t1, 'sec')
print(len(Index), ' words indexed')
print(len(crawledUrls), ' good links indexed')
print(len(BadLink), 'Bad Links: ')
print('Now computing url ranks')
t2 = time.perf_counter()
ranking = computeRanks(Graph)
print('Ranks computed in ', time.perf_counter() - t2, 'sec')
# Context managers make sure both dumps are flushed and closed.
with open('webIndex.txt', 'w') as fHand:
    json.dump(Index, fHand)
with open('linkGraph.txt', 'w') as fHand2:
    json.dump(Graph, fHand2)
while True:
    searchTerm = input('\nEnter search term: ')
    if not searchTerm:
        # An empty line ends the interactive session.
        break
    t3 = time.perf_counter()
    results = lookUpBest(Index, ranking, searchTerm)
    if results is None:
        print('No results found for your search')
        continue
    print(len(results), 'matches found in ', time.perf_counter() - t3, ' sec')
