@chidimo
Last active February 4, 2018 07:14
Functions to index the web. To use, modify seedPage and n: the seed page and the depth of the crawl. The output is written to text files as a Python dictionary dumped as JSON (a short sketch of loading it back follows the script below).
from collections import OrderedDict
from operator import itemgetter
import pprint
import requests
import bs4
import json
import time
import re
# returns a BeautifulSoup object of a webpage together with
# a dictionary of error counts; returns None if the request fails
def makeSoup(pageUrl):
    connTimeout = 10.0
    readTimeout = 10.0
    errorTrack = {'CECount': 0, 'TECount': 0, 'HECount': 0, 'MSECount': 0,
                  'IUECount': 0, 'RTECount': 0, 'TMRECount': 0}
    try:
        r = requests.get(pageUrl, timeout=(connTimeout, readTimeout))
        r.raise_for_status()
        return bs4.BeautifulSoup(r.text, 'html.parser'), errorTrack
    except requests.exceptions.ConnectTimeout:
        errorTrack['TECount'] += 1
    except requests.exceptions.ReadTimeout:
        errorTrack['RTECount'] += 1
    except requests.exceptions.ConnectionError:
        errorTrack['CECount'] += 1
    except requests.exceptions.HTTPError:
        errorTrack['HECount'] += 1
    except requests.exceptions.MissingSchema:
        errorTrack['MSECount'] += 1
    except requests.exceptions.InvalidURL:
        errorTrack['IUECount'] += 1
    except requests.exceptions.TooManyRedirects:
        errorTrack['TMRECount'] += 1
    return None
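# Note on the return shape: on success makeSoup returns a (soup, errorTrack)
# pair; on failure it returns a bare None, so callers that unpack two values
# (e.g. pageSoup, errorTrack = makeSoup(url)) raise a TypeError, which they
# catch below to treat the page as unreachable.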
# returns all href links on a page
def allLinks(pageUrl):
    try:
        pageSoup, errorTrack = makeSoup(pageUrl)
        return pageSoup.findAll('a', href=True)
    except TypeError:
        return None
# returns all http links on a page
def crawlableLinks(pageUrl):
    links = allLinks(pageUrl)
    goodLinks = []
    try:
        for link in links:
            dest = link.get('href')
            if dest.startswith('http'):
                goodLinks.append(dest)
        return goodLinks
    except TypeError:
        return None
# returns all http links on a page
# together with their associated contents
def crawlableLinksWithCont(pageUrl):
    links = allLinks(pageUrl)
    goodLinks = []
    goodLinksText = []
    for link in links:
        cont = link.contents
        dest = link.get('href')
        if dest.startswith('http'):
            goodLinks.append(dest)
            goodLinksText.append(cont)
    return goodLinks, goodLinksText
# returns the index of a word in a list
def findWord(someList, word):
    try:
        return someList.index(word)
    except ValueError:
        return -1
# returns the index of a word in a list (alternative implementation)
def findWord(someList, word):
    if word in someList:
        return someList.index(word)
    else:
        return -1
# merges two lists: common elements are not replicated
def Union(list1, list2):
    try:
        for elem in list2:
            if elem not in list1:
                list1.append(elem)
        return list1
    except TypeError:
        return list1
# returns a list of all words in paragraph tags
def wordList(pageUrl):
    try:
        pageSoup, errorTrack = makeSoup(pageUrl)
        parText = pageSoup.findAll('p')
        pars = []
        for par in parText:
            indText = par.text
            for word in re.split(r'[;,\*\n\. ]', indText):
            # for word in indText.split():
                pars.append(word)
        return pars
    except TypeError:
        return None
# Indexing a word: dictionary implementation
# no replication of links
def addToIndex(webIndex, keyword, pageUrl):
    if keyword in webIndex:
        if pageUrl in webIndex[keyword]:
            return
        webIndex[keyword].append(pageUrl)
        return
    webIndex[keyword] = [pageUrl]
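# Illustration (hypothetical URLs): starting from idx = {}, after
#   addToIndex(idx, 'python', 'http://a.example')
#   addToIndex(idx, 'python', 'http://b.example')
#   addToIndex(idx, 'python', 'http://a.example')   # duplicate, ignored
# idx == {'python': ['http://a.example', 'http://b.example']}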
# Indexing the contents of a webpage
def addPageToIndex(webIndex, pageUrl):
    allWords = wordList(pageUrl)
    try:
        for word in allWords:
            addToIndex(webIndex, word, pageUrl)
    except TypeError:
        return None
def buildIndex(pageUrl, n):
    toCrawl = [pageUrl]
    crawled = []
    webIndex = {}
    linkGraph = {}
    badLinks = []
    for j in range(n):
        print(len(toCrawl), ' links being indexed')
        nextToCrawl = []    # frontier for the next crawl round
        for i in range(len(toCrawl)):
            print('step: ', i + 1, ' of run ', j + 1)
            if toCrawl[i] in crawled:
                continue
            outLinks = crawlableLinks(toCrawl[i])
            if outLinks is None:
                badLinks.append(toCrawl[i])
                continue
            Union(nextToCrawl, outLinks)
            linkGraph[toCrawl[i]] = outLinks
            addPageToIndex(webIndex, toCrawl[i])
            crawled.append(toCrawl[i])
        toCrawl = nextToCrawl
        print(len(nextToCrawl), 'links available for next crawl')
    return webIndex, linkGraph, badLinks, crawled
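# A rough sense of the return values (hypothetical seed page):
#   webIndex, linkGraph, badLinks, crawled = buildIndex('http://example.com', 2)
#   webIndex  maps each word to the list of urls whose <p> text contains it
#   linkGraph maps each crawled url to its outgoing http links
#   badLinks  lists urls that could not be fetched or parsed
#   crawled   lists urls that were successfully indexed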
def computeRanks(linkGraph):
    d = 0.8         # damping constant
    numLoops = 10   # I will modify this
    ranks = {}
    nPages = len(linkGraph)
    for page in linkGraph:
        ranks[page] = 1.0 / nPages
    for i in range(0, numLoops):
        newRanks = {}
        for page in linkGraph:
            newRank = (1 - d) / nPages
            for node in linkGraph:
                if page in linkGraph[node]:
                    newRank = newRank + d * (ranks[node] / len(linkGraph[node]))
            newRanks[page] = newRank
        ranks = newRanks
    return ranks
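# Qualitative check on a toy graph (hypothetical urls): with
#   linkGraph = {'http://a': ['http://b'], 'http://b': ['http://a'], 'http://c': ['http://a']}
# computeRanks gives 'http://a' the highest rank, since both other pages link to it,
# and 'http://c' the lowest, since nothing links to it.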
def lookUpBest(webIndex, ranks, keyword):
    matches = {}
    if keyword in webIndex:
        for url in webIndex[keyword]:
            matches[url] = ranks[url]
        return dictSortByValue(matches)
    return None
# sorts a dictionary by value and returns its keys, highest value first
def dictSortByValue(someDict):
    if someDict is None:
        return None
    order = OrderedDict(sorted(someDict.items(), key=itemgetter(1)))
    keyList = list(order.keys())
    keyList.reverse()
    return keyList
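# Illustration: dictSortByValue({'a': 1, 'b': 3, 'c': 2}) returns ['b', 'c', 'a'],
# i.e. keys ordered from the highest value to the lowest.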
t1 = time.perf_counter()    # time.clock() was removed in Python 3.8
seedPage = 'http://coursera.org'
n = 2
Index, Graph, BadLink, crawledUrls = buildIndex(seedPage, n)
print('Done Indexing')
print('Indexing time: ', time.perf_counter() - t1, 'sec')
print(len(Index), ' words indexed')
print(len(crawledUrls), ' good links indexed')
print(len(BadLink), ' bad links')
print('Now computing url ranks')
t2 = time.perf_counter()
ranking = computeRanks(Graph)
print('Ranks computed in ', time.perf_counter() - t2, 'sec')

fHand = open('webIndex.txt', 'w')
json.dump(Index, fHand)
fHand.close()

fHand2 = open('linkGraph.txt', 'w')
json.dump(Graph, fHand2)
fHand2.close()

while True:
    searchTerm = input('\nEnter search term: ')
    if not searchTerm:
        break
    t3 = time.perf_counter()
    results = lookUpBest(Index, ranking, searchTerm)
    if results is None:
        print('No results found for your search')
        continue
    print('\n')
    print(len(results), 'matches found in ', time.perf_counter() - t3, ' sec')
    pprint.pprint(results)
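As noted in the description, the index ends up in a text file as JSON. A minimal sketch of loading webIndex.txt back into a dictionary in a separate session (the file name matches the script above; the lookup word is hypothetical):

import json

with open('webIndex.txt') as fHand:
    webIndex = json.load(fHand)       # {word: [urls containing the word]}
print(len(webIndex), 'words in the saved index')
print(webIndex.get('python', []))     # urls whose paragraphs mention 'python'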