@GnsP
Created March 14, 2015 06:00
wikipedia crawler and downloader
#!/usr/bin/python3
'''
This script is a web crawler (spider) that recursively searches Wikipedia pages
for a given keyword and keeps only the pages whose content section holds
relevant information on that topic. It creates a directory under the present
working directory and saves the pages there for further reading, and it builds
a graph representation of the dependencies among the pages to show the user how
the pages and fields are interlinked.
Runs on Python 3.x. Natively built on Linux. '''
__author__ = 'Ganesh Prasad Sahoo, email: sir.gnsp@gmail.com'
__version__ = '0.1 beta'
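# A rough sketch of a typical run (the script name and the sample input are
# illustrative only; the pages actually fetched depend on the live search
# results):
#
#   $ ./wikicrawler.py
#   Enter the Search String :alan turing
#   Enter Search Depth :1
#
# This creates a directory ./alan_turing/ under the current working directory
# and saves each visited Wikipedia page into it as an .html file.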
import http.client as httplib
import re
import os
import urllib.request as urllib2
import urllib.parse as urlparse
import json
import threading
# When True, every search hit is crawled in parallel; otherwise only the top hit.
MULTIPLE_RESULTS_SET = False
class wikiCrawler:
    def __init__(self):
        self._visited = set()
        self._topic = input("Enter the Search String :")
        self._dir_name = '_'.join(self._topic.split())
        self._search_depth = int(input("Enter Search Depth :"))
        self._unnamed_counter = 0
        self._url = "http://en.wikipedia.org"
    def getURL(self):
        # Query the Google AJAX web search API for the topic, restricted to
        # en.wikipedia.org, and take the top hit as the starting URL.
        srch_str = '+'.join(self._topic.split()) + '+inurl:en.wikipedia.org'
        query = urlparse.urlencode({'q': srch_str})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
        search_response = urllib2.urlopen(url)
        search_results = search_response.read().decode()
        print(search_results)
        results = json.loads(search_results)
        data = results['responseData']
        hits = data['results']
        if len(hits) > 0:
            self._url = hits[0]['url']
        return [str(h['url']) for h in hits]
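    # Inferred from the keys accessed above: the search response is expected to
    # be JSON shaped roughly like
    #   {"responseData": {"results": [{"url": "http://en.wikipedia.org/..."}, ...]}}
    # and only the 'url' field of each hit is used.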
    def initDirectory(self):
        # Create (if needed) and enter the output directory named after the topic.
        os.makedirs(self._dir_name, exist_ok=True)
        os.chdir(self._dir_name)
    def saveData(self, filename, content):
        with open(filename, "w") as f:
            f.write(content)
    def getMainDiv(self, content):
        # Extract the main article markup by scanning from the
        # 'mw-content-text' div and tracking the nesting depth of <div> tags
        # until the matching closing tag is found.
        start = content.find('div id="mw-content-text"')
        if start == -1:
            return ''
        counter = 0
        index = start + 23
        while counter > -1 and index < len(content):
            if content[index:index + 5] == '<div ':
                counter += 1
                index += 5
            elif content[index:index + 5] == '</div':
                counter -= 1
                index += 5
            else:
                index += 1
        return content[start:index]
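    # Worked example of the scan above: in a fragment like
    #   <div id="mw-content-text"><div >a</div>b</div>
    # the counter starts at 0 just inside the opening tag, rises to 1 at the
    # nested '<div ', falls back to 0 at that div's '</div', and the loop stops
    # when the outer '</div' takes it to -1, so the returned slice spans
    # essentially the whole mw-content-text div.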
    def crawlWiki(self, url, depth=1):
        # Fetch the page at url, save it locally, then follow the links found
        # in its main content div, recursing until depth is exhausted.
        if url.startswith("http://") and url not in self._visited:
            self._visited.add(url)
            url = url.replace("http://", "", 1)
            print('visiting :' + url)
            host = url
            path = "/"
            filename = ''
            urlSegments = url.split("/")
            if len(urlSegments) > 1:
                host = urlSegments[0]
                path = url.replace(host, "", 1)
                filename = urlSegments[-1].split('.')[0] + '.html'
            else:
                filename = 'default_' + str(self._unnamed_counter) + '.html'
                self._unnamed_counter += 1
            conn = httplib.HTTPConnection(host)
            try:
                conn.request("GET", path)
                res = conn.getresponse()
            except Exception:
                return
            try:
                content = res.read().decode()
            except Exception:
                return
            self.saveData(filename, content)
            m = re.findall('href="(.*?)"', self.getMainDiv(content))
            threadList = []
            for link in m:
                if link.startswith('/'):
                    link = 'http://' + host + link
                if depth > 0:
                    if len(threadList) < 50:
                        t = threading.Thread(target=self.crawlWiki,
                                             args=(link, depth - 1))
                        t.daemon = True
                        threadList.append(t)
                        t.start()
                    else:
                        self.crawlWiki(link, depth - 1)
            for t in threadList:
                t.join()
        else:
            # url is not an http:// link, or it has already been visited
            pass
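    # Note on the threading used above: each page spawns at most 50 daemon
    # threads for its outgoing links and crawls any remaining links in the
    # current thread; the spawned threads are joined before crawlWiki returns,
    # so a crawl at a given depth finishes only after its sub-crawls do.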
    def finalizeDirectory(self):
        os.chdir('..')
    def run(self):
        self.initDirectory()
        crawlList = self.getURL()
        if MULTIPLE_RESULTS_SET:
            # Crawl every search hit, each in its own daemon thread.
            threads = []
            for url in crawlList:
                t = threading.Thread(target=self.crawlWiki,
                                     args=(url, self._search_depth))
                t.daemon = True
                threads.append(t)
                t.start()
            for t in threads:
                t.join()
        else:
            # Crawl only the top search hit, if the search returned anything.
            if crawlList:
                self.crawlWiki(crawlList[0], self._search_depth)
        self.finalizeDirectory()
if __name__ == '__main__':
    app = wikiCrawler()
    app.run()