wikipedia crawler and downloader
#!/usr/bin/python3
'''
This script is a web crawler (spider) that recursively searches Wikipedia
pages on a given keyword and keeps only the pages whose content section
contains relevant information on the topic. It creates a directory under the
present working directory and saves the web pages there for further reading.
It also builds a graph representation of the dependencies among the web pages
to show the user how the pages and fields are interlinked.
Runs on Python 3.x. Developed on Linux.
'''
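# Usage sketch (the filename below is an assumption; the gist does not name
# the file): run the script with Python 3 and answer the two prompts, e.g.
#
#   $ python3 wikicrawler.py
#   Enter the Search String :graph theory
#   Enter Search Depth :1
#
# The fetched pages are saved as .html files inside a directory named after
# the search string (e.g. graph_theory/).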
__author__ = 'Ganesh Prasad Sahoo, email: sir.gnsp@gmail.com'
__version__ = '0.1 beta'
import http.client as httplib
import re
import os
import optparse
import urllib.request as urllib2
import urllib.parse as urlparse
import json
import threading
MULTIPLE_RESULTS_SET = False
class wikiCrawler:

    def __init__(self):
        self._visited = set()
        self._topic = input("Enter the Search String :")
        self._dir_name = '_'.join(self._topic.split())
        self._search_depth = int(input("Enter Search Depth :"))
        self._unnamed_counter = 0
        self._url = "http://en.wikipedia.org"
    def getURL(self):
        # Ask Google's AJAX Search API for pages on the topic, restricted to
        # en.wikipedia.org, and record the first hit as the seed URL.
        srch_str = '+'.join(self._topic.split()) + '+inurl:en.wikipedia.org'
        query = urlparse.urlencode({'q': srch_str})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
        search_response = urllib2.urlopen(url)
        search_results = search_response.read().decode()
        print(search_results)
        results = json.loads(search_results)
        data = results['responseData']
        hits = data['results']
        if len(hits) > 0:
            self._url = hits[0]['url']
        return [str(h['url']) for h in hits]
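    # The Google AJAX Search API queried in getURL() has since been retired by
    # Google, so the request above no longer returns results. The method below
    # is a hedged alternative sketch (an addition, not part of the original
    # script, and not wired into run()): it looks up seed URLs through
    # Wikipedia's own OpenSearch endpoint instead.
    def getURLFromWikipedia(self):
        query = urlparse.urlencode({'action': 'opensearch',
                                    'search': self._topic,
                                    'limit': 10,
                                    'format': 'json'})
        url = 'https://en.wikipedia.org/w/api.php?%s' % query
        data = json.loads(urllib2.urlopen(url).read().decode())
        # An OpenSearch response is [query, titles, descriptions, urls];
        # data[3] holds the matching page URLs.
        return [str(u) for u in data[3]]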
    def initDirectory(self):
        os.mkdir(self._dir_name)
        os.chdir(self._dir_name)
    def saveData(self, filename, content):
        # Write the page to disk; explicit utf-8 avoids platform-dependent
        # encoding errors on non-ASCII pages.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
    def getMainDiv(self, content):
        # Extract the main article body: locate the opening of the
        # "mw-content-text" div, then scan forward while tracking div nesting
        # until its matching closing tag is reached.
        start = content.find('div id="mw-content-text"')
        if start == -1:
            return ''
        counter = 0
        index = start + len('div id="mw-content-text"')
        while counter > -1 and index < len(content):
            if content[index:index+5] == '<div ':
                counter += 1
                index += 5
            elif content[index:index+5] == '</div':
                counter -= 1
                index += 5
            else:
                index += 1
        return content[start:index]
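    # A small illustration of what getMainDiv() returns, on a hypothetical
    # snippet (not taken from a real Wikipedia page):
    #
    #   html = '<div id="mw-content-text"><div class="x">a</div>b</div><div>tail</div>'
    #   self.getMainDiv(html)
    #   -> 'div id="mw-content-text"><div class="x">a</div>b</div'
    #
    # i.e. it keeps the main content div (nested divs included), stops at the
    # matching closing tag, and drops everything after it.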
    def crawlWiki(self, url, depth=1):
        # Fetch the page, save it to disk, then follow the links found in its
        # main content div, running up to 50 child crawls per page in threads.
        # (Note: Wikipedia now redirects plain http:// requests to https, so
        # pages fetched this way may only contain the redirect response.)
        if url.startswith("http://") and url not in self._visited:
            self._visited.add(url)
            url = url.replace("http://", "", 1)
            print('visiting :' + url)
            host = url
            path = "/"
            filename = ''
            urlSegments = url.split("/")
            if len(urlSegments) > 1:
                host = urlSegments[0]
                path = url.replace(host, "", 1)
                filename = urlSegments[-1].split('.')[0] + '.html'
            else:
                filename = 'default_' + str(self._unnamed_counter) + '.html'
                self._unnamed_counter += 1
            conn = httplib.HTTPConnection(host)
            conn.request("GET", path)
            try:
                res = conn.getresponse()
            except Exception:
                return
            try:
                content = res.read().decode()
            except Exception:
                return
            self.saveData(filename, content)
            m = re.findall('href="(.*?)"', self.getMainDiv(content))
            threadList = []
            for link in m:
                if link.startswith('/'):
                    link = 'http://' + host + link
                if depth > 0:
                    if len(threadList) < 50:
                        t = threading.Thread(target=self.crawlWiki, args=(link, depth-1))
                        t.daemon = True
                        threadList.append(t)
                        t.start()
                    else:
                        self.crawlWiki(link, depth-1)
            for t in threadList:
                t.join()
        else:
            # url is not http, or has already been visited
            pass
    def finalizeDirectory(self):
        os.chdir('..')
    def run(self):
        self.initDirectory()
        crawlList = self.getURL()
        if MULTIPLE_RESULTS_SET:
            # Crawl every search hit, each in its own thread.
            threads = []
            for url in crawlList:
                t = threading.Thread(target=self.crawlWiki, args=(url, self._search_depth))
                t.daemon = True
                threads.append(t)
                t.start()
            for t in threads:
                t.join()
        else:
            # Crawl only the top search hit.
            self.crawlWiki(crawlList[0], self._search_depth)
        self.finalizeDirectory()
if __name__ == '__main__':
    app = wikiCrawler()
    app.run()