@GnsP
Created March 14, 2015 06:00
wikipedia crawler and downloader
#!/usr/bin/python3
'''
This script is a web crawler (spider) that recursively searches Wikipedia pages
for a given keyword and keeps only the pages whose content section holds
relevant information on that topic. It creates a directory under the present
working directory and saves the pages there for further reading, and it builds
a graph representation of the dependencies among the pages to show the user how
the pages and fields are interlinked.
Runs on Python 3.x. Natively built on Linux. '''
__author__ = 'Ganesh Prasad Sahoo, email: sir.gnsp@gmail.com'
__version__ = '0.1 beta'
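# A rough sketch of a typical run (the script name and the sample input are
# illustrative only; the pages actually fetched depend on the live search
# results):
#
#   $ ./wikicrawler.py
#   Enter the Search String :alan turing
#   Enter Search Depth :1
#
# This creates a directory ./alan_turing/ under the current working directory
# and saves each visited Wikipedia page into it as an .html file.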
import http.client as httplib
import re
import os
import urllib.request as urllib2
import urllib.parse as urlparse
import json
import threading
# When True, every search hit is crawled in parallel; otherwise only the top hit.
MULTIPLE_RESULTS_SET = False
class wikiCrawler:
    def __init__(self):
        self._visited = set()
        self._topic = input("Enter the Search String :")
        self._dir_name = '_'.join(self._topic.split())
        self._search_depth = int(input("Enter Search Depth :"))
        self._unnamed_counter = 0
        self._url = "http://en.wikipedia.org"
    def getURL(self):
        # Query the Google AJAX web search API for the topic, restricted to
        # en.wikipedia.org, and take the top hit as the starting URL.
        srch_str = '+'.join(self._topic.split()) + '+inurl:en.wikipedia.org'
        query = urlparse.urlencode({'q': srch_str})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
        search_response = urllib2.urlopen(url)
        search_results = search_response.read().decode()
        print(search_results)
        results = json.loads(search_results)
        data = results['responseData']
        hits = data['results']
        if len(hits) > 0:
            self._url = hits[0]['url']
        return [str(h['url']) for h in hits]
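    # Inferred from the keys accessed above: the search response is expected to
    # be JSON shaped roughly like
    #   {"responseData": {"results": [{"url": "http://en.wikipedia.org/..."}, ...]}}
    # and only the 'url' field of each hit is used.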
    def initDirectory(self):
        # Create (if needed) and enter the output directory named after the topic.
        os.makedirs(self._dir_name, exist_ok=True)
        os.chdir(self._dir_name)
    def saveData(self, filename, content):
        with open(filename, "w") as f:
            f.write(content)
    def getMainDiv(self, content):
        # Extract the main article markup by scanning from the
        # 'mw-content-text' div and tracking the nesting depth of <div> tags
        # until the matching closing tag is found.
        start = content.find('div id="mw-content-text"')
        if start == -1:
            return ''
        counter = 0
        index = start + 23
        while counter > -1 and index < len(content):
            if content[index:index + 5] == '<div ':
                counter += 1
                index += 5
            elif content[index:index + 5] == '</div':
                counter -= 1
                index += 5
            else:
                index += 1
        return content[start:index]
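    # Worked example of the scan above: in a fragment like
    #   <div id="mw-content-text"><div >a</div>b</div>
    # the counter starts at 0 just inside the opening tag, rises to 1 at the
    # nested '<div ', falls back to 0 at that div's '</div', and the loop stops
    # when the outer '</div' takes it to -1, so the returned slice spans
    # essentially the whole mw-content-text div.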
    def crawlWiki(self, url, depth=1):
        # Fetch the page at url, save it locally, then follow the links found
        # in its main content div, recursing until depth is exhausted.
        if url.startswith("http://") and url not in self._visited:
            self._visited.add(url)
            url = url.replace("http://", "", 1)
            print('visiting :' + url)
            host = url
            path = "/"
            filename = ''
            urlSegments = url.split("/")
            if len(urlSegments) > 1:
                host = urlSegments[0]
                path = url.replace(host, "", 1)
                filename = urlSegments[-1].split('.')[0] + '.html'
            else:
                filename = 'default_' + str(self._unnamed_counter) + '.html'
                self._unnamed_counter += 1
            conn = httplib.HTTPConnection(host)
            try:
                conn.request("GET", path)
                res = conn.getresponse()
            except Exception:
                return
            try:
                content = res.read().decode()
            except Exception:
                return
            self.saveData(filename, content)
            m = re.findall('href="(.*?)"', self.getMainDiv(content))
            threadList = []
            for link in m:
                if link.startswith('/'):
                    link = 'http://' + host + link
                if depth > 0:
                    if len(threadList) < 50:
                        t = threading.Thread(target=self.crawlWiki,
                                             args=(link, depth - 1))
                        t.daemon = True
                        threadList.append(t)
                        t.start()
                    else:
                        self.crawlWiki(link, depth - 1)
            for t in threadList:
                t.join()
        else:
            # url is not an http:// link, or it has already been visited
            pass
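    # Note on the threading used above: each page spawns at most 50 daemon
    # threads for its outgoing links and crawls any remaining links in the
    # current thread; the spawned threads are joined before crawlWiki returns,
    # so a crawl at a given depth finishes only after its sub-crawls do.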
    def finalizeDirectory(self):
        os.chdir('..')
    def run(self):
        self.initDirectory()
        crawlList = self.getURL()
        if MULTIPLE_RESULTS_SET:
            # Crawl every search hit, each in its own daemon thread.
            threads = []
            for url in crawlList:
                t = threading.Thread(target=self.crawlWiki,
                                     args=(url, self._search_depth))
                t.daemon = True
                threads.append(t)
                t.start()
            for t in threads:
                t.join()
        else:
            # Crawl only the top search hit, if the search returned anything.
            if crawlList:
                self.crawlWiki(crawlList[0], self._search_depth)
        self.finalizeDirectory()
if __name__ == '__main__':
    app = wikiCrawler()
    app.run()