#!/usr/bin/env python
# Simple breadth-first Wikipedia crawler (Python 2).
import requests
import urlparse
from bs4 import BeautifulSoup
from collections import deque
import time
START_LINK = 'https://en.wikipedia.org/wiki/Sustainable_energy'
BASE_URL = 'https://en.wikipedia.org'
def crawling_init2():
    global urlcounter
    global depth
    urlcounter = 0
    depth = 1
    q = deque()          # frontier of URLs still to visit
    visited = deque()    # URLs already fetched
    q.append(START_LINK)
    # Breadth-first crawl: stop after 1000 URLs or when depth reaches 5.
    while urlcounter < 1000 and depth < 5 and len(q):
        temp = q.popleft()
        visited.append(temp)
        urlcounter = urlcounter + 1
        sumer = 0
        tempo = []
        links = parse(temp, urlcounter, visited)
        a = []
        # Queue every link from the current page that is not already queued.
        for link in links:
            if link not in q:
                a.append(link)
                q.append(link)
        sumer = sumer + len(a)
        print "SUM" + str(sumer)
        # Fetch each newly queued link while the 1000-page budget allows.
        for link in a:
            urlcounter = urlcounter + 1
            if urlcounter < 1000:
                links = parse(link, urlcounter, visited)
                tempo.append(links)
        sumer = sumer + len(tempo)
        print "SUM" + str(sumer)
        print "URL-COUNTER" + str(urlcounter)
        if urlcounter > sumer:
            depth = depth + 1
            print "depth" + str(depth)
def parse(url, urlcounter, visited):
    # time.sleep(2)  # optional politeness delay between requests
    print str(urlcounter) + "--" + url
    # Log every fetched URL.
    with open('data/All_URL.txt', 'a') as ff:
        ff.write(url + "\n")
    res = requests.get(url, headers={"user-agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.content, 'html.parser')
    # Save the raw page content for the corpus.
    with open('data/corpus/content' + str(urlcounter) + ".txt", 'wb') as fa:
        fa.write(res.content)
    All_links = soup.find_all('a', href=True)
    tempLinks = []
    for link in All_links:
        tempLinks.append(link['href'])
    tempset = []
    # Keep only relative /wiki/ article links: no namespaces (":"), no fragments.
    with open('data/link/links' + str(urlcounter) + ".txt", 'wb') as fb:
        for link in tempLinks:
            if "/wiki/" in link and ":" not in link and urlparse.urldefrag(link)[1] == '' and link.split("/")[1] == "wiki":
                absolute_url = urlparse.urljoin(BASE_URL, link)
                if absolute_url not in tempset and absolute_url not in visited:
                    tempset.append(absolute_url)
                    fb.write(absolute_url + "\n")
    return tempset
if __name__ == "__main__":
    crawling_init2()
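
Note: the script appends to data/All_URL.txt and writes into data/corpus/ and data/link/, but it never creates those directories, so they must exist before the crawler runs. A minimal setup sketch is shown below; the directory layout is taken from the open() calls above, and running this once beforehand is an assumption, not part of the original gist.

import os

# Create the output directories the crawler expects (assumed one-time setup).
for d in ('data', 'data/corpus', 'data/link'):
    if not os.path.exists(d):
        os.makedirs(d)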