#!/usr/bin/env python
# Simple breadth-first Wikipedia crawler (Python 2).
import requests
import urlparse
from bs4 import BeautifulSoup
from collections import deque
import time
START_LINK = 'https://en.wikipedia.org/wiki/Sustainable_energy'
BASE_URL = 'https://en.wikipedia.org'
def crawling_init2():
    global urlcounter
    global depth
    urlcounter = 0
    depth = 1
    q = deque()          # frontier of URLs still to visit
    visited = deque()    # URLs already fetched
    q.append(START_LINK)
    # Breadth-first crawl: stop after 1000 URLs or when depth reaches 5.
    while urlcounter < 1000 and depth < 5 and len(q):
        temp = q.popleft()
        visited.append(temp)
        urlcounter = urlcounter + 1
        sumer = 0
        tempo = []
        links = parse(temp, urlcounter, visited)
        a = []
        # Queue every link from the current page that is not already queued.
        for link in links:
            if link not in q:
                a.append(link)
                q.append(link)
        sumer = sumer + len(a)
        print "SUM" + str(sumer)
        # Fetch each newly queued link while the 1000-page budget allows.
        for link in a:
            urlcounter = urlcounter + 1
            if urlcounter < 1000:
                links = parse(link, urlcounter, visited)
                tempo.append(links)
        sumer = sumer + len(tempo)
        print "SUM" + str(sumer)
        print "URL-COUNTER" + str(urlcounter)
        if urlcounter > sumer:
            depth = depth + 1
            print "depth" + str(depth)
def parse(url, urlcounter, visited):
    # time.sleep(2)  # optional politeness delay between requests
    print str(urlcounter) + "--" + url
    # Log every fetched URL.
    with open('data/All_URL.txt', 'a') as ff:
        ff.write(url + "\n")
    res = requests.get(url, headers={"user-agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.content, 'html.parser')
    # Save the raw page content for the corpus.
    with open('data/corpus/content' + str(urlcounter) + ".txt", 'wb') as fa:
        fa.write(res.content)
    All_links = soup.find_all('a', href=True)
    tempLinks = []
    for link in All_links:
        tempLinks.append(link['href'])
    tempset = []
    # Keep only relative /wiki/ article links: no namespaces (":"), no fragments.
    with open('data/link/links' + str(urlcounter) + ".txt", 'wb') as fb:
        for link in tempLinks:
            if "/wiki/" in link and ":" not in link and urlparse.urldefrag(link)[1] == '' and link.split("/")[1] == "wiki":
                absolute_url = urlparse.urljoin(BASE_URL, link)
                if absolute_url not in tempset and absolute_url not in visited:
                    tempset.append(absolute_url)
                    fb.write(absolute_url + "\n")
    return tempset
if __name__ == "__main__":
    crawling_init2()
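
Note: the script appends to data/All_URL.txt and writes into data/corpus/ and data/link/, but it never creates those directories, so they must exist before the crawler runs. A minimal setup sketch is shown below; the directory layout is taken from the open() calls above, and running this once beforehand is an assumption, not part of the original gist.

import os

# Create the output directories the crawler expects (assumed one-time setup).
for d in ('data', 'data/corpus', 'data/link'):
    if not os.path.exists(d):
        os.makedirs(d)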