Created
September 14, 2014 18:02
-
-
Save mr-fool/68c2cb55adca97748c55 to your computer and use it in GitHub Desktop.
simple python 3 web crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import requests | |
import os | |
from bs4 import BeautifulSoup | |
def main(): | |
os.system('clear') # on linux / os x | |
#Gathering necessary input from user | |
choice = int( input("1 to use the basic function, 2 for the advance crawling, 3 for exit\n") ) | |
option(choice) | |
base_url = input("Please enter the base url\n"); | |
max_pages = int( input("Please enter maximum amount of that you want to crawl thru\n") ) | |
#Debug process | |
#print("base url",base_url) | |
#print("max pages", max_page) | |
def spider(base_url,max_pages): | |
page = 1 | |
while page <= max_pages: | |
url = base_url + str(page) | |
#storing all the information including headers in the variable source code | |
source_code = requests.get(url) | |
#sort source code and store only the plaintext | |
plain_text = source_code.text | |
#converting plain_text to Beautiful Soup object so the library can sort thru it | |
convert_data = BeautifulSoup(plain_text) | |
#sorting useful information | |
for link in convert_data.findAll('a', {'class': 'item-name'}): | |
href = base_url + link.get('href') #Building a clickable url | |
title = link.string #just the text not the html | |
#displaying the result back to the user | |
print(href) | |
print(title) | |
page = page + 1 | |
def option(choice): | |
if choice == 1 : | |
spider(base_url,max_pages) | |
elif choice == 3: | |
sys.exit() | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment