# alexohneander/scraper.py

## scraper.py
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Image alt-text report.
#
# Prompts for a start URL, collects every root-relative link on that page,
# then fetches each linked page and prints the alt attribute of every <img>
# found inside its <div id="master"> container.

# Seconds to wait for a server response. Without a timeout, requests.get()
# can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30

host = input('Enter Website: ')

# Fetch and parse the start page.
page = requests.get(host, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.text, 'html.parser')

# Collect the href of every anchor whose target starts with "/".
# dict.fromkeys() drops repeated hrefs while keeping first-seen order, so a
# page that is linked from several places is fetched and reported only once.
links = list(dict.fromkeys(
    anchor.get('href')
    for anchor in soup.find_all('a', attrs={'href': re.compile(r"^/")})
))

# Visit every collected link and report the images on that page.
for link in links:
    # urljoin() resolves the href against the entered URL. Plain string
    # concatenation produced wrong addresses when the entered URL ended in
    # "/" or contained a path, and for protocol-relative "//host/..." hrefs.
    page = requests.get(urljoin(host, link), timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.text, 'html.parser')

    print('')
    print('Sitename: ' + link)
    print('########################################')
    print('')

    # Only images inside <div id="master"> are reported. find() returns None
    # when a page has no such div; treat that as "no images" instead of
    # crashing the whole crawl with an AttributeError.
    projectList = soup.find("div", {"id": "master"})
    projectItems = projectList.find_all('img') if projectList is not None else []

    print('All Images with Alt Tags: ')
    print('----------------------------------------')
    for project in projectItems:
        # get() returns None for an <img> without an alt attribute, so such
        # images are printed as "None".
        print(project.get('alt'))