Skip to content

Instantly share code, notes, and snippets.

@alexohneander
Created December 21, 2017 08:57
Show Gist options
  • Save alexohneander/fbca4a2d9c03d13f46d888fadb873705 to your computer and use it in GitHub Desktop.
Save alexohneander/fbca4a2d9c03d13f46d888fadb873705 to your computer and use it in GitHub Desktop.
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return its body parsed with the 'html.parser' backend.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(page.text, 'html.parser')


def _collect_site_links(soup):
    """Return the hrefs of all <a> tags in *soup* whose href starts with "/".

    Each href is returned once, in first-seen order, so a link repeated on the
    page (e.g. in a header and a footer) is only crawled a single time.
    """
    links = []
    seen = set()
    for anchor in soup.find_all('a', attrs={'href': re.compile("^/")}):
        href = anchor.get('href')
        if href not in seen:
            seen.add(href)
            links.append(href)
    return links


def _report_alt_tags(host, link):
    """Print the alt attribute of every <img> inside div#master of one page.

    Args:
        host: URL of the site as entered by the user.
        link: href (starting with "/") of the page to inspect.

    A page that cannot be fetched, or that has no <div id="master">, is
    reported on stdout and skipped instead of aborting the whole crawl.
    """
    print('')
    print('Sitename: ' + link)
    print('########################################')
    print('')

    # urljoin resolves the href against the host correctly even when the host
    # carries a trailing slash or a path; plain concatenation would produce
    # URLs such as "https://example.com//about".
    url = urljoin(host, link)
    try:
        soup = _fetch_soup(url)
    except requests.RequestException as err:
        print('Could not fetch ' + url + ': ' + str(err))
        return

    # find() returns None when the page has no such div.
    container = soup.find("div", {"id": "master"})
    if container is None:
        print('No <div id="master"> found on this page.')
        return

    print('All Images with Alt Tags: ')
    print('----------------------------------------')
    for image in container.find_all('img'):
        # Images without an alt attribute print as None, which makes the
        # missing alt text visible in the report.
        print(image.get('alt'))


def main():
    """Prompt for a website, then list image alt texts for each linked page."""
    host = input('Enter Website: ').strip()
    start_page = _fetch_soup(host)
    for link in _collect_site_links(start_page):
        _report_alt_tags(host, link)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment