Skip to content

Instantly share code, notes, and snippets.

@cpv123
Created August 21, 2019 17:30
Show Gist options
  • Save cpv123/e938a56803f71215129687beeb9a7340 to your computer and use it in GitHub Desktop.
Save cpv123/e938a56803f71215129687beeb9a7340 to your computer and use it in GitHub Desktop.
import requests
import re
from bs4 import BeautifulSoup as bs
def test_urls_on_page(initial_page_url, timeout=10):
    """Fetch a page and check every absolute http(s) link found on it.

    Each ``<a href="http(s)://...">`` on the page is requested once and the
    URL is filed into exactly one bucket according to the outcome.

    Args:
        initial_page_url: URL of the page whose links should be checked.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a single unresponsive host would block the
            whole run indefinitely.

    Returns:
        A dict of lists of URLs with the keys:
            'successfull': links that answered with status 200
                (the misspelled key is kept so existing callers keep working),
            'not_found': links that answered with status 404,
            'failed': links whose request raised a ``requests`` exception
                (connection error, timeout, invalid URL, ...),
            'other': links that answered with any other status code.

    Raises:
        AssertionError: if the initial page does not answer with status 200.
        requests.RequestException: if the initial page cannot be fetched.
    """
    response = requests.get(initial_page_url, timeout=timeout)
    assert response.status_code == 200, (
        "initial page returned status %s" % response.status_code
    )

    html_soup = bs(response.content, "html.parser")
    # Anchor the pattern: the previous 'http*' meant "htt" followed by any
    # number of "p" and matched anywhere inside the href.
    absolute_link = re.compile(r'^https?://')
    urls_found = html_soup.find_all("a", {"href": absolute_link})

    successful = []
    not_found = []
    failed = []
    other = []

    for item in urls_found:
        url = item.get('href')
        try:
            link_response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network-level failure: the link could not be checked at all.
            failed.append(url)
            continue

        # Buckets are mutually exclusive; a 200 must not also land in 'other'.
        if link_response.status_code == 200:
            successful.append(url)
        elif link_response.status_code == 404:
            not_found.append(url)
        else:
            other.append(url)

    return {
        'successfull': successful,
        'not_found': not_found,
        'failed': failed,
        'other': other,
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment