Skip to content

Instantly share code, notes, and snippets.

@moh53n
Created August 19, 2022 20:05
Show Gist options
  • Save moh53n/5b6eb29c38c5979a1b7a8ee54b3c96e5 to your computer and use it in GitHub Desktop.
Save moh53n/5b6eb29c38c5979a1b7a8ee54b3c96e5 to your computer and use it in GitHub Desktop.
from time import sleep
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from lxml import etree
from lxml import html
# Small breadth-first crawler: starting from `entry`, fetch pages that live on
# ALLOWED_DOMAIN, record each response's status code and headers, and queue
# every <a href> found on the page. At most MAX_URLS queue entries are examined.

MAX_URLS = 20                 # number of queue entries examined before stopping
ALLOWED_DOMAIN = "microsoft.com"
REQUEST_DELAY_SECONDS = 2     # politeness delay before every request
REQUEST_TIMEOUT_SECONDS = 10  # without a timeout requests.get can block forever

header = {}        # url -> response headers
status = {}        # url -> HTTP status code
urls = []          # crawl queue, in discovery order (may contain duplicates)
visited_urls = []  # urls a request was attempted for, each listed once
entry = "https://microsoft.com/"
urls.append(entry)
counter = 0        # index of the next queue entry to examine


def _is_in_scope(url):
    """Return True if *url* is an http(s) URL on ALLOWED_DOMAIN or a subdomain.

    The host name is compared instead of doing a substring search on the whole
    URL, so foreign hosts that merely mention the domain in a path or query
    string (and non-web schemes such as ``mailto:``) are rejected.
    """
    try:
        parts = urlparse(url)
        host = parts.hostname or ""
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets).
        return False
    if parts.scheme not in ("http", "https"):
        return False
    return host == ALLOWED_DOMAIN or host.endswith("." + ALLOWED_DOMAIN)


# Stop at the budget OR when the queue is exhausted; indexing past the end of
# `urls` would raise IndexError when few links were discovered.
while counter < MAX_URLS and counter < len(urls):
    target = urls[counter]
    counter += 1

    if target in visited_urls or not _is_in_scope(target):
        continue
    visited_urls.append(target)

    sleep(REQUEST_DELAY_SECONDS)
    try:
        res = requests.get(target, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as exc:
        # Base class of MissingSchema, ConnectionError, Timeout, ...: one bad
        # URL or a network hiccup must not abort the whole crawl.
        print("skipping {}: {}".format(target, exc))
        continue

    header[target] = res.headers
    status[target] = res.status_code

    try:
        links = html.fromstring(res.content).xpath('//a/@href')
    except etree.ParserError:
        # lxml raises ParserError for an empty document (e.g. empty body).
        links = []

    for href in links:
        try:
            # Resolve relative and protocol-relative links against the final
            # (post-redirect) URL and drop the fragment so "#section" links
            # do not count as distinct pages.
            resolved = urldefrag(urljoin(res.url, href.strip())).url
        except ValueError:
            continue
        urls.append(resolved)

    print("==={}===".format(target))
    print(status[target])
    print(header[target])
    print(urls)

print(header)
print("===")
print(status)
print("===")
print(urls)
print("===")
print(visited_urls)
@shahryarjalilehvand
Copy link

👍🏽👌

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment