Created
August 6, 2021 19:38
-
-
Save yanissi/f0df4aae12361c8dd4527027376fc358 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get External Links List RAW
def ExternalLinkList(listPages):
    """Crawl each page URL and collect every external link found on it.

    For every URL in listPages the page HTML is fetched and all <a> anchors
    are scanned; an anchor is kept when its href is absolute (contains
    "http") and does not contain yourDomain (module-level global —
    NOTE(review): assumed to be defined before this function runs; confirm).

    Parameters:
        listPages: iterable of page URLs to scan (len() must work on it).

    Returns:
        list of [source_page_url, external_href, anchor_text] triples,
        possibly with duplicate hrefs (see getUniqueExternalLinks).
    """
    externalLinksListRaw = []
    length_list = len(listPages)
    user_agent = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
    for count, url in enumerate(listPages, start=1):
        # A timeout keeps the crawl from hanging forever on a dead host;
        # requests has no default timeout.
        request = requests.get(url, headers=user_agent, timeout=30)
        soup = BeautifulSoup(request.content, 'lxml')
        for link in soup.find_all("a"):
            # .get() returns None for anchors without an href attribute —
            # the original used a bare except that also hid real bugs.
            href = link.get("href")
            if href is None:
                continue
            # Keep only absolute links that point outside our own domain.
            if yourDomain not in href and "http" in href:
                externalLinksListRaw.append([url, href, link.text])
        print(count, "pages checked out of ", length_list, ".")
    return externalLinksListRaw
# Get External Links List Unique Values
def getUniqueExternalLinks(externalLinksListRaw):
    """Return the distinct external hrefs, preserving first-seen order.

    Parameters:
        externalLinksListRaw: list of [source_url, href, anchor_text]
            triples as produced by ExternalLinkList.

    Returns:
        list of unique href strings (element [1] of each triple), in the
        order each href first appeared.
    """
    # dict.fromkeys deduplicates in O(n) and keeps insertion order
    # (guaranteed since Python 3.7) — the original scanned the result
    # list for membership on every element, which is O(n^2).
    return list(dict.fromkeys(link[1] for link in externalLinksListRaw))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment