Skip to content

Instantly share code, notes, and snippets.

@ahmadthedev
Created July 12, 2023 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahmadthedev/f0abde8a98c478c9bd781df6cda745e1 to your computer and use it in GitHub Desktop.
import pandas as pd
import requests, csv, time, lxml, cchardet, re
from bs4 import BeautifulSoup
import concurrent.futures
# Warm a site's cache: fetch every <loc> URL from the sitemap and report failures.
sitemap = ''  # Sitemap URL — must be set before running
s = requests.session()  # reuse one session so keep-alive connections are recycled
sm = s.get(sitemap)
sm_soup = BeautifulSoup(sm.text, 'lxml')
links = [x.text for x in sm_soup.find_all('loc')]
print('Found {} links'.format(len(links)))

i = 0                # number of URLs processed so far
failed_urls = []     # URLs that returned a non-200 status
success_urls = []    # URLs fetched successfully (skipped on re-entry)
print('{} URLs already warmed up'.format(len(success_urls)))
for link in links:
    if i == 0:
        last_time = time.time()  # start the timer on the first URL
    if i % 100 == 0:  # progress report every 100 URLs (was `== False`; True only because False == 0)
        print(" - {} completed -- took {} seconds".format(i, round(time.time() - last_time)))
        last_time = time.time()
    if link in success_urls:
        continue  # already warmed in this run
    req = s.get(link)
    if req.status_code == 200:
        success_urls.append(link)
        time.sleep(0.2)  # throttle so the warm-up doesn't hammer the server
    else:
        failed_urls.append(link)
    i = i + 1

if len(failed_urls) > 0:
    print('')
    print('')
    # bug fix: original printed the literal "{} URLs failed: " — the format arg was missing
    print("{} URLs failed: ".format(len(failed_urls)))
    for x in failed_urls:
        print(" - {}".format(x))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment