@hn-support
Last active July 18, 2023 09:07
A threaded cache warmer in Python
#!/usr/bin/env python3
"""
Warm the caches of your website by crawling each page defined in sitemap.xml.
To use, download this file and make it executable. Then run:
./cache-warmer.py --threads 4 --file /data/web/public/sitemap.xml -v
"""
import argparse
import multiprocessing.pool as mpool
import os.path
import re
import sys
import time

import requests

results = []
start = time.time()


def parse_options():
    parser = argparse.ArgumentParser(description="""Cache crawler based on a sitemap.xml file""")
    parser.add_argument('-t', '--threads', help='How many threads to use', default=10, required=False, type=int)
    parser.add_argument('-f', '--file', help='The sitemap xml file', required=True, type=str)
    parser.add_argument('-v', '--verbose', help='Be more verbose', action='store_true', default=False)
    args = parser.parse_args()
    if not os.path.isfile(args.file):
        parser.error('Could not find sitemap file %s' % args.file)
    return args


def crawl_url(url, verbose=False):
    if verbose:
        print("Crawling {}".format(url))
    a = requests.get(url, headers={"user-agent": "SitemapCacheWarmer"})
    # requests.Response.ok is a property, not a method
    return {'exit': 0 if a.ok else 1, 'out': a.text, 'url': url}


def make_results():
    errcount = 0
    exec_time = format(time.time() - start, '.4f')
    for item in results:
        if item['exit'] == 0:
            continue
        else:
            errcount += 1
            print("Errors detected in %s:\n%s\n" % (item['url'], item['out']))
    print("=" * 50)
    if errcount == 0:
        print("All DONE! - All urls are warmed! - done in %ss" % exec_time)
        return 0
    else:
        print("%d Errors detected! - done in %ss" % (errcount, exec_time))
        return 1


def get_sitemap_urls(filename):
    # extract every <loc>...</loc> entry from the sitemap
    with open(filename) as fh:
        return re.findall('<loc>(.*?)</loc>', fh.read())


def callback(output):
    results.append(output)


def main():
    args = parse_options()
    sitemap_urls = get_sitemap_urls(args.file)
    if args.verbose:
        print("Crawling {} urls with {} threads\n[Please Wait!]".format(len(sitemap_urls), args.threads))
        print("=" * 50)
    pool = mpool.ThreadPool(args.threads)
    for url in sitemap_urls:
        pool.apply_async(crawl_url, args=(url, args.verbose), callback=callback)
    pool.close()
    pool.join()
    sys.exit(make_results())


if __name__ == "__main__":
    main()
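
One caveat with the pool wiring in main(): apply_async only invokes callback when crawl_url returns normally, so a URL whose request raises (connection error, timeout) silently drops out of results. A minimal sketch of one way to surface those failures, using the error_callback parameter available on Python 3; the on_error helper and errors list below are hypothetical and not part of the script above:

errors = []  # hypothetical: exceptions raised by crawl_url

def on_error(exc):
    # invoked by the pool instead of callback when crawl_url raises
    errors.append(exc)

# inside main(), replacing the plain apply_async call:
for url in sitemap_urls:
    pool.apply_async(crawl_url, args=(url, args.verbose),
                     callback=callback, error_callback=on_error)

After pool.join(), make_results() could report len(errors) alongside the HTTP errors. You might also pass a timeout= to requests.get so a stuck URL cannot hang a worker thread indefinitely.
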
tdgroot commented Sep 23, 2019

If you encounter the error 'ImportError: No module named ordered_dict', please run the following command:
pip install -U requests

hongxy commented May 6, 2020

Can this warmer run URL requests in batches? For example, I want to run the script every 10 minutes, with each run crawling 100 URLs instead of every URL in the sitemap. Thanks!
Jackey
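
One way to get batching (a sketch only; the --offset and --limit options below are hypothetical and not part of the gist) is to slice the URL list before it reaches the pool and let whatever schedules the script, e.g. cron, advance the offset between runs:

# hypothetical extra arguments in parse_options()
parser.add_argument('-o', '--offset', help='Skip this many urls', default=0, type=int)
parser.add_argument('-l', '--limit', help='Crawl at most this many urls per run', default=0, type=int)

# in main(), right after get_sitemap_urls()
sitemap_urls = sitemap_urls[args.offset:]
if args.limit:
    sitemap_urls = sitemap_urls[:args.limit]
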

svsites commented Jul 24, 2020

How can I crawl a specific page URL instead of using a sitemap?
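
crawl_url() only needs a plain URL, so for a one-off page you can skip the sitemap entirely. A minimal standalone sketch that mirrors crawl_url(); the file name warm-one.py and the example URL are made up:

#!/usr/bin/env python3
# warm-one.py: warm a single page instead of a whole sitemap
import sys
import requests

url = sys.argv[1]  # e.g. ./warm-one.py https://example.com/some-page/
resp = requests.get(url, headers={"user-agent": "SitemapCacheWarmer"})
print(url, "OK" if resp.ok else "FAILED with status %d" % resp.status_code)
sys.exit(0 if resp.ok else 1)
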
