A threaded cache warmer in Python
#!/usr/bin/env python
"""
Warm the caches of your website by crawling each page defined in sitemap.xml.
To use, download this file and make it executable. Then run:
./cache-warmer.py --threads 4 --file /data/web/public/sitemap.xml -v
"""
import argparse
import multiprocessing.pool as mpool
import os.path
import re
import sys
import time

import requests

results = []
start = time.time()


def parse_options():
    parser = argparse.ArgumentParser(description="""Cache crawler based on a sitemap.xml file""")
    parser.add_argument('-t', '--threads', help='How many threads to use', default=10, required=False, type=int)
    parser.add_argument('-f', '--file', help='The sitemap xml file', required=True, type=str)
    parser.add_argument('-v', '--verbose', help='Be more verbose', action='store_true', default=False)
    args = parser.parse_args()
    if not os.path.isfile(args.file):
        parser.error('Could not find sitemap file %s' % args.file)
    return args


def crawl_url(url, verbose=False):
    if verbose:
        print("Crawling {}".format(url))
    a = requests.get(url, headers={"user-agent": "SitemapCacheWarmer"})
    # Response.ok is a property, not a method.
    return {'exit': 0 if a.ok else 1, 'out': a.text, 'url': url}


def make_results():
    errcount = 0
    exec_time = format(time.time() - start, '.4f')
    for item in results:
        if item['exit'] == 0:
            continue
        errcount += 1
        print("Errors detected in %s:\n%s\n" % (item['url'], item['out']))
    print("=" * 50)
    if errcount == 0:
        print("All DONE! - All urls are warmed! - done in %ss" % exec_time)
        return 0
    print("%d Errors detected! - done in %ss" % (errcount, exec_time))
    return 1


def get_sitemap_urls(filename):
    # Extract every <loc> entry from the sitemap file.
    with open(filename) as fh:
        return re.findall('<loc>(.*?)</loc>', fh.read())


def callback(output):
    results.append(output)


def main():
    args = parse_options()
    sitemap_urls = get_sitemap_urls(args.file)
    if args.verbose:
        print("Crawling {} urls with {} threads\n[Please Wait!]".format(len(sitemap_urls), args.threads))
        print("=" * 50)
    pool = mpool.ThreadPool(args.threads)
    for url in sitemap_urls:
        # Pass the verbose flag through so -v also logs each crawled URL.
        pool.apply_async(crawl_url, args=(url, args.verbose), callback=callback)
    pool.close()
    pool.join()
    sys.exit(make_results())


if __name__ == "__main__":
    main()
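
To see what get_sitemap_urls pulls out of a sitemap file, here is a minimal sketch run against an inline sitemap string (the example.com URLs are placeholders, not from the gist):

import re

sitemap = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

# Same pattern as get_sitemap_urls: every <loc> entry becomes a URL to crawl.
print(re.findall('<loc>(.*?)</loc>', sitemap))
# ['https://example.com/', 'https://example.com/about']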
tdgroot commented Sep 23, 2019

If you encounter the error 'ImportError: No module named ordered_dict', please run the following command:
pip install -U requests

hongxy commented May 6, 2020

Can this warmer run URL requests in batches? For example, I want to run the script every 10 minutes, and have each run crawl 100 URLs instead of all the URLs in the sitemap. Thanks!
Jackey
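
One way to do that (a sketch, not part of the gist; the state file path and batch size are illustrative assumptions) is to persist an offset between runs and crawl a fixed-size slice of the sitemap each time, letting cron handle the every-10-minutes schedule:

import json
import os

STATE_FILE = '/tmp/cache-warmer-offset.json'  # hypothetical location for the saved offset
BATCH_SIZE = 100


def next_batch(urls):
    # Resume from the offset saved by the previous run (start at 0 if none).
    offset = 0
    if os.path.isfile(STATE_FILE):
        with open(STATE_FILE) as fh:
            offset = json.load(fh).get('offset', 0)
    batch = urls[offset:offset + BATCH_SIZE]
    # Wrap around once the whole sitemap has been covered.
    new_offset = offset + BATCH_SIZE if offset + BATCH_SIZE < len(urls) else 0
    with open(STATE_FILE, 'w') as fh:
        json.dump({'offset': new_offset}, fh)
    return batch

In main() you would then feed next_batch(sitemap_urls) to the thread pool instead of the full list, and a crontab entry such as */10 * * * * would cover the every-10-minutes part.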
