Skip to content

Instantly share code, notes, and snippets.

@aquinzi
Created December 16, 2013 23:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aquinzi/7996821 to your computer and use it in GitHub Desktop.
Save aquinzi/7996821 to your computer and use it in GitHub Desktop.
Creates a simple sitemap
"""
Creates a simple sitemap by walking a source directory and translating each file path to a site URL, applying these settings and filters:
PATH_SITE: source path, must include / at the end; also accessible from the CLI with -f
SITE_URL: must include the final /; also accessible from the CLI with -s
EXTENSIONS_ACCEPTED: ('ext1', 'ext2')
SITEMAP_FILE: also accessible from the CLI with -xml
IGNORE_FILES: ('error.php', '.htaccess')
IGNORE_FOLDERS: ('.svn', 'imgs', 'src')
PRIORITY: from 0.0 to 1.0; also accessible from the CLI with -p
LASTMOD: boolean, read each file's modification time; also accessible from the CLI with -m
FREQUENCY: a string; also accessible from the CLI with -freq
ROBOTS: boolean; also accessible from the CLI with -r
"""
# ----------------------
# config
# ----------------------
PATH_SITE = ""  # source directory; must include final /
SITE_URL = "www.example.com/"  # must include final /
# The trailing comma matters: ('php') is just the string 'php', and iterating
# over it would yield the single characters 'p', 'h', 'p'.
EXTENSIONS_ACCEPTED = ('php',)
SITEMAP_FILE = "sitemap.xml"
IGNORE_FILES = ('error.php', '.htaccess', 'config.php')
IGNORE_FOLDERS = ('.svn', 'imgs', 'src')
PRIORITY = 0.5  # default from schema; a float from 0.0 to 1.0, or an integer from 0 to 10 (divided by 10)
LASTMOD = False  # boolean, check in files for modify times
FREQUENCY = 'monthly'  # leave empty if you don't want this
ROBOTS = False  # boolean. False -> outputs the robots line, True creates new robots file with that line
# ----------------------
# stop touching!
# ----------------------
import os
import sys
import datetime

# pathname2url is imported from urllib.request on Python 3 and from urllib
# on any other major version.
if sys.version_info[0] != 3:
    from urllib import pathname2url
else:
    from urllib.request import pathname2url
# Frequency values accepted by check_frequency().
FREQUENCY_VALID = (
    'always',
    'hourly',
    'daily',
    'weekly',
    'monthly',
    'yearly',
    'never',
)

# Command-line switches accepted by argv_process().
ARGS_OPTIONS = (
    '-f',
    '-s',
    '-p',
    '-freq',
    '-m',
    '-r',
    '-xml',
)

# Whole-document template; {body} receives the concatenated <url> entries.
TPL_SITEMAP_DOC = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    '{body}\n'
    '</urlset>\n'
)

# One <url> entry; {rest} receives the optional lines built from
# TPL_SITEMAP_ITEM_REST (each one prefixed with a newline by the caller).
TPL_SITEMAP_ITEM_BASE = (
    '\n'
    '<url>\n'
    '<loc>{url}</loc>\n'
    '<priority>{priority}</priority>{rest}\n'
    '</url>\n'
)

# Optional per-entry lines: index 0 is <lastmod>, index 1 is <changefreq>.
TPL_SITEMAP_ITEM_REST = (
    ' <lastmod>{date}</lastmod>',
    ' <changefreq>{freq}</changefreq>',
)
def check_priority(number):
    """Validate a sitemap priority and return it as a float in [0.0, 1.0].

    Args:
        number: a float between 0.0 and 1.0, or an integer between 0 and 10
            (an integer is divided by 10 first).

    Returns:
        The priority as a float between 0.0 and 1.0.

    Raises:
        SystemExit: with a non-zero status when the value is out of range or
            is neither an int nor a float.
    """
    if isinstance(number, int):
        # Integers are read on a 0-10 scale.
        number = float(number) / 10
    if isinstance(number, float) and 0.0 <= number <= 1.0:
        return number
    print(" Something wrong with priority number ")
    # A validation failure must not report success to the calling shell.
    sys.exit(1)
def check_frequency(freq):
    """Validate a change-frequency string and return it lower-cased.

    Args:
        freq: one of the FREQUENCY_VALID values (case-insensitive), or an
            empty value / None to leave the frequency out of the sitemap.

    Returns:
        The lower-cased frequency, or an empty string when none was given.

    Raises:
        SystemExit: with a non-zero status when the value is not one of
            FREQUENCY_VALID.
    """
    if not freq:
        # Covers both '' and None; None used to crash on .lower().
        return ""
    normalized = freq.lower()
    if normalized not in FREQUENCY_VALID:
        print(" Something wrong with frequency ")
        # A validation failure must not report success to the calling shell.
        sys.exit(1)
    return normalized
def _as_tuple(value):
    """Return *value* as a tuple, wrapping a bare string in a 1-tuple."""
    if isinstance(value, str):
        return (value,)
    return tuple(value)


def list_files():
    """Collect the files under PATH_SITE that belong in the sitemap.

    Directories named in IGNORE_FOLDERS are not descended into, files named
    in IGNORE_FILES are skipped, and only files whose name ends with one of
    EXTENSIONS_ACCEPTED are kept.

    Returns:
        A list of ``[path, date]`` lists. ``date`` is the file's modification
        date (``YYYY-MM-DD``) when LASTMOD is true and an empty string
        otherwise.
    """
    # A setting written as ('php') instead of ('php',) is a plain string;
    # normalise it so it is neither iterated character by character nor
    # used for substring matching.
    extensions = _as_tuple(EXTENSIONS_ACCEPTED)
    ignored_files = _as_tuple(IGNORE_FILES)
    ignored_folders = _as_tuple(IGNORE_FOLDERS)

    found = []
    for root, sub_folders, files in os.walk(PATH_SITE):
        # Pruning the list in place stops os.walk from entering those folders.
        sub_folders[:] = [d for d in sub_folders if d not in ignored_folders]
        for name in files:
            if name in ignored_files or not name.endswith(extensions):
                continue
            complete_path = os.path.join(root, name)
            meta = ""
            if LASTMOD:
                mtime = os.path.getmtime(complete_path)
                meta = datetime.date.fromtimestamp(mtime).isoformat()  # only date
            found.append([complete_path, meta])
    return found
def make_url(files):
    """Turn the file-system paths in *files* into site URLs, in place.

    Args:
        files: list of ``[path, date]`` lists as produced by list_files();
            element 0 of every entry is overwritten with its URL.

    Returns:
        The same *files* list, for convenience.
    """
    base = SITE_URL if SITE_URL.endswith("/") else SITE_URL + "/"
    if "://" not in base:
        # The sitemap protocol requires <loc> to start with the scheme;
        # http:// matches what make_robots() prepends to SITE_URL.
        base = "http://" + base
    for entry in files:
        # Work from the path relative to the site root instead of a blind
        # str.replace(): that replaced every occurrence of PATH_SITE, left a
        # double slash when PATH_SITE had no trailing separator, and broke
        # completely on an empty PATH_SITE.
        relative = os.path.relpath(entry[0], PATH_SITE or os.curdir)
        entry[0] = base + pathname2url(relative)
    return files
def make_sitemap_body(list_urls):
    """Render every entry of *list_urls* as a <url> element.

    Args:
        list_urls: sequence of entries where item 0 is the URL and item 1 is
            the last-modification date (only read when LASTMOD is true).

    Returns:
        All rendered <url> elements concatenated into one string.
    """
    # The <changefreq> line is identical for every entry, so build it once.
    shared_extras = ""
    if FREQUENCY:
        shared_extras = '\n' + TPL_SITEMAP_ITEM_REST[1].format(freq=FREQUENCY)

    def render(entry):
        extras = shared_extras
        if LASTMOD:
            extras = extras + '\n' + TPL_SITEMAP_ITEM_REST[0].format(date=entry[1])
        return TPL_SITEMAP_ITEM_BASE.format(
            url=entry[0], priority=PRIORITY, rest=extras
        )

    return "".join([render(entry) for entry in list_urls])
def make_sitemap(list_urls):
    """Build the sitemap document for *list_urls* and write it to disk.

    The file is written to SITEMAP_FILE joined onto PATH_SITE.
    """
    document = TPL_SITEMAP_DOC.format(body=make_sitemap_body(list_urls))
    save(os.path.join(PATH_SITE, SITEMAP_FILE), document)
def make_robots():
    """Write or print the robots.txt line that points at the sitemap.

    When ROBOTS is true a new robots.txt containing only that line is
    written inside PATH_SITE; otherwise the line is printed.
    """
    path = os.path.join(PATH_SITE, "robots.txt")
    # Do not double the scheme if SITE_URL already carries one.
    base = SITE_URL if "://" in SITE_URL else "http://" + SITE_URL
    # Use the configured file name: the sitemap may have been renamed through
    # SITEMAP_FILE / -xml, so a hard-coded "sitemap.xml" could point nowhere.
    text = "Sitemap: " + base + SITEMAP_FILE
    if ROBOTS:
        save(path, text)
    else:
        print(" This is your robot.txt line (you could do this yourself anyway) ")
        print("\n " + text)
def save(path, text):
    """Write *text* to the file at *path*, replacing any previous content."""
    with open(path, 'w') as sink:
        sink.write(text)
def make_it(list_urls):
    """Generate the sitemap for *list_urls*, then handle robots.txt.

    Args:
        list_urls: the URL entries to pass on to make_sitemap().
    """
    # Use the argument, not the module-level all_urls the caller happens to
    # define; otherwise the parameter is silently ignored.
    make_sitemap(list_urls)
    make_robots()
def argv_process(args):
    """Parse ``option value`` pairs from the command line into a dict.

    Args:
        args: the command-line arguments without the program name, expected
            as alternating option / value items.

    Returns:
        A dict mapping each option to its value. ``-m`` and ``-r`` values are
        converted to bool and the ``-p`` value to float; all other values
        stay strings.

    Raises:
        SystemExit: with a non-zero status when a value is missing, an option
            is not in ARGS_OPTIONS, or the ``-p`` value is not a number.
    """
    # Spellings that switch a boolean option off. Everything else counts as
    # true, which keeps earlier invocations working; a plain bool() call
    # treated every non-empty string, "false" included, as True.
    false_words = ('', '0', 'false', 'no', 'off', 'n', 'f')

    # make dict with key : value
    new_args = {}
    for i in range(0, len(args), 2):
        key = args[i]
        # An option at the very end, or directly followed by another option,
        # has no value.
        if i + 1 >= len(args) or args[i + 1].startswith("-"):
            print("missing value ")
            sys.exit(1)
        new_args[key] = args[i + 1]

    # check all ok
    for key in list(new_args):
        if key not in ARGS_OPTIONS:
            print("Wrong option ")
            sys.exit(1)
        if key in ('-m', '-r'):
            new_args[key] = new_args[key].strip().lower() not in false_words
        elif key == '-p':
            try:
                new_args[key] = float(new_args[key])
            except ValueError:
                # Report a non-numeric priority instead of dying on a traceback.
                print(" Something wrong with priority number ")
                sys.exit(1)
    return new_args
# ----------------------
# script body: read the command line, validate the settings, generate
# ----------------------
if len(sys.argv) > 1:
    # A help flag anywhere among the arguments wins over every other option.
    # argv[0] is the program name and is not an argument.
    if any(item in ("help", "-h", "--help") for item in sys.argv[1:]):
        print(" Basic Sitemap generation, edit file or use this options: ")
        print(' -f for path in filesystem/source folder')
        print(' -s site url')
        print(' -p priority. 0.0 - 1.0 ')
        print(' -freq frequency (always, never, monthly...) ')
        print(' -m boolean, check for file modification date ')
        print(' -r boolean, create new robot file (true) ')
        print(' -xml sitemap name ')
        sys.exit(0)
    options = argv_process(sys.argv[1:])
    # map options to vars
    for key, value in options.items():
        if key == '-f':
            PATH_SITE = value
        elif key == '-s':
            SITE_URL = value
        elif key == '-p':
            PRIORITY = value
        elif key == '-freq':
            FREQUENCY = value
        elif key == '-m':
            LASTMOD = value
        elif key == '-r':
            ROBOTS = value
        elif key == '-xml':
            SITEMAP_FILE = value

if not os.path.isdir(PATH_SITE):
    # takes current working dir
    PATH_SITE = os.getcwd()
# The path-to-URL mapping needs both prefixes to end with a separator;
# os.getcwd() and hand-typed values usually do not, so enforce it here.
PATH_SITE = os.path.join(PATH_SITE, "")
if not SITE_URL.endswith("/"):
    SITE_URL += "/"

PRIORITY = check_priority(PRIORITY)
FREQUENCY = check_frequency(FREQUENCY)
allFiles = list_files()
all_urls = make_url(allFiles)
make_it(all_urls)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment