Created
December 16, 2013 23:28
-
-
Save aquinzi/7996821 to your computer and use it in GitHub Desktop.
Creates a simple sitemap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Creates a simple sitemap, taking a source dir and translating it to a site url, applying filters:
    PATH_SITE: source path, must include / at the end; also accessible from cli with -f
    SITE_URL: must include final /; also accessible from cli with -s
    EXTENSIONS_ACCEPTED: ('ext1', 'ext2')
    SITEMAP_FILE: also accessible from cli with -xml
    IGNORE_FILES: ('error.php', '.htaccess')
    IGNORE_FOLDERS: ('.svn', 'imgs', 'src')
    PRIORITY: from 0.0 to 1.0; also accessible from cli with -p
    LASTMOD: boolean, check in files for modify times; also accessible from cli with -m
    FREQUENCY: a string; also accessible from cli with -freq
    ROBOTS: boolean; also accessible from cli with -r
""" | |
# ---------------------- | |
# config | |
# ---------------------- | |
PATH_SITE = ""  # source folder on disk; must include final /
SITE_URL = "www.example.com/"  # must include final /
# The trailing comma matters: ('php') is just the string 'php', and iterating
# it yields single letters, so every file ending in "p" or "h" would match.
EXTENSIONS_ACCEPTED = ('php',)
SITEMAP_FILE = "sitemap.xml"
IGNORE_FILES = ('error.php', '.htaccess', 'config.php')
IGNORE_FOLDERS = ('.svn', 'imgs', 'src')
PRIORITY = 0.5  # default from schema; a float 0.0 - 1.0, or an integer 0 - 10 (divided by 10)
LASTMOD = False  # boolean, check in files for modify times
FREQUENCY = 'monthly'  # leave empty if you dont want this
ROBOTS = False  # boolean. False -> only prints the robots line, True -> writes a new robots.txt with that line
# ---------------------- | |
# stop touching! | |
# ---------------------- | |
import os | |
import sys | |
import datetime | |
if sys.version_info[0] == 3: | |
from urllib.request import pathname2url | |
else: | |
from urllib import pathname2url | |
# Values accepted for FREQUENCY (validated by check_frequency).
FREQUENCY_VALID = (
    'always',
    'hourly',
    'daily',
    'weekly',
    'monthly',
    'yearly',
    'never',
)

# Command-line switches understood by argv_process.
ARGS_OPTIONS = (
    '-f',
    '-s',
    '-p',
    '-freq',
    '-m',
    '-r',
    '-xml',
)

# Whole-document wrapper; {body} receives the concatenated <url> items.
TPL_SITEMAP_DOC = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    '{body}\n'
    '</urlset>\n'
)

# One <url> entry; {rest} receives the optional lines built from
# TPL_SITEMAP_ITEM_REST (each one prefixed with a newline by the caller).
TPL_SITEMAP_ITEM_BASE = (
    '\n'
    '<url>\n'
    '<loc>{url}</loc>\n'
    '<priority>{priority}</priority>{rest}\n'
    '</url>\n'
)

# Optional per-item lines: index 0 is the last-modified date, index 1 the
# change frequency.
TPL_SITEMAP_ITEM_REST = (
    ' <lastmod>{date}</lastmod>',
    ' <changefreq>{freq}</changefreq>',
)
def check_priority(number):
    """Validate a sitemap priority and return it as a float.

    Integers are read on a 0 - 10 scale and divided by 10; floats are used
    as given. The resulting value must lie between 0.0 and 1.0 inclusive.

    Any other input prints an error message and terminates the program with
    a non-zero exit status (SystemExit).
    """
    # bool is a subclass of int, but True/False are not meaningful priorities.
    if isinstance(number, bool):
        number = None
    elif isinstance(number, int):
        number = float(number) / 10

    if isinstance(number, float) and 0.0 <= number <= 1.0:
        return number

    print(" Something wrong with priority number ")
    # Non-zero status so calling scripts can detect the failure.
    sys.exit(1)
def check_frequency(freq):
    """Validate the change frequency and return it lower-cased.

    An empty value (empty string or None) means "no frequency" and is
    returned as an empty string without further checks. Any non-empty value
    must be one of FREQUENCY_VALID (compared case-insensitively); otherwise
    an error is printed and the program terminates with a non-zero exit
    status (SystemExit).
    """
    # Guard first: calling .lower() on None would raise AttributeError.
    if not freq:
        return ""

    freq = freq.lower()
    if freq not in FREQUENCY_VALID:
        print(" Something wrong with frequency ")
        sys.exit(1)
    return freq
def list_files():
    """Collect the files under PATH_SITE that belong in the sitemap.

    Folders named in IGNORE_FOLDERS are not descended into, files named in
    IGNORE_FILES are skipped, and only files whose extension is listed in
    EXTENSIONS_ACCEPTED are kept.

    Returns a list of ``[path, date]`` pairs. ``date`` is the file's
    modification date as ``YYYY-MM-DD`` when LASTMOD is true, otherwise an
    empty string.
    """
    extensions = EXTENSIONS_ACCEPTED
    # A bare string (e.g. ('php') written without the trailing comma) would
    # otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Match on a real extension boundary so "foophp" is not taken for ".php".
    suffixes = tuple('.' + ext.lstrip('.') for ext in extensions)

    found = []
    for root, sub_folders, files in os.walk(PATH_SITE):
        # Prune in place: os.walk only skips folders removed from this list.
        sub_folders[:] = [d for d in sub_folders if d not in IGNORE_FOLDERS]

        for name in files:
            if name in IGNORE_FILES or not name.endswith(suffixes):
                continue

            complete_path = os.path.join(root, name)
            meta = ""
            if LASTMOD:
                mtime = os.path.getmtime(complete_path)
                meta = datetime.date.fromtimestamp(mtime).isoformat()  # only date
            found.append([complete_path, meta])
    return found
def make_url(files):
    """Turn the filesystem paths in *files* into site URLs, in place.

    *files* is a list of ``[path, date]`` pairs as produced by list_files.
    Each path is made relative to PATH_SITE, converted to URL form and
    appended to SITE_URL. The same list object is returned.
    """
    # <loc> entries must start with the protocol; add one when SITE_URL
    # was given as a bare host.
    base = SITE_URL if '://' in SITE_URL else 'http://' + SITE_URL
    if not base.endswith('/'):
        base += '/'

    # An empty PATH_SITE means "current directory".
    root = PATH_SITE or os.curdir

    for entry in files:
        # relpath strips the source folder exactly once and does not depend
        # on PATH_SITE carrying a trailing separator (str.replace did).
        relative = os.path.relpath(entry[0], root)
        # Only the path part is quoted; quoting the whole URL would mangle
        # the "://" of the protocol.
        entry[0] = base + pathname2url(relative)
    return files
def make_sitemap_body(list_urls):
    """Build the ``<url>`` entries of the sitemap as one string.

    *list_urls* is a sequence of ``[url, date]`` pairs. Every entry gets the
    global PRIORITY; a change-frequency line is added when FREQUENCY is set
    and a last-modified line when LASTMOD is true.
    """
    # Local import so this function does not rely on the file's import block.
    from xml.sax.saxutils import escape

    # The frequency line is identical for every item, so build it once.
    frequency_line = ""
    if FREQUENCY:
        frequency_line = '\n' + TPL_SITEMAP_ITEM_REST[1].format(freq=FREQUENCY)

    items = []
    for entry in list_urls:
        rest = frequency_line
        if LASTMOD:
            rest += '\n' + TPL_SITEMAP_ITEM_REST[0].format(date=entry[1])
        items.append(TPL_SITEMAP_ITEM_BASE.format(
            # Sitemap files are XML: &, < and > in a URL must be entity-escaped.
            url=escape(entry[0]),
            priority=PRIORITY,
            rest=rest,
        ))
    return "".join(items)
def make_sitemap(list_urls):
    """Render the sitemap document for *list_urls* and write it to disk.

    The file is written to SITEMAP_FILE inside PATH_SITE.
    """
    document = TPL_SITEMAP_DOC.format(body=make_sitemap_body(list_urls))
    target = os.path.join(PATH_SITE, SITEMAP_FILE)
    save(target, document)
def make_robots():
    """Emit the robots.txt ``Sitemap:`` line for the generated sitemap.

    When ROBOTS is true the line is written to a new robots.txt inside
    PATH_SITE; otherwise it is only printed so it can be added by hand.
    """
    # Add the protocol only when SITE_URL does not already carry one.
    base = SITE_URL if '://' in SITE_URL else "http://" + SITE_URL
    # Use the configured file name: the sitemap may have been renamed (-xml).
    text = "Sitemap: " + base + SITEMAP_FILE

    if ROBOTS:
        save(os.path.join(PATH_SITE, "robots.txt"), text)
    else:
        print(" This is your robot.txt line (you could do this yourself anyway) ")
        print("\n " + text)
def save(path, text):
    """Write *text* to the file at *path*, replacing any existing content."""
    with open(path, 'w') as handle:
        handle.write(text)
def make_it(list_urls):
    """Write the sitemap for *list_urls*, then handle the robots.txt line."""
    # Use the argument, not the module-level ``all_urls``: the function must
    # work for whatever list the caller passes in.
    make_sitemap(list_urls)
    make_robots()
def argv_process(args):
    """Parse ``-option value`` pairs from the command line into a dict.

    *args* is the argument list without the program name. Arguments are read
    two at a time as (option, value). Every option must be one of
    ARGS_OPTIONS and be followed by a value that does not start with "-".

    Values of ``-m`` and ``-r`` are converted to bool (true/false, yes/no,
    y/n, on/off, 1/0, case-insensitive) and the value of ``-p`` to float;
    all other values stay strings.

    On any error a message is printed and the program terminates with a
    non-zero exit status (SystemExit).
    """
    true_words = ('1', 'true', 'yes', 'y', 'on')
    false_words = ('0', 'false', 'no', 'n', 'off')

    # make dict with key : value
    pairs = {}
    for i in range(0, len(args), 2):
        key = args[i]
        try:
            val = args[i + 1]
        except IndexError:
            # Odd number of arguments: the last option has no value.
            val = None
        if val is None or val.startswith("-"):
            print("missing value ")
            sys.exit(1)
        pairs[key] = val

    # check all ok and convert the typed options
    parsed = {}
    for key, value in pairs.items():
        if key not in ARGS_OPTIONS:
            print("Wrong option ")
            sys.exit(1)

        if key in ('-m', '-r'):
            # bool("false") is True, so the words are mapped explicitly.
            word = value.lower()
            if word in true_words:
                value = True
            elif word in false_words:
                value = False
            else:
                print("Wrong value for " + key + " ")
                sys.exit(1)
        elif key == '-p':
            try:
                value = float(value)
            except ValueError:
                print("Wrong value for " + key + " ")
                sys.exit(1)

        parsed[key] = value
    return parsed
# ----------------------
# script entry: read the command line, validate the settings, generate
# ----------------------
if len(sys.argv) > 1:
    # A help flag anywhere on the command line prints the usage and stops.
    if any(item in ("help", "-h", "--help") for item in sys.argv):
        print(" Basic Sitemap generation, edit file or use this options: ")
        print(' -f for path in filesystem/source folder')
        print(' -s site url')
        print(' -p priority. 0.0 - 1.0 ')
        print(' -freq frequency (always, never, monthly...) ')
        print(' -m boolean, check for file modification date ')
        print(' -r boolean, create new robot file (true) ')
        print(' -xml sitemap name ')
        sys.exit(0)

    options = argv_process(sys.argv[1:])

    # map options to vars
    for key, value in options.items():
        if key == '-f':
            PATH_SITE = value
        elif key == '-s':
            SITE_URL = value
        elif key == '-p':
            PRIORITY = value
        elif key == '-freq':
            FREQUENCY = value
        elif key == '-m':
            LASTMOD = value
        elif key == '-r':
            ROBOTS = value
        elif key == '-xml':
            SITEMAP_FILE = value

if not os.path.isdir(PATH_SITE):
    # takes current working dir
    PATH_SITE = os.getcwd()

# The rest of the script expects both values to end with a separator;
# os.getcwd() and hand-typed arguments usually do not, which produced URLs
# with a doubled or missing "/".
PATH_SITE = os.path.join(PATH_SITE, "")
if not SITE_URL.endswith("/"):
    SITE_URL += "/"

PRIORITY = check_priority(PRIORITY)
FREQUENCY = check_frequency(FREQUENCY)

allFiles = list_files()
all_urls = make_url(allFiles)
make_it(all_urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment