Skip to content

Instantly share code, notes, and snippets.

@catalanojuan
Created October 10, 2012 14:30
Show Gist options
  • Save catalanojuan/3865989 to your computer and use it in GitHub Desktop.
Save catalanojuan/3865989 to your computer and use it in GitHub Desktop.
Generate urls CSV from a bunch of sitemaps.
#!/usr/bin/env python
# -*- encoding: utf8 -*-
import csv
import errno
import os

from BeautifulSoup import BeautifulStoneSoup
import yaml
# Default option values that main() falls back on when
# generator-options.yml does not supply any.
DEFAULTS = {
    # Directory that main() lists; each regular file in it is parsed.
    'sitemaps_dir': 'sitemaps/',
    # Path of the CSV file that main() writes the extracted URLs to.
    'output_file': 'urls.csv',
}
def read_options():
    """Load generator options from ``generator-options.yml``.

    The file name is relative to the current working directory.

    Returns:
        The parsed YAML document (``None`` when the file is empty), or an
        empty dict when the options file does not exist.

    Raises:
        IOError: If the file exists but cannot be opened or read.
        yaml.YAMLError: If the file is not valid YAML (propagated from PyYAML).
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open('generator-options.yml', 'r') as options_file:
            # safe_load replaces the original bare yaml.load(): it builds only
            # plain YAML types (no arbitrary Python objects) and, unlike
            # yaml.load() without a Loader, still works on current PyYAML.
            return yaml.safe_load(options_file)
    except IOError as e:
        # A missing options file is expected and simply means "no overrides".
        if e.errno == errno.ENOENT:
            return {}
        # Anything else (permissions, I/O failure) is a real error; a bare
        # raise keeps the original traceback.
        raise
def main():
    """Write every ``<loc>`` value found in the sitemap files to a CSV file.

    Options come from read_options() layered on top of DEFAULTS:
    ``sitemaps_dir`` is the directory that is listed, and ``output_file`` is
    the CSV that receives one single-column row per ``<loc>`` element.
    Directory entries that are not regular files are skipped.

    Raises:
        OSError: If ``sitemaps_dir`` cannot be listed.
        IOError: If the output file or a sitemap file cannot be opened.
    """
    # Overlay the user's options on a copy of DEFAULTS so that an options
    # file which sets only one key still gets the default for the other
    # (previously this raised KeyError).
    options = dict(DEFAULTS)
    options.update(read_options() or {})

    sitemaps_dir = options['sitemaps_dir']
    # List the directory before creating the output file, as before, so a
    # missing directory fails without leaving an empty CSV behind.
    filenames = os.listdir(sitemaps_dir)

    # ``with`` guarantees the CSV is flushed and closed, even on error.
    with open(options['output_file'], 'w') as csv_file:
        output = csv.writer(csv_file, delimiter=',', quotechar='"')
        for filename in filenames:
            # os.path.join works whether or not sitemaps_dir ends with a
            # separator; plain concatenation did not.
            path = os.path.join(sitemaps_dir, filename)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as sitemap_file:
                soup = BeautifulStoneSoup(sitemap_file.read())
            for loc in soup('loc'):
                # Encoded to UTF-8 bytes before writing, as in the original.
                output.writerow([loc.text.encode('utf-8')])
# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment