Skip to content

Instantly share code, notes, and snippets.

@hay
Created November 6, 2010 21:53
Show Gist options
  • Save hay/665734 to your computer and use it in GitHub Desktop.
Save hay/665734 to your computer and use it in GitHub Desktop.
Download all images in a category on Wikimedia Commons. Requires pywikipedia
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Example arguments: -cat:Images_from_Wiki_Loves_Monuments -target:/path/to/output/dir/
import sys, os
# Set your path to pywikipedia here
sys.path.append("/Users/hay/htdocs/checkouts/pywikipedia/")
import wikipedia, config, pagegenerators
import urllib2, codecs
def downloadFile(imagepage, target):
    """Download one file and its description-page HTML into *target*.

    The file is stored as ``<target>/<title without namespace>`` and the HTML
    returned by ``imagepage.getImagePageHtml()`` is written next to it with
    ``.html`` appended to the file name. A file that already exists locally
    is skipped without contacting the server.

    imagepage -- object providing ``titleWithoutNamespace()``, ``fileUrl()``
                 and ``getImagePageHtml()`` (main() passes a
                 ``wikipedia.ImagePage``).
    target    -- directory to store into; a trailing path separator is
                 optional, and an empty string means the current directory.

    Returns None.
    """
    # os.path.join yields the same path as plain concatenation when target
    # already ends with a separator, and also works when it does not.
    filename = os.path.join(target, imagepage.titleWithoutNamespace())

    # Test the destination *file*, not the target directory: checking the
    # directory would skip every download as soon as the directory exists.
    if os.path.exists(filename):
        print("Target exists: " + filename)
        return

    print("Downloading " + filename)

    # Only hit the network once we know the file is actually needed.
    html = imagepage.getImagePageHtml()
    fileUrl = imagepage.fileUrl()

    # Read the whole image before creating the local file, so a failed
    # transfer does not leave an empty file behind that a later run would
    # mistake for a finished download.
    uo = wikipedia.MyURLopener
    remotefile = uo.open(fileUrl)
    try:
        data = remotefile.read()
    finally:
        remotefile.close()

    # Store the image
    with open(filename, "wb") as localfile:
        localfile.write(data)

    # Store the html
    with codecs.open(filename + u'.html', 'wb', 'utf-8') as localhtmlfile:
        localhtmlfile.write(html)

    print("Done!")
def main():
    """Download every file page selected on the command line from Commons.

    Recognised arguments:
      -target:<dir>  directory passed to downloadFile() for storing files;
                     defaults to the current directory when omitted.
    All other arguments left over by ``wikipedia.handleArgs()`` are handed to
    a ``pagegenerators.GeneratorFactory`` (e.g. ``-cat:<category>``).
    """
    wikipedia.setSite(wikipedia.getSite(u'commons', u'commons'))

    # Default to the current directory; without a default, running the
    # script without -target: fails with an unbound local variable.
    target = ""
    targetPrefix = '-target:'

    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        if arg.startswith(targetPrefix):
            target = arg[len(targetPrefix):]
        else:
            genFactory.handleArg(arg)

    generator = genFactory.getCombinedGenerator()
    if not generator:
        # No page-selecting argument was given, so there is nothing to do.
        print("No pages selected; pass a page generator argument such as -cat:<category>")
        return

    # Get a preloading generator with only images (namespace 6)
    imagesOnly = pagegenerators.NamespaceFilterPageGenerator(generator, [6])
    for page in pagegenerators.PreloadingGenerator(imagesOnly):
        imagepage = wikipedia.ImagePage(page.site(), page.title())
        downloadFile(imagepage, target)
# Script entry point: does nothing when the module is merely imported.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs whether main() returned normally or raised.
        wikipedia.stopme()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment