Skip to content

Instantly share code, notes, and snippets.

@shizhao
Last active December 27, 2016 08:07
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shizhao/83051e70361b7ebcaebc to your computer and use it in GitHub Desktop.
Save shizhao/83051e70361b7ebcaebc to your computer and use it in GitHub Desktop.
Panoramio upload bot
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to upload images of panoramio to wikimedia commons.
by Shizhao 2014
"""
import urllib2,re, random
import datetime
from BeautifulSoup import BeautifulSoup
import upload,time, StringIO, hashlib, base64
import tempfile
import pywikibot
import pywikibot.data.api
from pywikibot import config
import sys
from multiprocessing.dummy import Pool as ThreadPool
reload(sys) # Python 2.5's site initialization deletes sys.setdefaultencoding; reload sys to restore it.
sys.setdefaultencoding('utf-8')
# Shared Commons site object used by the module-level script code below.
site = pywikibot.Site(u'commons', u'commons')
def cleanUpTitle(title, site, author, project = u'panoramio'):
    '''Clean up the title of a potential mediawiki page and pick a free filename.

    Otherwise the title of the page might not be allowed by the software.
    Adapted from flickrripper.py.

    @param title: raw photo title scraped from Panoramio
    @param site: pywikibot Site to check against (commons is used if falsy)
    @param author: photo author, appended to disambiguate duplicate names
    @param project: suffix identifying the source project
    @return: a filename that does not yet exist on the wiki
    '''
    # Budget the title so the final filename stays within MediaWiki's
    # byte limit once " - project" / " - author" suffixes are appended.
    maxBytes = 240 - len(project.encode('utf-8')) \
                   - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # maybe we cut more than needed, anyway we do it
        # BUG FIX: use floor division so the slice index stays an int
        # (plain / yields a float under true division).
        items = max(min(len(title), maxBytes // 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    title = title.strip()
    if not title:
        # BUG FIX: the old re.sub(u"^ $", ...) could never match the
        # already-stripped (hence empty) title, so the "Untitled"
        # fallback was dead code.
        title = u"Untitled"
    # Replace characters MediaWiki forbids or mangles in page titles.
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    try:
        if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
            # Base name is taken: try "- author", then "- author (n)".
            if pywikibot.Page(site, u'File:%s - %s - %s.jpg' % (title, project, author)).exists():
                i = 1
                while pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg' % (title, project, author, i)).exists():
                    i += 1
                return u'%s - %s - %s (%d).jpg' % (title, project, author, i)
            return u'%s - %s - %s.jpg' % (title, project, author)
        # NOTE(review): unlike the disambiguated branches above, this path
        # (and the AttributeError fallback) returns the name WITH the
        # 'File:' namespace prefix.  Kept byte-for-byte for backward
        # compatibility, but callers feeding this to UploadRobot should
        # confirm which form they expect.
        return u'File:%s - %s.jpg' % (title, project)
    except AttributeError:
        return u'File:%s - %s.jpg' % (title, project)
def checkcountry(cats):
    '''Return True if any category in *cats* is a country category.

    Used to filter out country categories ("remove country category").

    @param cats: iterable of category pages exposing title(withNamespace=...)
    @return: True when some category title contains u'Countries'
    '''
    # BUG FIX: 'country' was unbound (UnboundLocalError) when cats is empty;
    # an empty list now yields False.
    country = False
    for countrycat in cats:
        if u'Countries' in countrycat.title(withNamespace=False):
            country = True
            break
    return country
def checkyear(category):
    '''Return True when the category name is purely numeric (a year).

    Used to filter out bare year categories ("remove year category").
    '''
    # str.isdigit already yields exactly the boolean we need.
    return category.isdigit()
def checkmetacategories(cats):
    '''Return True if any parent category marks this as a meta category.

    A parent whose title contains u'Meta categories' or is exactly
    u'Topics' flags the category for removal ("remove Meta categories").

    @param cats: iterable of category pages exposing title(withNamespace=...)
    '''
    # BUG FIX: 'checkmeta' was unbound for an empty cats list.
    checkmeta = False
    for metacat in cats:
        name = metacat.title(withNamespace=False)
        # BUG FIX: the original condition used `or` (with Py2-only `<>`),
        # which made it a tautology — the function could never return True.
        # Per the docstring intent, match 'Meta categories' OR 'Topics'.
        if u'Meta categories' in name or name == u'Topics':
            checkmeta = True
            break
    return checkmeta
def checkmaincategory(categorypage):
    '''Return True if the category page carries a maintenance template.

    Pages tagged with any of MetaCat / CatDiffuse / Categorise / CatCat /
    Disambig should not receive files directly ("remove main category").

    @param categorypage: page object exposing templates()
    '''
    maincattemp = [u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig']
    # BUG FIX: 'checkmain' was unbound when the page has no templates.
    checkmain = False
    for temp in categorypage.templates():
        if temp.title(withNamespace=False) in maincattemp:
            checkmain = True
            break
    return checkmain
def buildDescription(Information, site, tags, Location):
    '''Build the final description for the image.

    Resolves each Panoramio tag against an existing Commons category
    (following category redirects), drops country/year/meta/maintenance
    categories, and appends either the collected category links plus
    {{subst:chc}} or an {{subst:unc}} "uncategorized" marker.

    @param Information: the {{Information}} template text already built
    @param site: pywikibot Site used for category lookups
    @param tags: list of tag strings scraped from the photo page
    @param Location: the {{Location}} template text, or u'' if no geodata
    @return: the complete wikitext description
    '''
    description = Information
    # catexists counts the tags that survive as usable categories.
    catexists = len(tags)
    if tags:
        catstext = u''
        for category in tags:
            try:
                categorypage = pywikibot.Page(site, u'Category:' + category)
                if categorypage.exists():
                    if categorypage.isCategoryRedirect():
                        # Follow the redirect and vet the TARGET's parents.
                        cats = categorypage.getCategoryRedirectTarget().categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists -= 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            if u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n' not in catstext:
                                catstext = catstext + u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n'
                                pywikibot.output(u'RedirectTarget: ' + categorypage.getCategoryRedirectTarget().title())
                            else:
                                # Two tags redirected to the same target.
                                catexists -= 1
                                pywikibot.output(u'remove category: ' + category + '. Dupe')
                    else:
                        cats = categorypage.categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists -= 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            catstext = catstext + u'[[Category:' + category + ']]\n'
                else:
                    catexists -= 1
                    pywikibot.output(u'[[Category:' + category + ']]' + u' not exists.')
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.  Best-effort skip is kept.
                catexists -= 1
        if catexists == 0:
            if Location == u'':
                description = description + u'{{subst:unc}}\n'
            else:
                description = description + u'{{subst:unc|geo=1}}\n'
        else:
            description = description + u'{{subst:chc}}\n\n' + catstext
    else:
        if Location == u'':
            description = description + u'{{subst:unc}}\n'
        else:
            description = description + u'{{subst:unc|geo=1}}\n'
    # Tracking category for this uploader account.
    uploder = config.usernames['commons']['commons']
    description = description + u'[[Category:Panoramio files uploaded by ' + uploder + ']]\n'
    return description
def downloadPhoto(photoUrl = ''):
    '''
    Fetch the photo at *photoUrl* and return its bytes wrapped in a
    StringIO.StringIO object.
    TODO: Add exception handling
    '''
    response = urllib2.urlopen(photoUrl)
    return StringIO.StringIO(response.read())
def findDuplicateImages(photo=None, site=None):
    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
    for a list of duplicates.

    @param photo: file-like object exposing getvalue() (e.g. StringIO)
    @param site: pywikibot Site; defaults to Wikimedia Commons
    @return: whatever site.getFilesFromAnHash returns for the SHA1
    TODO: Add exception handling
    '''
    # BUG FIX: the default used to be site=pywikibot.Site(...), which is
    # evaluated once at import time — creating a Site (config access,
    # possible network) as a side effect of merely importing this module.
    # Resolve the default lazily instead; callers that passed a site, or
    # nothing, see identical behaviour.
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
# --- Script setup: determine the photo-id range to process this run. ---
start = time.time()  # wall-clock start, reported at the end of the run
add=5  # default number of consecutive photo ids to try
# The bot persists the next Panoramio photo id on a wiki status page.
page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
text = page.text
#r=int(text)+1
r=int(text)
# Command-line overrides: -start:<first id> and -range:<count>.
for arg in pywikibot.handleArgs():
    if arg:
        if arg.startswith('-start:'):
            r = int(arg[7:])
        elif arg.startswith('-range:'):
            add = int(arg[7:])
def runputbot(i):
#for i in range(r,r+add):
site = pywikibot.Site(u'commons', u'commons')
url="http://www.panoramio.com/photo/%d" % i
try:
page = urllib2.urlopen(url)
urlverify=True
except (urllib2.HTTPError, urllib2.URLError), e:
urlverify=False
print 'photo id %d' % i, e
#sec=random.randint(1, 5)
#pywikibot.output(u"Waiting for %d seconds." % sec)
#time.sleep(sec)
while urlverify:
soup = BeautifulSoup(page)
license = soup.find('li', attrs={'class' : re.compile("^license")})['class']
#print license
if license=="license by-sa" or license== "license by":
pywikibot.output('photo id %d %s is OK! Ready upload Commons...' % (i, license))
photo_url = "http://static.panoramio.com/photos/original/%d.jpg" % i
#print photo_url
#Should download the photo only once
trying = True
n=0
while trying:
try:
photo = downloadPhoto(photo_url)
trying=False
except (urllib2.HTTPError,urllib2.URLError), e:
pywikibot.output(e.code + ' try....')
n=n+1
time.sleep(5*n*n)
#Don't upload duplicate images, should add override option
duplicates = findDuplicateImages(photo)
if duplicates:
pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
urlverify=False
else:
#photo title
title=soup.find("h1", { "id" : "photo-title" }).contents[0].strip().lstrip().rstrip(',')
#author and author _url
author=soup.find("a", { "rel" : "author" }).contents[0]
author_url = soup.find("a", { "rel" : "author" })['href']
author_url= 'http://www.panoramio.com' + author_url
#clean filename
filename=cleanUpTitle(title, site, author)
pywikibot.output(filename,toStdout=True)
#print str(filename.decode('utf-8'))
#tags
tags=[]
for tag in soup.findAll(attrs={'id' : re.compile("^tag_element")}):
tags.append(tag.a.contents[0].strip().lstrip().rstrip(','))
try:
mapname=soup.find("div", { "id" : "map_info_name" }).a.contents[0]
if mapname not in tags:
tags.append(mapname)
print tags
except AttributeError:
print tags
#date
try:
date=soup.find("li", { "id" : "tech-details" }).findNext('ul').find(text=re.compile("^Taken on"))
formatdate=datetime.datetime.strptime(date,'Taken on %Y/%m/%d %X')
date = formatdate.strftime('%Y-%m-%d')
date=u'{{Taken on|%s}}' % date
print date
except (AttributeError, TypeError):
date=soup.find("ul", { "id" : "details" }).li.contents[0].strip().lstrip().rstrip(',')
formatdate=datetime.datetime.strptime(date,'Uploaded on %B %d, %Y')
date = formatdate.strftime('%Y-%m-%d')
date =u'{{Original upload date|%s}}' % date
print 'take on not found.', date
#geo
try:
lat =soup.find("abbr", { "class" : "latitude" })['title']
lon = soup.find("abbr", { "class" : "longitude" })['title']
Location = u'{{Location|%s|%s|source:Panoramio}}' %(lat, lon)
except TypeError:
Location = u''
if license == "license by-sa":
licensetag = u'{{cc-by-sa-3.0|%s}}' % author
elif license == "license by":
licensetag = u'{{cc-by-3.0|%s}}' % author
reviewer = config.usernames['commons']['commons']
review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
#print soup.find("div", { "id" : "photo-description-formatted" }).contents[0]
try:
photo_description = u''.join(unicode(item) for item in soup.find("div", { "id" : "photo-description-formatted" }).contents).strip().lstrip().rstrip(',')
if photo_description:
title=title+u'\n\n'+ photo_description
else:
pass
except AttributeError:
pass
Information=u'{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, Location)
#site = pywikibot.Site(u'commons', u'commons')
Description = buildDescription(Information, site, tags, Location)
#pywikibot.output(Description)
bot = upload.UploadRobot(photo_url,
description=Description,
useFilename=filename,
keepFilename=True,
verifyDescription=False,
ignoreWarning=True,uploadByUrl=True)
uploadtoo = True
n=0
while uploadtoo:
try:
bot.upload_image(debug=True)
urlverify=False
uploadtoo = False
#sec=random.randint(1, 10)
#pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
#time.sleep(sec)
except e:
n=n+1
print e.code
#time.sleep(5*n*n)
else:
pywikibot.output('photo id %d %s is invalid! Ignore...' % (i, license))
#sec=random.randint(1, 5)
#pywikibot.output(u"Waiting for %d seconds." % sec)
#time.sleep(sec)
urlverify=False
# Make the Pool of workers: 4 threads, one photo id per task.
pool = ThreadPool(4)
pool.map(runputbot, range(r,r+add))
#close the pool and wait for the work to finish
pool.close()
pool.join()
print "Elapsed Time: %s" % (time.time() - start)
#update upload status
# Persist the next photo id on the wiki status page so the following
# run resumes where this one stopped.
i=r+add
statuspage = pywikibot.Page(site, u"User:Panoramio upload bot/status")
statuspage.text = u'%d' % i
statuspage.save(u"update upload status: %d" % i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment