Panoramio upload bot
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to upload images from Panoramio to Wikimedia Commons.

by Shizhao 2014
"""
import urllib2, re, random
import datetime
from BeautifulSoup import BeautifulSoup
import upload, time, StringIO, hashlib, base64
import tempfile
import pywikibot
import pywikibot.data.api
from pywikibot import config
import sys
from multiprocessing.dummy import Pool as ThreadPool

# Python 2.5+ deletes sys.setdefaultencoding() during start-up; reload sys
# to get it back so the default encoding can be set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

site = pywikibot.Site(u'commons', u'commons')
def cleanUpTitle(title, site, author, project=u'panoramio'):
    '''Clean up the title of a potential MediaWiki page, so that the
    software will accept it as a page title.
    Adapted from flickrripper.py.
    '''
    maxBytes = 240 - len(project.encode('utf-8')) \
                   - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # We may cut off more than needed, but that is acceptable here.
        items = max(min(len(title), maxBytes / 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    title = title.strip()
    title = re.sub(u"^ $", u"Untitled", title)
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    # Return the filename without the "File:" prefix; upload.UploadRobot
    # adds the namespace itself.
    try:
        if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
            if pywikibot.Page(site, u'File:%s - %s - %s.jpg' % (title, project, author)).exists():
                i = 1
                while pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg' % (title, project, author, i)).exists():
                    i += 1
                return u'%s - %s - %s (%d).jpg' % (title, project, author, i)
            else:
                return u'%s - %s - %s.jpg' % (title, project, author)
        else:
            return u'%s - %s.jpg' % (title, project)
    except AttributeError:
        return u'%s - %s.jpg' % (title, project)
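
# A hedged usage sketch (the exact result depends on which filenames already
# exist on Commons; this assumes no collision):
#   cleanUpTitle(u'Sunset: over / the sea', site, u'JohnDoe')
#   -> u'Sunset,_over_-_the_sea - panoramio.jpg'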
def checkcountry(cats):
    '''Return True if any parent category is a country category.'''
    country = False
    for countrycat in cats:
        if u'Countries' in countrycat.title(withNamespace=False):
            country = True
            break
    return country
def checkyear(category):
    '''Return True if the category name is a bare year, e.g. u'2014'.'''
    return category.isdigit()
def checkmetacategories(cats):
    '''Return True if any parent category is a meta category or u'Topics'.'''
    checkmeta = False
    for metacat in cats:
        if u'Meta categories' in metacat.title(withNamespace=False) \
           or metacat.title(withNamespace=False) == u'Topics':
            checkmeta = True
            break
    return checkmeta
def checkmaincategory(categorypage):
    '''Return True if the category page carries a maintenance template.'''
    maincattemp = [u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig']
    checkmain = False
    for temp in categorypage.templates():
        if temp.title(withNamespace=False) in maincattemp:
            checkmain = True
            break
    return checkmain
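
# How the four checks combine (see buildDescription below): a tag is dropped
# when it is a bare year, when a parent category is a country or meta
# category, or when the category page itself carries a template such as
# {{MetaCat}}.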
def buildDescription(Information, site, tags, Location):
    '''Build the final description for the image.'''
    description = Information
    if tags:
        catexists = len(tags)
        catstext = u''
        for category in tags:
            try:
                categorypage = pywikibot.Page(site, u'Category:' + category)
                if categorypage.exists():
                    if categorypage.isCategoryRedirect():
                        target = categorypage.getCategoryRedirectTarget()
                        cats = target.categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists -= 1
                            pywikibot.output(u'remove category: ' + category)
                        elif u'[[Category:' + target.title(withNamespace=False) + u']]\n' not in catstext:
                            catstext += u'[[Category:' + target.title(withNamespace=False) + u']]\n'
                            pywikibot.output(u'RedirectTarget: ' + target.title())
                        else:
                            catexists -= 1
                            pywikibot.output(u'remove category: ' + category + u'. Dupe')
                    else:
                        cats = categorypage.categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists -= 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            catstext += u'[[Category:' + category + u']]\n'
                else:
                    catexists -= 1
                    pywikibot.output(u'[[Category:' + category + u']] does not exist.')
            except:
                # Skip categories that cannot be inspected.
                catexists -= 1
        if catexists == 0:
            # No usable category survived: mark the file as uncategorised.
            if Location == u'':
                description += u'{{subst:unc}}\n'
            else:
                description += u'{{subst:unc|geo=1}}\n'
        else:
            description += u'{{subst:chc}}\n\n' + catstext
    else:
        if Location == u'':
            description += u'{{subst:unc}}\n'
        else:
            description += u'{{subst:unc|geo=1}}\n'
    uploader = config.usernames['commons']['commons']
    description += u'[[Category:Panoramio files uploaded by ' + uploader + u']]\n'
    return description
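
# A hedged sketch of the returned wikitext when one tag survives (u'Beaches'
# is an assumed example; the {{Location}} line is already part of
# Information):
#   {{Information ...}}
#   {{subst:chc}}
#
#   [[Category:Beaches]]
#   [[Category:Panoramio files uploaded by <bot account>]]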
def downloadPhoto(photoUrl=''):
    '''Download the photo and store it in a StringIO.StringIO object.

    TODO: Add exception handling
    '''
    imageFile = urllib2.urlopen(photoUrl).read()
    return StringIO.StringIO(imageFile)
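
# Usage sketch (the URL pattern is the one used in runputbot below):
#   photo = downloadPhoto('http://static.panoramio.com/photos/original/123456.jpg')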
def findDuplicateImages(photo=None,
                        site=pywikibot.Site(u'commons', u'commons')):
    '''Take the photo, calculate the SHA1 hash and ask the MediaWiki API
    for a list of duplicates.

    TODO: Add exception handling, fix site thing
    '''
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
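
# Duplicate-check sketch: getFilesFromAnHash returns the Commons filenames
# that share the SHA1, so an empty list means the image is new:
#   if findDuplicateImages(downloadPhoto(photo_url)):
#       ...  # skip this upload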
start = time.time()
add = 5
page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
text = page.text
#r = int(text) + 1
r = int(text)

for arg in pywikibot.handleArgs():
    if arg:
        if arg.startswith('-start:'):
            r = int(arg[7:])
        elif arg.startswith('-range:'):
            add = int(arg[7:])
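
# Typical invocation (the script name is an assumption; use whatever this
# file is saved as):
#   python panoramio_upload.py -start:100000 -range:5
# Without -start: the bot resumes from User:Panoramio upload bot/status;
# -range: sets how many photo ids to process (default 5).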
def runputbot(i):
    #for i in range(r, r+add):
    site = pywikibot.Site(u'commons', u'commons')
    url = "http://www.panoramio.com/photo/%d" % i
    try:
        page = urllib2.urlopen(url)
        urlverify = True
    except (urllib2.HTTPError, urllib2.URLError), e:
        urlverify = False
        print 'photo id %d' % i, e
        #sec = random.randint(1, 5)
        #pywikibot.output(u"Waiting for %d seconds." % sec)
        #time.sleep(sec)
    while urlverify:
        soup = BeautifulSoup(page)
        license = soup.find('li', attrs={'class': re.compile("^license")})['class']
        #print license
        if license == "license by-sa" or license == "license by":
            pywikibot.output('photo id %d %s is OK! Ready to upload to Commons...' % (i, license))
            photo_url = "http://static.panoramio.com/photos/original/%d.jpg" % i
            #print photo_url
            # Should download the photo only once
            trying = True
            n = 0
            while trying:
                try:
                    photo = downloadPhoto(photo_url)
                    trying = False
                except (urllib2.HTTPError, urllib2.URLError), e:
                    # Back off quadratically before retrying the download.
                    pywikibot.output(u'%s, retrying....' % e)
                    n = n + 1
                    time.sleep(5 * n * n)
            # Don't upload duplicate images; should add an override option
            duplicates = findDuplicateImages(photo)
            if duplicates:
                pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
                urlverify = False
            else:
                # photo title
                title = soup.find("h1", {"id": "photo-title"}).contents[0].strip().rstrip(',')
                # author and author_url
                author = soup.find("a", {"rel": "author"}).contents[0]
                author_url = soup.find("a", {"rel": "author"})['href']
                author_url = 'http://www.panoramio.com' + author_url
                # clean filename
                filename = cleanUpTitle(title, site, author)
                pywikibot.output(filename, toStdout=True)
                #print str(filename.decode('utf-8'))
                # tags
                tags = []
                for tag in soup.findAll(attrs={'id': re.compile("^tag_element")}):
                    tags.append(tag.a.contents[0].strip().rstrip(','))
                try:
                    mapname = soup.find("div", {"id": "map_info_name"}).a.contents[0]
                    if mapname not in tags:
                        tags.append(mapname)
                    print tags
                except AttributeError:
                    print tags
                # date: prefer the EXIF "Taken on" date, fall back to the
                # Panoramio upload date
                try:
                    date = soup.find("li", {"id": "tech-details"}).findNext('ul').find(text=re.compile("^Taken on"))
                    formatdate = datetime.datetime.strptime(date, 'Taken on %Y/%m/%d %X')
                    date = formatdate.strftime('%Y-%m-%d')
                    date = u'{{Taken on|%s}}' % date
                    print date
                except (AttributeError, TypeError):
                    date = soup.find("ul", {"id": "details"}).li.contents[0].strip().rstrip(',')
                    formatdate = datetime.datetime.strptime(date, 'Uploaded on %B %d, %Y')
                    date = formatdate.strftime('%Y-%m-%d')
                    date = u'{{Original upload date|%s}}' % date
                    print 'Taken on not found.', date
                # geo coordinates
                try:
                    lat = soup.find("abbr", {"class": "latitude"})['title']
                    lon = soup.find("abbr", {"class": "longitude"})['title']
                    Location = u'{{Location|%s|%s|source:Panoramio}}' % (lat, lon)
                except TypeError:
                    Location = u''
                if license == "license by-sa":
                    licensetag = u'{{cc-by-sa-3.0|%s}}' % author
                elif license == "license by":
                    licensetag = u'{{cc-by-3.0|%s}}' % author
                reviewer = config.usernames['commons']['commons']
                review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
                #print soup.find("div", {"id": "photo-description-formatted"}).contents[0]
                # append the photo description (if any) to the title text
                try:
                    photo_description = u''.join(unicode(item) for item in soup.find("div", {"id": "photo-description-formatted"}).contents).strip().rstrip(',')
                    if photo_description:
                        title = title + u'\n\n' + photo_description
                except AttributeError:
                    pass
                Information = u'{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, Location)
                #site = pywikibot.Site(u'commons', u'commons')
                Description = buildDescription(Information, site, tags, Location)
                #pywikibot.output(Description)
                bot = upload.UploadRobot(photo_url,
                                         description=Description,
                                         useFilename=filename,
                                         keepFilename=True,
                                         verifyDescription=False,
                                         ignoreWarning=True,
                                         uploadByUrl=True)
                uploadtoo = True
                n = 0
                while uploadtoo:
                    try:
                        bot.upload_image(debug=True)
                        urlverify = False
                        uploadtoo = False
                        #sec = random.randint(1, 10)
                        #pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
                        #time.sleep(sec)
                    except Exception, e:
                        n = n + 1
                        print e
                        #time.sleep(5 * n * n)
        else:
            pywikibot.output('photo id %d %s is invalid! Ignore...' % (i, license))
            #sec = random.randint(1, 5)
            #pywikibot.output(u"Waiting for %d seconds." % sec)
            #time.sleep(sec)
            urlverify = False
# Make the pool of workers and process the photo id range in parallel
pool = ThreadPool(4)
pool.map(runputbot, range(r, r + add))
# close the pool and wait for the work to finish
pool.close()
pool.join()
print "Elapsed Time: %s" % (time.time() - start)

# update upload status
i = r + add
statuspage = pywikibot.Page(site, u"User:Panoramio upload bot/status")
statuspage.text = u'%d' % i
statuspage.save(u"update upload status: %d" % i)