-
-
Save Steinsplitter/e115799e7fc111b71a9135fb7ea72e67 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
upload images of panoramio to wikimedia commons. | |
jl files from https://tools.wmflabs.org/pub/ | |
""" | |
import json | |
import datetime | |
import urllib2,re, random | |
import upload,time, StringIO, hashlib, base64 | |
import tempfile | |
import pywikibot | |
import pywikibot.data.api | |
from pywikibot import config | |
import sys | |
reload(sys)  # Python 2.5+ deletes sys.setdefaultencoding after startup; reloading sys restores it
# NOTE(review): setdefaultencoding is a well-known global hack (Python 2 only);
# it changes implicit str<->unicode coercion process-wide.
sys.setdefaultencoding('utf-8')
# Shared Commons site object used by the top-level script code below.
site = pywikibot.Site(u'commons', u'commons')
def cleanUpTitle(title, site, author, project = u'panoramio'):
    ''' Clean up the title of a potential mediawiki page. Otherwise the title of
    the page might not be allowed by the software.
    from flickrripper.py

    :param title: raw photo title from Panoramio
    :param site: pywikibot site to check existing filenames on (falls back
        to Wikimedia Commons when falsy)
    :param author: photo author, used as a disambiguation suffix
    :param project: project suffix appended to every generated filename
    :return: a free (non-existing) filename ending in ".jpg"
    '''
    # Keep "<title> - <project> - <author>.jpg" within MediaWiki's title
    # byte limit; reserve room for the suffixes added below.
    maxBytes = 240 - len(project.encode('utf-8')) \
               - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # maybe we cut more than needed, anyway we do it
        # (// keeps integer division on both Python 2 and 3)
        items = max(min(len(title), maxBytes // 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    # Normalize characters MediaWiki forbids or that read badly in titles.
    title = title.strip()
    title = re.sub(u"^ $", u"Untitled", title)
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    title = title.replace(u"|", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    try:
        if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
            # Base name is taken: disambiguate with the author, then with a
            # numeric counter.  (The original wrapped these in `while True:`
            # loops whose bodies always returned on the first pass.)
            if pywikibot.Page(site, u'File:%s - %s - %s.jpg' % (title, project, author)).exists():
                i = 1
                while pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg' % (title, project, author, i)).exists():
                    i += 1
                return u'%s - %s - %s (%d).jpg' % (title, project, author, i)
            return u'%s - %s - %s.jpg' % (title, project, author)
        return u'%s - %s.jpg' % (title, project)
    except Exception:
        # BUG FIX: was a bare `except:` (also swallowed KeyboardInterrupt).
        # Fallback path: disambiguate numerically without the author part
        # (e.g. when the author string makes the title invalid).
        if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
            i = 1
            while pywikibot.Page(site, u'File:%s - %s (%d).jpg' % (title, project, i)).exists():
                i += 1
            return u'%s - %s (%d).jpg' % (title, project, i)
        return u'%s - %s.jpg' % (title, project)
def checkcountry(cats):
    '''Return True when any parent category title contains u'Countries'
    (i.e. the tag resolves to a country meta category and should be removed).

    BUG FIX: the original left `country` unbound when `cats` was empty,
    raising UnboundLocalError; an empty list now returns False.
    '''
    for countrycat in cats:
        if u'Countries' in countrycat.title(withNamespace=False):
            return True
    return False
def checkyear(category):
    '''Return True when the tag is purely numeric (e.g. u'2013'), so that
    bare year categories can be filtered out.
    '''
    return category.isdigit()
def checkmetacategories(cats):
    '''Return True when any parent category marks this tag as a meta
    category (title contains u'Meta categories' or equals u'Topics').
    Returns False for an empty `cats`.

    BUG FIX: the original condition
        u'Meta categories' not in t or t <> u'Topics'
    is true for EVERY title (no title both contains 'Meta categories' and
    equals 'Topics'), so the function could never return True; it also
    used the Python-2-only `<>` operator and left `checkmeta` unbound on
    empty input.  The intended test is the De Morgan complement below.
    '''
    for metacat in cats:
        metatitle = metacat.title(withNamespace=False)
        if u'Meta categories' in metatitle or metatitle == u'Topics':
            return True
    return False
def blackcategories(cats):
    '''Return True when any parent category is on the blacklist of
    maintenance/meta categories whose members should not be used as tags.
    Returns False for an empty `cats`.

    BUG FIX: the original list was missing a comma between
    u'Meta categories' and u'Categories requiring temporary diffusion',
    so Python concatenated them into one never-matching string literal.
    It also left `checkblack` unbound on empty input.
    '''
    blacklist = [u'Creativity',
                 u'Meta categories',
                 u'Categories requiring temporary diffusion',
                 u'Categories requiring permanent diffusion',
                 u'Categories requiring permanent diffusion to zero',
                 u'Disambiguation']
    for blackcat in cats:
        if blackcat.title(withNamespace=False) in blacklist:
            return True
    return False
def checkmaincategory(categorypage):
    '''Return True when the category page transcludes one of the
    maintenance templates ({{MetaCat}}, {{CatDiffuse}}, ...) that mark it
    as unsuitable for direct file categorization.

    BUG FIX: the original left `checkmain` unbound (UnboundLocalError)
    when the page transcluded no templates at all; that case now returns
    False.
    '''
    maincattemp = [u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig']
    for temp in categorypage.templates():
        if temp.title(withNamespace=False) in maincattemp:
            return True
    return False
def buildDescription(Information, site, tags, Location):
    ''' Build the final description for the image.

    :param Information: pre-built {{Information}} wikitext block
    :param site: pywikibot site used to resolve category pages
    :param tags: list of Panoramio tag strings, used as candidate categories
    :param Location: {{Location}} wikitext, or u'' when no coordinates exist
    :return: full wikitext: Information + category/maintenance templates
        + the bot's tracking category
    '''
    # NOTE(review): `uploder` (sic) is recomputed again before the final
    # category line below; this first assignment is redundant.
    uploder = config.usernames['commons']['commons']
    description = Information
    catexists=len(tags)
    if tags:
        # `catexists` counts how many tags survive as usable categories;
        # each rejected/duplicate/missing tag decrements it.
        catexists=len(tags)
        catstext = u''
        for category in tags:
            try:
                categorypage=pywikibot.Page(site, u'Category:' + category)
                if categorypage.exists():
                    if categorypage.isCategoryRedirect():
                        # Follow the redirect and vet the TARGET's parents.
                        cats=categorypage.getCategoryRedirectTarget().categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage) or blackcategories(cats):
                            catexists = catexists -1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            # Skip the target if it was already collected.
                            if u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n' not in catstext:
                                catstext = catstext + u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n'
                                pywikibot.output(u'RedirectTarget: '+categorypage.getCategoryRedirectTarget().title())
                            else:
                                catexists = catexists -1
                                pywikibot.output(u'remove category: ' + category + '. Dupe')
                    else:
                        # Ordinary category: vet its own parents.
                        cats=categorypage.categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage) or blackcategories(cats):
                            catexists = catexists -1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            catstext = catstext + u'[[Category:' + category + ']]\n'
                else:
                    catexists = catexists -1
                    pywikibot.output(u'[[Category:' + category + ']]' + u' not exists.')
            except:
                # NOTE(review): bare except silently drops the tag on ANY
                # error (network, API, bugs in the check helpers).
                catexists = catexists -1
        if catexists == 0:
            # No usable category survived: tag as uncategorized
            # ({{subst:unc|geo=1}} when coordinates are available).
            if Location ==u'':
                description = description + u'{{subst:unc}}\n'
            else:
                description = description + u'{{subst:unc|geo=1}}\n'
        else:
            # {{subst:chc}} = "check categories" marker plus the collected list.
            description = description+u'{{subst:chc}}\n\n' + catstext
    else:
        # No tags at all: uncategorized.
        if Location ==u'':
            description = description + u'{{subst:unc}}\n'
        else:
            description = description + u'{{subst:unc|geo=1}}\n'
    # Tracking category named after the uploading bot account.
    uploder = config.usernames['commons']['commons']
    description = description + u'[[Category:Panoramio files uploaded by ' + uploder + ']]\n'
    return description
def downloadPhoto(photoUrl=''):
    '''Fetch `photoUrl` and return its body wrapped in a StringIO.StringIO
    object (an in-memory, file-like buffer).

    TODO: Add exception handling for network failures.
    '''
    response = urllib2.urlopen(photoUrl)
    return StringIO.StringIO(response.read())
def findDuplicateImages(photo=None, site=None):
    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
    for a list of duplicates.

    :param photo: file-like object supporting getvalue() (e.g. StringIO)
    :param site: pywikibot site to query; defaults to Wikimedia Commons
    :return: whatever site.getFilesFromAnHash returns for the SHA1 hash
    TODO: Add exception handling

    BUG FIX: the original default `site=pywikibot.Site(...)` was evaluated
    once at module import time (eager config/network access and a shared
    default object); the site is now created lazily at call time.
    '''
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    # The API expects the hash in base-16 (hex) form.
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
def runputbot(jl):
    '''Process one parsed .jl record: when the Panoramio photo is licensed
    CC-BY or CC-BY-SA, download it, skip SHA1 duplicates, build the Commons
    description page and upload the file.  Other licenses are ignored.

    :param jl: dict parsed from one JSON line; keys used here: license,
        photoid, title, author, author_url, tags, take_date/upload_date,
        lat/lon, desc (each mapping to a list whose first item is read).
    '''
    #for i in range(r,r+add):
    site = pywikibot.Site(u'commons', u'commons')
    #url="http://www.panoramio.com/photo/%d" % i
    #try:
    #    page = urllib2.urlopen(url)
    #    urlverify=True
    #except:
    #    urlverify=False
    #    print 'photo id %d' % i
    #sec=random.randint(1, 5)
    #pywikibot.output(u"Waiting for %d seconds." % sec)
    #time.sleep(sec)
    #while urlverify:
    #soup = BeautifulSoup(page)
    # NOTE(review): `license` shadows the builtin of the same name.
    license = jl[u'license'][0]
    #print license
    i = jl[u'photoid'][0]
    # Only CC-BY-SA and CC-BY images may be transferred to Commons.
    if license=="license by-sa" or license== "license by":
        pywikibot.output('photo id %s %s is OK! Ready upload Commons...' % (i, license))
        photo_url = "http://static.panoramio.com/photos/original/%s.jpg" % i
        #print photo_url
        #Should download the photo only once
        # Download with up to 3 attempts and quadratic (n*n seconds) back-off.
        trying = 1
        n=0
        while trying:
            try:
                photo = downloadPhoto(photo_url)
                trying=0
            except:
                pywikibot.output(' try....')
                n=n+1
                time.sleep(n*n)
                if n>2:
                    trying=0
        # NOTE(review): if all attempts fail, `photo` is unbound and the
        # findDuplicateImages() call below raises NameError.
        #Don't upload duplicate images, should add override option
        duplicates = findDuplicateImages(photo)
        if duplicates:
            pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
            #urlverify=False
        else:
            #photo title
            title=jl[u'title'][0].strip().lstrip().rstrip(',')
            #author and author _url
            author=jl[u'author'][0].strip().lstrip().rstrip(',')
            author_url = jl[u'author_url'][0]
            author_url= 'http://www.panoramio.com' + author_url
            #clean filename
            filename=cleanUpTitle(title, site, author)
            #pywikibot.output(filename,toStdout=True)
            #print str(filename.decode('utf-8'))
            #tags
            tags=jl[u'tags']
            #for tag in jl[u'tags']:
            #    tags.append(tag.a.contents[0].strip().lstrip().rstrip(','))
            #try:
            #    mapname=soup.find("div", { "id" : "map_info_name" }).a.contents[0]
            #    if mapname not in tags:
            #        tags.append(mapname)
            #        print tags
            #except:
            #    print tags
            #date
            # Prefer the "taken on" date; fall back to the upload date
            # (different source formats, hence the two strptime patterns).
            try:
                date=jl[u'take_date'][0]
                formatdate=datetime.datetime.strptime(date,'%Y/%m/%d %X')
                date = formatdate.strftime('%Y-%m-%d')
                date=u'{{Taken on|%s}}' % date
                print date
            except:
                date=jl[u'upload_date'][0]
                formatdate=datetime.datetime.strptime(date,'%B %d, %Y')
                date = formatdate.strftime('%Y-%m-%d')
                date =u'{{Original upload date|%s}}' % date
                print 'take on not found. Upload on', date
            #geo
            # Coordinates are optional; missing keys leave Location empty.
            try:
                lat =jl[u'lat'][0]
                lon = jl[u'lon'][0]
                Location = u'{{Location|%s|%s|source:Panoramio}}' %(lat, lon)
            except:
                Location = u''
            if license == "license by-sa":
                licensetag = u'{{cc-by-sa-3.0|%s}}' % author
            elif license == "license by":
                licensetag = u'{{cc-by-3.0|%s}}' % author
            # License review template signed with the bot account name
            # (underscores displayed as spaces).
            reviewerraw = config.usernames['commons']['commons']
            reviewer = reviewerraw.replace("_", " ")
            review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
            #print soup.find("div", { "id" : "photo-description-formatted" }).contents[0]
            # Append the free-text description (if any) to the title text
            # used in the {{Information}} description field.
            try:
                photo_description = jl[u'desc'][0].strip().lstrip().rstrip(',')
                if photo_description:
                    title=title+u'\n\n'+ photo_description
                else:
                    pass
            except:
                pass
            url="http://www.panoramio.com/photo/%s" % i
            Information=u'{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, Location)
            #site = pywikibot.Site(u'commons', u'commons')
            Description = buildDescription(Information, site, tags, Location)
            #pywikibot.output(Description)
            bot = upload.UploadRobot(photo_url,
                                     description=Description,
                                     useFilename=filename,
                                     keepFilename=True,
                                     verifyDescription=False,
                                     ignoreWarning=True,uploadByUrl=True)
            # Upload with up to 3 attempts and quadratic back-off,
            # mirroring the download retry loop above.
            uploadtoo = 1
            n=0
            while uploadtoo:
                try:
                    bot.upload_image(debug=True)
                    #urlverify=False
                    uploadtoo = 0
                    #sec=random.randint(1, 10)
                    #pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
                    #time.sleep(sec)
                    print "id =", i
                except:
                    n=n+1
                    #print e.code
                    if n <3:
                        time.sleep(n*n)
                    else:
                        uploadtoo = 0
                        print "id =", i
    else:
        pywikibot.output('photo id %s %s is invalid! Ignore...' % (i, license))
        #sec=random.randint(1, 5)
        #pywikibot.output(u"Waiting for %d seconds." % sec)
        #time.sleep(sec)
        #urlverify=False
# --- script entry point -------------------------------------------------
start = time.time()  # wall-clock timer for the elapsed-time report below
#add=5
#page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
#text = page.text
#r=int(text)+1
#r=int(text)
# Parse the command line: -jl:<path> names the JSON-Lines (.jl) dump file.
for arg in pywikibot.handleArgs():
    if arg:
        if arg.startswith('-jl:'):
            jl = arg[4:]
#        elif arg.startswith('-range:'):
#            add = int(arg[7:])
# NOTE(review): if no -jl: argument was given, `jl` is unbound and the
# open() below raises NameError.  `jl` is then rebound per line to the
# parsed record passed into runputbot().
with open(jl, 'r') as f:
    for jsonline in f:
        jl=json.loads(jsonline)
        try:
            runputbot(jl)
        except:
            # NOTE(review): bare except reduces any per-record failure to
            # a single "error" line, hiding the traceback.
            print "error"
print "Elapsed Time: %s" % (time.time() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment