Last active
July 30, 2021 01:14
-
-
Save shizhao/cdc442580bdba6b63582e8511d9162cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Panoramio -> Wikimedia Commons upload bot (Python 2 / pywikibot).
import json
import datetime
import urllib2,re, random
import scripts.upload,time, StringIO, hashlib, base64
import tempfile
import pywikibot
import pywikibot.data.api
from pywikibot import config
import sys
reload(sys) # Python 2.5 removes sys.setdefaultencoding after start-up; reload(sys) restores it
sys.setdefaultencoding('utf-8')
# Default working site; several functions below also create their own.
site = pywikibot.Site(u'commons', u'commons')
def cleanUpTitle(title, site, author, project = u'panoramio'):
    ''' Clean up the title of a potential mediawiki page. Otherwise the title
    of the page might not be allowed by the software.
    Adapted from flickrripper.py.

    title   -- raw photo title from Panoramio
    site    -- pywikibot Site used for the existence check (a fresh Commons
               site is created when falsy)
    author  -- author name, only used to budget the filename length
    project -- suffix appended to every filename

    Returns a unique u'<title> - <project>.jpg' (or u'... (<n>).jpg') file
    name that does not yet exist on the wiki.
    '''
    # MediaWiki limits titles to 255 bytes; keep room for author/project text.
    maxBytes = 240 - len(project.encode('utf-8')) \
                   - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # maybe we cut more than needed, anyway we do it
        # (// keeps integer semantics identical under Python 2 and 3)
        items = max(min(len(title), maxBytes // 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    title = title.strip()
    # BUG FIX: the original used re.sub(u"^ $", u"from panoramio", title),
    # which can never match -- after strip() the title is never a single
    # space.  Fall back explicitly when the title ends up empty.
    if not title:
        title = u"from panoramio"
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!*]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    title = title.replace(u"|", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    # Disambiguate with an increasing " (n)" suffix until the name is free.
    if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
        i = 1
        while True:
            if pywikibot.Page(site, u'File:%s - %s (%d).jpg' % (title, project, i)).exists():
                i += 1
            else:
                return u'%s - %s (%d).jpg' % (title, project, i)
    else:
        return u'%s - %s.jpg' % (title, project)
#def checkcountry(cats): | |
'''remove country category''' | |
# for countrycat in cats: | |
# if u'Countries' not in countrycat.title(withNamespace=False): | |
# country = False | |
# else: | |
# country = True | |
# break | |
# return country | |
def checkyear(category):
    '''Return True when the category name is purely numeric (e.g. u'2012'),
    so bare year categories can be dropped from the upload.'''
    return category.isdigit()
#def checkmetacategories(cats): | |
'''remove Meta categories''' | |
# for metacat in cats: | |
# if u'Meta categories' not in metacat.title(withNamespace=False) or metacat.title(withNamespace=False) <> u'Topics': | |
# checkmeta = False | |
# else: | |
# checkmeta = True | |
# break | |
# return checkmeta | |
def blackcategories(cats):
    '''Return True when any category in *cats* is a blacklisted meta
    category, i.e. the image should not be filed under it.

    cats -- iterable of pywikibot Category pages (anything exposing
            title(withNamespace=False)).

    BUG FIX: the original loop left ``checkblack`` unbound (UnboundLocalError)
    when *cats* was empty; an empty list now simply returns False.
    '''
    blacklist = frozenset([
        u'Creativity', u'Meta categories',
        u'Categories requiring temporary diffusion',
        u'Categories requiring permanent diffusion',
        u'Categories requiring permanent diffusion to zero',
        u'Disambiguation', u'Categories for discussion',
        u'Hidden categories', u'Topics', u'CommonsRoot',
        u'Categories', u'Idukki district',
    ])
    return any(cat.title(withNamespace=False) in blacklist for cat in cats)
#def checkmaincategory(categorypage): | |
'''remove main category''' | |
# templates = categorypage.templates() | |
# maincattemp=[u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig'] | |
# for temp in templates: | |
# if temp.title(withNamespace=False) not in maincattemp: | |
# checkmain = False | |
# else: | |
# checkmain = True | |
# break | |
# return checkmain | |
def buildDescription(Information, site, tags, Location):
    ''' Build the final description for the image.

    Information -- {{Information}} wikitext already assembled by the caller
    site        -- pywikibot Site used to resolve category pages
    tags        -- list of Panoramio tag strings (candidate category names)
    Location    -- {{Location}} wikitext, u'' when the photo has no geo data

    Each tag that exists as a (possibly redirected) category and is neither a
    bare year nor blacklisted becomes a [[Category:...]] link.  When no tag
    survives, an "uncategorized" maintenance tag is appended instead; a
    per-uploader tracking category always closes the description.
    '''
    description = Information
    catexists = len(tags)  # count of tags still usable as categories
    if tags:
        catstext = u''
        for category in tags:
            try:
                categorypage = pywikibot.Page(site, u'Category:' + category)
                if categorypage.exists():
                    if categorypage.isCategoryRedirect():
                        target = categorypage.getCategoryRedirectTarget()
                        cats = target.categories()
                        if checkyear(category) or blackcategories(cats):
                            catexists = catexists - 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            cattext = u'[[Category:' + target.title(withNamespace=False) + ']]\n'
                            if cattext not in catstext:
                                catstext = catstext + cattext
                                pywikibot.output(u'RedirectTarget: ' + target.title())
                            else:
                                # Two tags redirected to the same target.
                                catexists = catexists - 1
                                pywikibot.output(u'remove category: ' + category + '. Dupe')
                    else:
                        cats = categorypage.categories()
                        if checkyear(category) or blackcategories(cats):
                            catexists = catexists - 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            catstext = catstext + u'[[Category:' + category + ']]\n'
                else:
                    catexists = catexists - 1
                    pywikibot.output(u'[[Category:' + category + ']]' + u' not exists.')
            except Exception:
                # Was a bare except: narrowed so KeyboardInterrupt/SystemExit
                # are no longer swallowed; any API failure just drops the tag.
                catexists = catexists - 1
        if catexists == 0:
            if Location == u'':
                description = description + u'{{subst:unc}}\n'
            else:
                description = description + u'{{subst:unc|geo=1}}\n'
        else:
            description = description + u'{{subst:chc}}\n\n' + catstext
    else:
        if Location == u'':
            description = description + u'{{subst:unc}}\n'
        else:
            description = description + u'{{subst:unc|geo=1}}\n'
    uploder = config.usernames['commons']['commons']
    description = description + u'[[Category:Panoramio files uploaded by ' + uploder + ']]\n'
    return description
def downloadPhoto(photoUrl = ''):
    '''
    Fetch the image at photoUrl and return it wrapped in a
    StringIO.StringIO buffer.
    TODO: Add exception handling
    '''
    response = urllib2.urlopen(photoUrl)
    return StringIO.StringIO(response.read())
def findDuplicateImages(photo=None, site=None):
    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
    for a list of duplicates.

    photo -- StringIO/BytesIO-like buffer exposing getvalue()
    site  -- pywikibot Site; defaults to Commons when None.

    BUG FIX: the original default ``site=pywikibot.Site(...)`` was evaluated
    once at import time (side-effectful default argument); a None sentinel
    defers site creation to call time.
    TODO: Add exception handling
    '''
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    # MediaWiki stores file hashes as base-16 (uppercase hex) SHA1.
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
def imagesize(photo):
    '''Return the size in bytes of the buffered photo.

    The original relied on the non-standard ``.len`` attribute of Python 2
    StringIO objects; measuring getvalue() is equivalent and works with any
    StringIO/BytesIO-like buffer.
    '''
    return len(photo.getvalue())
def runputbot(jl):
    '''Upload one Panoramio photo to Commons from a parsed JSON record.

    *jl* appears to map unicode keys (u'license', u'photoid', u'title',
    u'author', u'author_url', u'tags', u'take_date'/u'upload_date',
    u'lat'/u'lon', u'desc') to single-element lists -- TODO confirm against
    the scraper that produces the JSON-lines input.

    A photo is uploaded only when its license is CC-BY or CC-BY-SA, it has
    geo coordinates, it is not already on Commons (SHA1 duplicate check)
    and it is larger than 250 KiB.
    '''
    #for i in range(r,r+add):
    site = pywikibot.Site(u'commons', u'commons')
    # Leftovers from an older approach that fetched and parsed the photo
    # page directly with BeautifulSoup:
    #url="http://www.panoramio.com/photo/%d" % i
    #try:
    #    page = urllib2.urlopen(url)
    #    urlverify=True
    #except:
    #    urlverify=False
    #    print 'photo id %d' % i
    #sec=random.randint(1, 5)
    #pywikibot.output(u"Waiting for %d seconds." % sec)
    #time.sleep(sec)
    #while urlverify:
    #soup = BeautifulSoup(page)
    license = jl[u'license'][0]  # NOTE: shadows the built-in 'license'
    #print license
    i = jl[u'photoid'][0]
    if license=="license by-sa" or license== "license by":
        pywikibot.output('photo id %s %s is OK! Ready upload Commons...' % (i, license))
        photo_url = "http://static.panoramio.com/photos/original/%s.jpg" % i
        #print photo_url
        #Should download the photo only once
        # Retry the download up to 3 times with quadratic back-off.
        # NOTE(review): if all attempts raise, 'photo' stays unbound and the
        # duplicate check below would fail with NameError -- confirm intended.
        trying = 1
        n=0
        while trying:
            try:
                photo = downloadPhoto(photo_url)
                trying=0
            except:
                pywikibot.output(' try....')
                n=n+1
                time.sleep(n*n)
                if n>2:
                    trying=0
        #Don't upload duplicate images, should add override option
        if jl[u'lat'][0] and jl[u'lon'][0]:
            duplicates = findDuplicateImages(photo)
            photosize = imagesize(photo)
            if duplicates:
                pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
                #urlverify=False
            elif photosize > 250*1024:
                pywikibot.output(u'phpto size: %d' % photosize)
                #photo title
                title=jl[u'title'][0].strip().lstrip().rstrip(',')
                #author and author _url
                author=jl[u'author'][0].strip().lstrip().rstrip(',')
                author_url = jl[u'author_url'][0]
                author_url= 'http://www.panoramio.com' + author_url
                #clean filename
                filename=cleanUpTitle(title, site, author)
                #pywikibot.output(filename,toStdout=True)
                #print str(filename.decode('utf-8'))
                #tags
                tags=jl[u'tags']
                #for tag in jl[u'tags']:
                #    tags.append(tag.a.contents[0].strip().lstrip().rstrip(','))
                #try:
                #    mapname=soup.find("div", { "id" : "map_info_name" }).a.contents[0]
                #    if mapname not in tags:
                #        tags.append(mapname)
                #        print tags
                #except:
                #    print tags
                #date
                # Prefer the capture date; fall back to the upload date when
                # take_date is missing or does not parse.
                try:
                    date=jl[u'take_date'][0]
                    formatdate=datetime.datetime.strptime(date,'%Y/%m/%d %X')
                    date = formatdate.strftime('%Y-%m-%d')
                    date=u'{{Taken on|%s}}' % date
                    #print date
                except:
                    date=jl[u'upload_date'][0]
                    formatdate=datetime.datetime.strptime(date,'%B %d, %Y')
                    date = formatdate.strftime('%Y-%m-%d')
                    date =u'{{Original upload date|%s}}' % date
                    pywikibot.output('take on not found. Upload on %s' % date)
                #geo
                try:
                    lat =jl[u'lat'][0]
                    lon = jl[u'lon'][0]
                    Location = u'{{Location|%s|%s|source:Panoramio}}' %(lat, lon)
                except:
                    Location = u''
                if license == "license by-sa":
                    licensetag = u'{{cc-by-sa-3.0|%s}}' % author
                elif license == "license by":
                    licensetag = u'{{cc-by-3.0|%s}}' % author
                reviewer = config.usernames['commons']['commons']
                review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
                #print soup.find("div", { "id" : "photo-description-formatted" }).contents[0]
                # Append the free-text description (if any) to the title used
                # in the {{Information}} template.
                try:
                    photo_description = jl[u'desc'][0].strip().lstrip().rstrip(',')
                    if photo_description:
                        title=title+u'\n\n'+ photo_description
                    else:
                        pass
                except:
                    pass
                url="http://www.panoramio.com/photo/%s" % i
                if tags:
                    strtags = u'</code>, <code>'.join(tags)
                    strtags = u'{{Information field|Name=Tags<br />(from Panoramio photo page)|Value=<code>%s</code>}}' % strtags
                else:
                    strtags =u''
                Information=u'== {{int:filedesc}} ==\n{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=%s\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, strtags, Location)
                #site = pywikibot.Site(u'commons', u'commons')
                Description = buildDescription(Information, site, tags, Location)
                #pywikibot.output(Description)
                bot = scripts.upload.UploadRobot(photo_url,
                                description=Description,
                                useFilename=filename,
                                keepFilename=True,
                                verifyDescription=False,
                                ignoreWarning=True,uploadByUrl=True)
                # Retry the upload up to 3 times with quadratic back-off.
                uploadtoo = 1
                n=0
                while uploadtoo:
                    try:
                        bot.upload_image(debug=True)
                        #urlverify=False
                        uploadtoo = 0
                        #sec=random.randint(1, 10)
                        #pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
                        #time.sleep(sec)
                        print "id =", i
                    except:
                        n=n+1
                        #print e.code
                        if n <3:
                            time.sleep(n*n)
                        else:
                            uploadtoo = 0
                            print "id =", i
            else:
                pywikibot.output(u'photo %s too small, size is %d' %(i, photosize))
        else:
            pywikibot.output(u'photo %s not geo info' %i)
    else:
        pywikibot.output('photo id %s %s is invalid! Ignore...' % (i, license))
    #sec=random.randint(1, 5)
    #pywikibot.output(u"Waiting for %d seconds." % sec)
    #time.sleep(sec)
    #urlverify=False
# ---------------------------------------------------------------------------
# Script entry point: read a JSON-lines file (one photo record per line,
# selected with -jl:<path>) and try to upload each record.
# ---------------------------------------------------------------------------
start = time.time()
#add=5
#page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
#text = page.text
#r=int(text)+1
#r=int(text)
for arg in pywikibot.handleArgs():
    if arg:
        if arg.startswith('-jl:'):
            # Path of the JSON-lines input file.
            jl = arg[4:]
#    elif arg.startswith('-range:'):
#        add = int(arg[7:])
# NOTE(review): if -jl: was not supplied, 'jl' is unbound here and the open()
# below raises NameError -- confirm the wrapper always passes the option.
with open(jl, 'r') as f:
    for jsonline in f:
        jl=json.loads(jsonline)  # 'jl' is reused: first file name, then record
        try:
            runputbot(jl)
        except Exception, ex:
            pywikibot.output("%s: %s" % (Exception,ex))
print "Elapsed Time: %s" % (time.time() - start)
#update upload status
#i=r+add
#statuspage = pywikibot.Page(site, u"User:Panoramio upload bot/status")
#statuspage.text = u'%d' % i
#statuspage.save(u"update upload status: %d" % i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/Steinsplitter/e115799e7fc111b71a9135fb7ea72e67/revisions