Skip to content

Instantly share code, notes, and snippets.

@shizhao
Last active July 30, 2021 01:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shizhao/cdc442580bdba6b63582e8511d9162cf to your computer and use it in GitHub Desktop.
Save shizhao/cdc442580bdba6b63582e8511d9162cf to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Panoramio -> Wikimedia Commons batch uploader (Python 2 + pywikibot).
# Reads photo metadata as JSON lines (passed via -jl:<file>) and re-uploads
# freely licensed (CC-BY / CC-BY-SA) Panoramio photos to Commons.
import json
import datetime
import urllib2,re, random
import scripts.upload,time, StringIO, hashlib, base64
import tempfile
import pywikibot
import pywikibot.data.api
from pywikibot import config
import sys
reload(sys) # Python 2.5+ deletes sys.setdefaultencoding after startup; reload(sys) restores it
sys.setdefaultencoding('utf-8')
# Module-level default site (Wikimedia Commons).
site = pywikibot.Site(u'commons', u'commons')
def cleanUpTitle(title, site, author, project = u'panoramio'):
    ''' Clean up the title of a potential mediawiki page. Otherwise the title of
    the page might not be allowed by the software.
    from flickrripper.py

    Truncates the title so that the final filename fits in MediaWiki's
    240-byte title limit, replaces characters that are forbidden in file
    titles, then probes the wiki and appends " (n)" until a free filename
    is found.

    @param title: raw photo title (unicode)
    @param site: pywikibot Site to check for existing files; falls back to
        Commons when falsy
    @param author: author name; only its encoded byte length is used here
    @param project: suffix appended to the filename
    @return: unicode filename of the form u'<title> - <project>[ (n)].jpg'
    '''
    # Reserve room in the 240-byte budget for the project and author parts.
    maxBytes = 240 - len(project.encode('utf-8')) \
        - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # maybe we cut more than needed, anyway we do it
        # (character count is estimated from the byte overshoot; with
        # multi-byte UTF-8 this can cut a little extra, which is accepted)
        items = max(min(len(title), maxBytes / 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    title = title.strip()
    # Substitution chain ported from flickrripper.py -- the order of these
    # re.sub calls matters; do not reorder.
    title = re.sub(u"^ $", u"from panoramio", title)
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!*]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    title = title.replace(u"|", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    # Probe the wiki for a free name; add an incrementing " (n)" suffix on
    # collision.
    if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
        i = 1
        while True:
            if pywikibot.Page(site, u'File:%s - %s (%d).jpg' % (title, project, i)).exists():
                i += 1
            else:
                return u'%s - %s (%d).jpg' % (title, project, i)
    else:
        return u'%s - %s.jpg' % (title, project)
#def checkcountry(cats):
'''remove country category'''
# for countrycat in cats:
# if u'Countries' not in countrycat.title(withNamespace=False):
# country = False
# else:
# country = True
# break
# return country
def checkyear(category):
    '''Return True when the category name is purely numeric -- i.e. a bare
    year category such as u'2011' -- which the uploader drops.

    @param category: category name (unicode)
    @return: bool
    '''
    return category.isdigit()
#def checkmetacategories(cats):
'''remove Meta categories'''
# for metacat in cats:
# if u'Meta categories' not in metacat.title(withNamespace=False) or metacat.title(withNamespace=False) <> u'Topics':
# checkmeta = False
# else:
# checkmeta = True
# break
# return checkmeta
def blackcategories(cats):
    '''Return True when any parent category in *cats* is a blacklisted
    meta / maintenance category, meaning the candidate tag should not be
    used to categorise the file.

    NOTE: the original implementation raised UnboundLocalError when *cats*
    was empty (the result variable was only assigned inside the loop); an
    empty iterable now yields False.

    @param cats: iterable of pywikibot Category pages (objects exposing
        title(withNamespace=False))
    @return: bool
    '''
    blacklist = [u'Creativity', u'Meta categories', u'Categories requiring temporary diffusion',u'Categories requiring permanent diffusion', u'Categories requiring permanent diffusion to zero', u'Disambiguation', u'Categories for discussion', u'Hidden categories', u'Topics', u'CommonsRoot', u'Categories', u'Idukki district']
    # any() short-circuits on the first blacklisted hit, mirroring the
    # original loop's break.
    return any(blackcat.title(withNamespace=False) in blacklist for blackcat in cats)
#def checkmaincategory(categorypage):
'''remove main category'''
# templates = categorypage.templates()
# maincattemp=[u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig']
# for temp in templates:
# if temp.title(withNamespace=False) not in maincattemp:
# checkmain = False
# else:
# checkmain = True
# break
# return checkmain
def buildDescription(Information, site, tags, Location):
    ''' Build the final description for the image.

    Appends a category link for every Panoramio tag that exists as a
    Commons category and is neither a bare year nor a blacklisted meta
    category; category redirects are resolved to their target and
    duplicate targets skipped.  When no usable category survives, a
    {{subst:unc}} (or {{subst:unc|geo=1}} when *Location* is set)
    maintenance tag is added instead; otherwise {{subst:chc}} plus the
    category links.  The bot's own tracking category is always appended.

    @param Information: the {{Information}} wikitext block to extend
    @param site: pywikibot Site used to look the categories up
    @param tags: list of Panoramio tag strings (candidate category names)
    @param Location: u'' or a {{Location}} template string
    @return: complete page wikitext
    '''
    description = Information
    catexists = len(tags)
    catstext = u''
    for category in tags:
        try:
            categorypage = pywikibot.Page(site, u'Category:' + category)
            if not categorypage.exists():
                catexists = catexists - 1
                pywikibot.output(u'[[Category:' + category + ']]' + u' not exists.')
                continue
            if categorypage.isCategoryRedirect():
                # Categorise under the redirect target, not the tag itself.
                target = categorypage.getCategoryRedirectTarget()
                cats = target.categories()
                if checkyear(category) or blackcategories(cats):
                    catexists = catexists - 1
                    pywikibot.output(u'remove category: ' + category)
                elif u'[[Category:' + target.title(withNamespace=False) + ']]\n' not in catstext:
                    catstext = catstext + u'[[Category:' + target.title(withNamespace=False) + ']]\n'
                    pywikibot.output(u'RedirectTarget: ' + target.title())
                else:
                    # Two tags redirect to the same target; keep only one.
                    catexists = catexists - 1
                    pywikibot.output(u'remove category: ' + category + '. Dupe')
            else:
                cats = categorypage.categories()
                if checkyear(category) or blackcategories(cats):
                    catexists = catexists - 1
                    pywikibot.output(u'remove category: ' + category)
                else:
                    catstext = catstext + u'[[Category:' + category + ']]\n'
        except Exception:
            # API/network failure for this tag: treat it as unusable but
            # keep processing the remaining tags (was a bare except).
            catexists = catexists - 1
    if catexists == 0:
        # No usable category (or no tags at all): mark as uncategorised,
        # with geo=1 when coordinates are available.
        if Location == u'':
            description = description + u'{{subst:unc}}\n'
        else:
            description = description + u'{{subst:unc|geo=1}}\n'
    else:
        description = description + u'{{subst:chc}}\n\n' + catstext
    uploder = config.usernames['commons']['commons']
    description = description + u'[[Category:Panoramio files uploaded by ' + uploder + ']]\n'
    return description
def downloadPhoto(photoUrl = ''):
    '''Fetch the image at *photoUrl* and return its raw bytes wrapped in a
    StringIO.StringIO buffer.
    TODO: Add exception handling
    '''
    response = urllib2.urlopen(photoUrl)
    return StringIO.StringIO(response.read())
def findDuplicateImages(photo=None, site=None):
    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
    for a list of duplicates.

    The original declared site=pywikibot.Site(u'commons', u'commons') as the
    default argument, which is evaluated once at import time (a network-y
    side effect at module load); the Site is now created lazily on first use.

    @param photo: StringIO-like buffer holding the image bytes (getvalue())
    @param site: pywikibot Site to query; defaults to Commons
    @return: files sharing the same SHA1, as returned by the API
    TODO: Add exception handling
    '''
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
def imagesize(photo):
    '''Return the size in bytes of the downloaded photo, read from the
    ``len`` attribute of a StringIO.StringIO buffer.'''
    size = photo.len
    return size
def runputbot(jl):
    '''Process one decoded JSON-lines record *jl* and, when eligible,
    upload the Panoramio photo it describes to Commons.

    A photo is uploaded only when: its license is CC-BY or CC-BY-SA, it
    carries lat/lon coordinates, it is not already on Commons (SHA1
    duplicate check), and it is larger than 250 KiB.

    Expected keys in *jl* (each a one-element list, per the scraper's
    output -- TODO confirm against the producer): license, photoid, title,
    author, author_url, lat, lon, desc, take_date/upload_date; plus
    ``tags``, a plain list of strings.

    @param jl: dict decoded from one JSON line of the input file
    '''
    #for i in range(r,r+add):
    site = pywikibot.Site(u'commons', u'commons')
    # Leftovers from an earlier version that scraped photo pages by id range:
    #url="http://www.panoramio.com/photo/%d" % i
    #try:
    #    page = urllib2.urlopen(url)
    #    urlverify=True
    #except:
    #    urlverify=False
    #    print 'photo id %d' % i
    #sec=random.randint(1, 5)
    #pywikibot.output(u"Waiting for %d seconds." % sec)
    #time.sleep(sec)
    #while urlverify:
    #soup = BeautifulSoup(page)
    license = jl[u'license'][0]
    #print license
    i = jl[u'photoid'][0]
    if license=="license by-sa" or license== "license by":
        pywikibot.output('photo id %s %s is OK! Ready upload Commons...' % (i, license))
        photo_url = "http://static.panoramio.com/photos/original/%s.jpg" % i
        #print photo_url
        #Should download the photo only once
        # Download with retries: back off n*n seconds, give up after 3 tries.
        trying = 1
        n=0
        while trying:
            try:
                photo = downloadPhoto(photo_url)
                trying=0
            except:
                pywikibot.output(' try....')
                n=n+1
                time.sleep(n*n)
                if n>2:
                    # NOTE(review): after 3 failures the loop exits with
                    # ``photo`` unbound; the NameError below is caught by
                    # the caller's except handler.
                    trying=0
        #Don't upload duplicate images, should add override option
        # Only geotagged photos are uploaded.
        if jl[u'lat'][0] and jl[u'lon'][0]:
            duplicates = findDuplicateImages(photo)
            photosize = imagesize(photo)
            if duplicates:
                pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
                #urlverify=False
            elif photosize > 250*1024:
                pywikibot.output(u'phpto size: %d' % photosize)
                #photo title
                title=jl[u'title'][0].strip().lstrip().rstrip(',')
                #author and author _url
                author=jl[u'author'][0].strip().lstrip().rstrip(',')
                author_url = jl[u'author_url'][0]
                author_url= 'http://www.panoramio.com' + author_url
                #clean filename
                filename=cleanUpTitle(title, site, author)
                #pywikibot.output(filename,toStdout=True)
                #print str(filename.decode('utf-8'))
                #tags
                tags=jl[u'tags']
                #for tag in jl[u'tags']:
                #    tags.append(tag.a.contents[0].strip().lstrip().rstrip(','))
                #try:
                #    mapname=soup.find("div", { "id" : "map_info_name" }).a.contents[0]
                #    if mapname not in tags:
                #        tags.append(mapname)
                #    print tags
                #except:
                #    print tags
                #date
                # Prefer the capture date; fall back to the upload date when
                # take_date is missing or unparseable.
                try:
                    date=jl[u'take_date'][0]
                    formatdate=datetime.datetime.strptime(date,'%Y/%m/%d %X')
                    date = formatdate.strftime('%Y-%m-%d')
                    date=u'{{Taken on|%s}}' % date
                    #print date
                except:
                    date=jl[u'upload_date'][0]
                    formatdate=datetime.datetime.strptime(date,'%B %d, %Y')
                    date = formatdate.strftime('%Y-%m-%d')
                    date =u'{{Original upload date|%s}}' % date
                    pywikibot.output('take on not found. Upload on %s' % date)
                #geo
                try:
                    lat =jl[u'lat'][0]
                    lon = jl[u'lon'][0]
                    Location = u'{{Location|%s|%s|source:Panoramio}}' %(lat, lon)
                except:
                    Location = u''
                # Map the Panoramio license string to the Commons template.
                if license == "license by-sa":
                    licensetag = u'{{cc-by-sa-3.0|%s}}' % author
                elif license == "license by":
                    licensetag = u'{{cc-by-3.0|%s}}' % author
                reviewer = config.usernames['commons']['commons']
                review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
                #print soup.find("div", { "id" : "photo-description-formatted" }).contents[0]
                # Append the photo description (if any) under the title.
                try:
                    photo_description = jl[u'desc'][0].strip().lstrip().rstrip(',')
                    if photo_description:
                        title=title+u'\n\n'+ photo_description
                    else:
                        pass
                except:
                    pass
                url="http://www.panoramio.com/photo/%s" % i
                if tags:
                    strtags = u'</code>, <code>'.join(tags)
                    strtags = u'{{Information field|Name=Tags<br />(from Panoramio photo page)|Value=<code>%s</code>}}' % strtags
                else:
                    strtags =u''
                Information=u'== {{int:filedesc}} ==\n{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=%s\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, strtags, Location)
                #site = pywikibot.Site(u'commons', u'commons')
                Description = buildDescription(Information, site, tags, Location)
                #pywikibot.output(Description)
                bot = scripts.upload.UploadRobot(photo_url,
                                                 description=Description,
                                                 useFilename=filename,
                                                 keepFilename=True,
                                                 verifyDescription=False,
                                                 ignoreWarning=True,uploadByUrl=True)
                # Upload with retries: back off n*n seconds, give up after
                # 3 attempts.
                uploadtoo = 1
                n=0
                while uploadtoo:
                    try:
                        bot.upload_image(debug=True)
                        #urlverify=False
                        uploadtoo = 0
                        #sec=random.randint(1, 10)
                        #pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
                        #time.sleep(sec)
                        print "id =", i
                    except:
                        n=n+1
                        #print e.code
                        if n <3:
                            time.sleep(n*n)
                        else:
                            uploadtoo = 0
                            print "id =", i
            else:
                pywikibot.output(u'photo %s too small, size is %d' %(i, photosize))
        else:
            pywikibot.output(u'photo %s not geo info' %i)
    else:
        pywikibot.output('photo id %s %s is invalid! Ignore...' % (i, license))
    #sec=random.randint(1, 5)
    #pywikibot.output(u"Waiting for %d seconds." % sec)
    #time.sleep(sec)
    #urlverify=False
# --- Script entry point: parse args, then feed each JSON line to runputbot.
start = time.time()
#add=5
#page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
#text = page.text
#r=int(text)+1
#r=int(text)
# -jl:<path> names the JSON-lines metadata file to process.
for arg in pywikibot.handleArgs():
    if arg:
        if arg.startswith('-jl:'):
            jl = arg[4:]
#    elif arg.startswith('-range:'):
#        add = int(arg[7:])
# NOTE(review): if no -jl: argument was given, ``jl`` is unbound here and
# open() raises NameError.
with open(jl, 'r') as f:
    for jsonline in f:
        # ``jl`` is rebound from the filename to the decoded record; the
        # file handle is already open, so this is safe (if confusing).
        jl=json.loads(jsonline)
        try:
            runputbot(jl)
        except Exception, ex:
            # Log and continue with the next line; one bad record must not
            # abort the whole batch.
            pywikibot.output("%s: %s" % (Exception,ex))
print "Elapsed Time: %s" % (time.time() - start)
#update upload status
#i=r+add
#statuspage = pywikibot.Page(site, u"User:Panoramio upload bot/status")
#statuspage.text = u'%d' % i
#statuspage.save(u"update upload status: %d" % i)
@Money3Trap
Copy link

You

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment