Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Remove deleted images from the Malayalam Wikipedia (mlwiki). The list of deleted images (100 entries) is taken from the deletion log.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Removes deleted files from the pages that use them.
Created by: Sunil V.S. (സുനിൽ വി.എസ്.)
Date: 2010-11-27
"""
import wikipedia
import pagegenerators
import unicodedata
import codecs
import re
import pywikibot
# Namespace prefixes under which an image may be referenced in page text:
# the English "file"/"image" prefixes in both capitalisations, followed by
# the two Malayalam prefixes.
imageNamespaces = [
    u'file',
    u'File',
    u'image',
    u'Image',
    u'ചിത്രം',
    u'പ്രമാണം',
]
#Normalise text by turning every space character into an underscore
def getGenericText(pText):
    """Return ``pText`` with each space character replaced by an underscore.

    Only the plain space (' ') is replaced; tabs, newlines and other
    whitespace are left as they are. The result always has the same length
    as the input, so an index into the result is also a valid index into
    the original text.
    """
    myPieces = pText.split(' ')
    return '_'.join(myPieces)
#to hide (comment out) usages of an image inside the page text
def _isInsideComment(pText, pLocation):
    """Return True when ``pLocation`` lies inside an unclosed ``<!--`` comment."""
    myOpen = pText.rfind('<!--', 0, pLocation)
    return myOpen >= 0 and pText.find('-->', myOpen, pLocation) < 0


def _findImageLinkEnd(pText, pLocation):
    """Return the index just past the ``]]`` closing the link around ``pLocation``.

    ``pLocation`` must sit directly after the opening ``[[``. Square
    brackets nested in the caption are balanced. Returns -1 when the text
    ends before the link is closed.
    """
    myDepth = 2  # the two '[' that opened the link
    for myIndex in range(pLocation, len(pText)):
        myChar = pText[myIndex]
        if myChar == '[':
            myDepth += 1
        elif myChar == ']':
            myDepth -= 1
            if myDepth == 0:
                return myIndex + 1
    return -1


def hideImageText(pText, pImageText, pOccurances):
    """Comment out up to ``pOccurances`` usages of ``pImageText`` in ``pText``.

    Usages are searched for in the space-to-underscore normalised copy of
    the text (getGenericText), from left to right. Each usage found is
    handled according to what precedes it:

    * start of the text or of a line: treated as a gallery entry, and the
      rest of that line is hidden;
    * directly after ``[[``: treated as an image inclusion, and everything
      from ``[[`` up to the matching ``]]`` is hidden.

    Hidden text is replaced by ``<!--`` + the text + newline + ``-->`` +
    newline. A single newline that directly followed the hidden text is
    absorbed into that replacement; any other following character is kept.

    A usage is stepped over, leaving the text untouched, when it is already
    inside an HTML comment, when it appears in any other context, or when
    its ``[[`` is never closed.

    pText       -- page text to process
    pImageText  -- normalised (underscored) "Namespace:Title" string to hide
    pOccurances -- maximum number of usages to examine

    Returns the (possibly modified) page text.
    """
    # Where the next search begins; always moves forward so that text which
    # has already been handled is never matched a second time.
    mySearchFrom = 0
    for i in range(pOccurances):
        # getGenericText keeps the length unchanged, so an index found in
        # the normalised copy can be used directly on pText.
        myLocation = getGenericText(pText).find(pImageText, mySearchFrom)
        if myLocation < 0:
            break
        # Default for every "skip" path below: continue after this usage.
        mySearchFrom = myLocation + len(pImageText)
        if _isInsideComment(pText, myLocation):
            continue
        #If it is from gallery
        if myLocation == 0 or pText[myLocation - 1] == '\n':
            wikipedia.output('Removing from Gallery')
            myStartLocation = myLocation
            myEndLocation = pText.find('\n', myLocation)
            if myEndLocation < 0:
                # Last line of the text without a trailing newline.
                myEndLocation = len(pText)
        #If it is image inclusion
        elif pText[myLocation - 2:myLocation] == '[[':
            myEndLocation = _findImageLinkEnd(pText, myLocation)
            if myEndLocation < 0:
                continue
            wikipedia.output('Removing image')
            myStartLocation = myLocation - 2
            wikipedia.output(pText[myStartLocation:myEndLocation])
        else:
            # Unknown context (for example a template parameter): leave it.
            continue
        myHidden = '<!--' + pText[myStartLocation:myEndLocation] + '\n-->\n'
        # Absorb only a newline after the hidden span, never real content.
        myTailStart = myEndLocation
        if pText[myEndLocation:myEndLocation + 1] == '\n':
            myTailStart += 1
        pText = pText[:myStartLocation] + myHidden + pText[myTailStart:]
        mySearchFrom = myStartLocation + len(myHidden)
    return pText
#main program starts here
# For each entry in the deletion log, hide the deleted file in the
# main-namespace pages that still use it.
siteFamily = 'wikipedia'
siteLangCode = 'ml'
wikiSite = wikipedia.Site(code=siteLangCode, fam=siteFamily)
log = codecs.open('logs/remove-deleted-images.log', mode='at', encoding = 'utf-8')
try:
    # Up to 100 entries of the 'delete' log, limited to namespace 6
    # (the File namespace in MediaWiki).
    for myImage in pagegenerators.LogpagesPageGenerator(number=100, mode='delete', namespace=[6]):
        wikipedia.output('\n' + myImage.title())
        if myImage.exists():
            wikipedia.output('File exists')
            continue
        try:
            if myImage.fileIsOnCommons():
                wikipedia.output('File is on Commons')
        except pywikibot.NoPage:
            # NOTE(review): NoPage is taken here to mean the file exists
            # neither locally nor on Commons -- confirm against pywikipedia.
            wikipedia.output('File does not exists, removing from linked pages')
            # Loop-invariant: title and its lower-case-first-letter variant.
            myImageTitle = myImage.titleWithoutNamespace()
            myImageTitleLower = myImageTitle[:1].lower() + myImageTitle[1:]
            for myLinkedPage in myImage.usingPages():
                if myLinkedPage.namespace() != 0:
                    continue #skip if not main namespace
                wikipedia.output('Removing from ' + myLinkedPage.title())
                myOriginalText = myLinkedPage.get()
                myText = myOriginalText
                for myImageNamespace in imageNamespaces:
                    mySearchStringNormal = getGenericText(myImageNamespace + ':' + myImageTitle)
                    #first letter might be lower
                    mySearchStringLower = getGenericText(myImageNamespace + ':' + myImageTitleLower)
                    wikipedia.output('Checking for ' + mySearchStringNormal + ' or ' + mySearchStringLower)
                    myOccuranceNormal = getGenericText(myText).count(mySearchStringNormal)
                    myOccuranceLower = 0
                    if mySearchStringNormal != mySearchStringLower:
                        myOccuranceLower = getGenericText(myText).count(mySearchStringLower)
                    if myOccuranceNormal > 0:
                        myText = hideImageText(myText, mySearchStringNormal, myOccuranceNormal)
                    if myOccuranceLower > 0:
                        myText = hideImageText(myText, mySearchStringLower, myOccuranceLower)
                if myText == myOriginalText:
                    # Nothing was hidden, so there is nothing to save.
                    wikipedia.output('No change made, page not saved')
                    continue
                # Use the file name as is; calling .title() on the string
                # would title-case it and show a wrong name in the summary.
                myLinkedPage.put(myText, comment=u'യന്ത്രം: നിലവിലില്ലാത്ത ' + myImageTitle + u' എന്ന ചിത്രം ഒഴിവാക്കുന്നു')
finally:
    # Always shut the framework down and release the log file, even when
    # the run is aborted by an exception.
    try:
        wikipedia.stopme()
    finally:
        log.flush()
        log.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.