Remove deleted images from mlwikipedia. The list of deleted images (100 entries) is taken from the deletion log.
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
നീക്കം ചെയ്ത പ്രമാണങ്ങൾ താളുകളിൽ നിന്നും ഒഴിവാക്കുന്നതിന് | |
നിർമ്മിച്ചത്: സുനിൽ വി.എസ്. | |
തിയതി: 2010-11-27 | |
""" | |
import wikipedia | |
import pagegenerators | |
import unicodedata | |
import codecs | |
import re | |
import pywikibot | |
# Prefixes that are combined with ':' and a file title when searching page
# text for references to a file: the English names in both capitalisations,
# followed by two prefixes written in Malayalam script.
imageNamespaces = [
    u'file',
    u'File',
    u'image',
    u'Image',
    u'ചിത്രം',
    u'പ്രമാണം',
]
def getGenericText(pText):
    """Return *pText* with every space character replaced by an underscore.

    Only the plain space character is affected; tabs, newlines and any other
    whitespace are left alone.  The substitution is one-for-one, so the
    result always has the same length as the input and character offsets
    found in it are valid in *pText* as well.
    """
    return '_'.join(pText.split(' '))
#to remove imagetext from pagetext
def _findImageSpan(pText, pLocation):
    """Locate the markup that embeds the file title starting at *pLocation*.

    Two forms are recognised:

    * a gallery-style entry, where the title is the first thing on a line;
      the span runs from the title to the end of that line (the newline
      itself is excluded);
    * a ``[[...]]`` link; the span runs from the opening ``[[`` through the
      matching ``]]``, allowing nested ``[[...]]`` links inside the caption.

    Returns a ``(start, end)`` pair of offsets into *pText* (``end`` is
    exclusive), or ``None`` when the occurrence is in neither form or its
    brackets never close.
    """
    if pLocation == 0 or pText[pLocation - 1] == '\n':
        wikipedia.output('Removing from Gallery')
        myEndLocation = pText.find('\n', pLocation + 1)
        if myEndLocation == -1:
            # Last line of the text without a trailing newline.
            myEndLocation = len(pText)
        return pLocation, myEndLocation

    if pLocation >= 2 and pText[pLocation - 2:pLocation] == '[[':
        # Two brackets are already open; the link ends where the count of
        # unmatched opening brackets drops back to zero.
        myOpenBrackets = 2
        for myIndex in range(pLocation, len(pText)):
            myChar = pText[myIndex]
            if myChar == '[':
                myOpenBrackets += 1
            elif myChar == ']':
                myOpenBrackets -= 1
                if myOpenBrackets == 0:
                    wikipedia.output('Removing image')
                    wikipedia.output(pText[pLocation - 2:myIndex + 1])
                    return pLocation - 2, myIndex + 1
        # Ran off the end of the text: brackets are unbalanced.
        return None

    return None


def hideImageText(pText, pImageText, pOccurances):
    """Comment out references to a file in wiki page text.

    Each recognised reference (see ``_findImageSpan``) is wrapped as
    ``<!--`` + markup + ``\\n-->\\n``.  A single newline directly following
    the markup is absorbed into the wrapper; any other following character
    is preserved.

    Parameters:
        pText       -- the page text.
        pImageText  -- the reference to look for, e.g. ``File:Name.jpg``,
                       with spaces written as underscores (the page text is
                       matched with its spaces converted the same way).
        pOccurances -- maximum number of occurrences to process.

    Returns the page text with the recognised references commented out.
    Occurrences in an unrecognised form, or with unbalanced brackets, are
    left untouched rather than corrupting the surrounding text.
    """
    if not pImageText:
        return pText

    # Same length as pText, so offsets found here index pText directly.
    myGenericText = getGenericText(pText)
    myPieces = []
    myCopiedUpTo = 0   # everything before this offset is already in myPieces
    mySearchFrom = 0   # never look again at text that was already handled

    for _ in range(pOccurances):
        myLocation = myGenericText.find(pImageText, mySearchFrom)
        if myLocation == -1:
            break

        mySpan = _findImageSpan(pText, myLocation)
        if mySpan is None:
            mySearchFrom = myLocation + len(pImageText)
            continue

        myStartLocation, myEndLocation = mySpan
        myPieces.append(pText[myCopiedUpTo:myStartLocation])
        myPieces.append('<!--' + pText[myStartLocation:myEndLocation] + '\n-->\n')
        # The wrapper ends with its own newline, so swallow one that
        # immediately follows; anything else must be kept.
        if pText[myEndLocation:myEndLocation + 1] == '\n':
            myEndLocation += 1
        myCopiedUpTo = myEndLocation
        mySearchFrom = myEndLocation

    myPieces.append(pText[myCopiedUpTo:])
    return ''.join(myPieces)
#main program starts here
# NOTE(review): the nesting below was reconstructed from statement order and
# message text; in particular 'File exists' is taken to be the else-branch of
# the exists() check. Confirm against the deployed script.


def _hideImageOnPage(pPage, pImageTitle):
    """Comment out every reference to the file *pImageTitle* on *pPage*.

    The title is tried with each prefix in ``imageNamespaces`` and with its
    first letter both as given and lower-cased.  The page is saved only when
    its text actually changed.
    """
    myOriginalText = pPage.get()
    myText = myOriginalText
    # first letter might be lower
    myLowerTitle = pImageTitle[:1].lower() + pImageTitle[1:]

    for myImageNamespace in imageNamespaces:
        mySearchStringNormal = getGenericText(myImageNamespace + ':' + pImageTitle)
        mySearchStringLower = getGenericText(myImageNamespace + ':' + myLowerTitle)
        wikipedia.output('Checking for ' + mySearchStringNormal + ' or ' + mySearchStringLower)

        myOccuranceNormal = getGenericText(myText).count(mySearchStringNormal)
        myOccuranceLower = 0
        if mySearchStringNormal != mySearchStringLower:
            myOccuranceLower = getGenericText(myText).count(mySearchStringLower)

        if myOccuranceNormal > 0:
            myText = hideImageText(myText, mySearchStringNormal, myOccuranceNormal)
        if myOccuranceLower > 0:
            myText = hideImageText(myText, mySearchStringLower, myOccuranceLower)

    if myText == myOriginalText:
        # Nothing was hidden (e.g. the file is pulled in through a template),
        # so do not save an edit that claims a removal.
        wikipedia.output('Nothing to remove from ' + pPage.title())
        return

    # Use the file name exactly as stored: str.title() would re-capitalise it.
    pPage.put(myText, comment=u'യന്ത്രം: നിലവിലില്ലാത്ത ' + pImageTitle + u' എന്ന ചിത്രം ഒഴിവാക്കുന്നു')


siteFamily = 'wikipedia'
siteLangCode = 'ml'
# NOTE(review): wikiSite is not passed to the generator below; presumably the
# generator falls back to the configured default site -- confirm.
wikiSite = wikipedia.Site(code=siteLangCode, fam=siteFamily)
log = codecs.open('logs/remove-deleted-images.log', mode='at', encoding = 'utf-8')
try:
    for myImage in pagegenerators.LogpagesPageGenerator(number=100, mode='delete', namespace=[6]):
        wikipedia.output('\n' + myImage.title())
        if myImage.exists():
            wikipedia.output('File exists')
            continue

        try:
            myIsOnCommons = myImage.fileIsOnCommons()
        except pywikibot.NoPage:
            wikipedia.output('File does not exists, removing from linked pages')
            myImageTitle = myImage.titleWithoutNamespace()
            for myLinkedPage in myImage.usingPages():
                if myLinkedPage.namespace() != 0:
                    continue #skip if not main namespace
                wikipedia.output('Removing from ' + myLinkedPage.title())
                _hideImageOnPage(myLinkedPage, myImageTitle)
        else:
            if myIsOnCommons:
                wikipedia.output('File is on Commons')
finally:
    # Run the shutdown steps even when the loop is interrupted by an error.
    wikipedia.stopme()
    log.flush()
    log.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment