Created
November 28, 2010 07:29
-
-
Save vssun/718691 to your computer and use it in GitHub Desktop.
Remove deleted images from the Malayalam Wikipedia (mlwikipedia). The list of deleted images (100 entries) is taken from the deletion log.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Remove deleted files from the pages that use them.
Created by: Sunil V.S.
Date: 2010-11-27
"""
import wikipedia | |
import pagegenerators | |
import unicodedata | |
import codecs | |
import re | |
import pywikibot | |
# Namespace prefixes (English and Malayalam spellings) that are combined with
# the file name to build the strings searched for in page text.
# Plain u'' literals are used instead of ur'': the strings contain no
# backslashes, so the value is identical, and u'' is accepted by Python 3 too.
imageNamespaces = [u'file', u'File', u'image', u'Image', u'ചിത്രം', u'പ്രമാണം']
def getGenericText(pText):
    """Return pText with every space character replaced by an underscore.

    Only the plain space (' ') is replaced; tabs, newlines and other
    whitespace are left alone.  The replacement is one character for one
    character, so an index into the result is also a valid index into pText.
    """
    return pText.replace(' ', '_')
def _findImageSpan(pText, pLocation):
    """Find the wikitext that embeds the image name located at pLocation.

    Returns a (start, end, isGallery) tuple such that pText[start:end] is the
    markup to hide, or None when the usage is neither a line that starts with
    the image name nor a complete [[...]] inclusion.
    """
    # An image name at the start of a line is treated as a gallery entry;
    # the entry runs up to (not including) the end of that line.
    if pLocation == 0 or pText[pLocation - 1] == '\n':
        myEnd = pText.find('\n', pLocation + 1)
        if myEnd == -1:
            # Last line of the text without a trailing newline.
            myEnd = len(pText)
        return pLocation, myEnd, True

    # Inline inclusion: [[File:Name.jpg|options|caption with [[links]]]]
    if pLocation >= 2 and pText[pLocation - 2:pLocation] == '[[':
        myDepth = 2  # the two opening brackets that precede the name
        for myIndex in range(pLocation, len(pText)):
            myChar = pText[myIndex]
            if myChar == '[':
                myDepth += 1
            elif myChar == ']':
                myDepth -= 1
                if myDepth == 0:
                    # myIndex is the final ']' of the inclusion.
                    return pLocation - 2, myIndex + 1, False
        # Brackets never balanced: malformed markup, do not touch it.
        return None

    return None


def hideImageText(pText, pImageText, pOccurances):
    """Comment out up to pOccurances usages of an image in page text.

    pText       -- wikitext of the page.
    pImageText  -- image reference to look for, with spaces already replaced
                   by underscores (it is matched against getGenericText(pText)).
    pOccurances -- maximum number of usages to process.

    Each recognised usage (a gallery line or a [[...]] inclusion) is wrapped
    as '<!--' + markup + '\\n-->\\n'.  Usages in any other form are reported
    and left unchanged.  Returns the modified text.
    """
    mySearchFrom = 0
    for _ in range(pOccurances):
        # Positions in the generic text map 1:1 onto pText because the
        # space -> underscore replacement keeps the length.
        myLocation = getGenericText(pText).find(pImageText, mySearchFrom)
        if myLocation == -1:
            break

        mySpan = _findImageSpan(pText, myLocation)
        if mySpan is None:
            wikipedia.output('Unsupported image usage, leaving it untouched')
            mySearchFrom = myLocation + len(pImageText)
            continue

        myStart, myEnd, myIsGallery = mySpan
        if myIsGallery:
            wikipedia.output('Removing from Gallery')
        else:
            wikipedia.output('Removing image')
            wikipedia.output(pText[myStart:myEnd])

        myHidden = '<!--' + pText[myStart:myEnd] + '\n-->\n'
        # The replacement supplies its own trailing newline, so swallow the
        # newline that followed the markup -- but never any other character.
        if pText[myEnd:myEnd + 1] == '\n':
            myTailStart = myEnd + 1
        else:
            myTailStart = myEnd
        pText = pText[:myStart] + myHidden + pText[myTailStart:]
        # Continue after the comment just written; it still contains the
        # image name and must not be matched again.
        mySearchFrom = myStart + len(myHidden)
    return pText
# Main program: read recent entries of the deletion log and, for every file
# that no longer exists, comment out its usages in main-namespace pages.
siteFamily = 'wikipedia'
siteLangCode = 'ml'
# NOTE(review): wikiSite is not passed to the generator below; presumably the
# generator falls back to the default site from the user configuration -- confirm.
wikiSite = wikipedia.Site(code=siteLangCode, fam=siteFamily)
# NOTE(review): nothing in this script writes to the log file; it is only
# opened (in append mode) and closed.
log = codecs.open('logs/remove-deleted-images.log', mode='at', encoding='utf-8')
try:
    for myImage in pagegenerators.LogpagesPageGenerator(number=100, mode='delete', namespace=[6]):
        wikipedia.output('\n' + myImage.title())
        if myImage.exists():
            wikipedia.output('File exists')
            continue

        # Usages are only removed when fileIsOnCommons() raises NoPage.
        # NOTE(review): presumably that means the file is neither local nor
        # on Commons -- confirm against the framework.
        myFileIsMissing = False
        try:
            if myImage.fileIsOnCommons():
                wikipedia.output('File is on Commons')
        except pywikibot.NoPage:
            myFileIsMissing = True
        if not myFileIsMissing:
            continue

        wikipedia.output('File does not exists, removing from linked pages')
        myImageTitle = myImage.titleWithoutNamespace()
        # The first letter of a file name might be written in lower case.
        myImageTitleLower = myImageTitle[:1].lower() + myImageTitle[1:]

        for myLinkedPage in myImage.usingPages():
            if myLinkedPage.namespace() != 0:
                continue  # skip if not main namespace
            wikipedia.output('Removing from ' + myLinkedPage.title())
            myOriginalText = myLinkedPage.get()
            myText = myOriginalText

            for myImageNamespace in imageNamespaces:
                mySearchStringNormal = getGenericText(myImageNamespace + ':' + myImageTitle)
                mySearchStringLower = getGenericText(myImageNamespace + ':' + myImageTitleLower)
                wikipedia.output('Checking for ' + mySearchStringNormal + ' or ' + mySearchStringLower)
                # Both counts are taken before any hiding for this prefix.
                myGenericText = getGenericText(myText)
                myOccuranceNormal = myGenericText.count(mySearchStringNormal)
                myOccuranceLower = 0
                if mySearchStringNormal != mySearchStringLower:
                    myOccuranceLower = myGenericText.count(mySearchStringLower)
                if myOccuranceNormal > 0:
                    myText = hideImageText(myText, mySearchStringNormal, myOccuranceNormal)
                if myOccuranceLower > 0:
                    myText = hideImageText(myText, mySearchStringLower, myOccuranceLower)

            if myText != myOriginalText:
                # Use the file name as-is in the edit summary; calling
                # .title() on the string would change its capitalisation.
                myLinkedPage.put(myText, comment=u'യന്ത്രം: നിലവിലില്ലാത്ത ' + myImageTitle + u' എന്ന ചിത്രം ഒഴിവാക്കുന്നു')
            else:
                wikipedia.output('Nothing to change in ' + myLinkedPage.title())
finally:
    # Always release the framework and close the log, even after an error.
    try:
        wikipedia.stopme()
    finally:
        log.flush()
        log.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment