Created
December 5, 2010 04:34
-
-
Save vssun/728795 to your computer and use it in GitHub Desktop.
Removes deleted images from pages on the Malayalam Wikipedia (ml.wikipedia). The list of files to check must be supplied in "pagelist.txt".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
"""
Removes deleted files from the pages that use them.
Created by: Sunil V.S.
Date: 2010-11-27
"""
import wikipedia | |
import pagegenerators | |
import unicodedata | |
import codecs | |
import re | |
import pywikibot | |
# Namespace prefixes under which a file can be referenced in wikitext.
# Each one is combined with the file title as "<prefix>:<title>" when the
# script searches a page.  The last two entries are the localized prefixes
# (presumably Malayalam, matching the 'ml' site code used by the script).
# The u'' prefix is used instead of ur'': none of the literals contains a
# backslash, so the values are identical, and u'' is also accepted by
# Python 3.
imageNamespaces = [u'file', u'File', u'image', u'Image', u'ചിത്രം', u'പ്രമാണം']
def getGenericText(pText):
    """Return pText with every space character replaced by an underscore.

    Only the plain space (' ') is replaced; tabs, newlines and other
    whitespace are left untouched.  The replacement is one character for
    one character, so the result has the same length as pText and any
    offset found in the result is also a valid offset into pText.
    """
    return pText.replace(' ', '_')
def _findLinkEnd(pText, pLocation):
    """Return the offset just past the ']]' closing a '[[...]]' link.

    pLocation is an offset inside the link, after its opening '[['.
    Nested '[[...]]' pairs (e.g. links inside an image caption) are
    skipped by tracking bracket depth.  Returns -1 when the text ends
    before the link is closed.
    """
    myDepth = 0
    for myIndex in range(pLocation, len(pText)):
        myChar = pText[myIndex]
        if myChar == '[':
            myDepth += 1
        elif myChar == ']':
            myDepth -= 1
            # Two more ']' than '[' means the outer link is closed.
            if myDepth == -2:
                return myIndex + 1
    return -1


def hideImageText(pText, pImageText, pOccurances):
    """Comment out usages of an image in wikitext and return the new text.

    pText       -- wikitext of the page.
    pImageText  -- "<namespace>:<title>" with spaces already replaced by
                   underscores (see getGenericText).
    pOccurances -- maximum number of occurrences to process.

    Two kinds of usage are handled:
      * a line that starts with the image name (gallery entry): the whole
        line is wrapped in an HTML comment;
      * a '[[...]]' inclusion: the complete link, including nested links
        in its caption, is wrapped in an HTML comment.
    Any other usage (for example a template parameter) is left untouched.
    """
    mySearchFrom = 0
    for _ in range(pOccurances):
        # getGenericText swaps one character for one character, so an
        # offset in the normalised copy is a valid offset into pText.
        # Searching from mySearchFrom prevents finding the occurrence that
        # was just commented out again.
        myLocation = getGenericText(pText).find(pImageText, mySearchFrom)
        if myLocation < 0:
            break
        myNameEnd = myLocation + len(pImageText)

        if myLocation == 0 or pText[myLocation - 1] == '\n':
            # If it is from gallery: hide the whole line.
            wikipedia.output('Removing from Gallery')
            myStartLocation = myLocation
            myEndLocation = pText.find('\n', myLocation + 1)
            if myEndLocation < 0:
                # Last line without a trailing newline.
                myEndLocation = len(pText)
        elif pText[myLocation - 2:myLocation] == '[[':
            # If it is image inclusion: hide the whole [[...]] link.
            wikipedia.output ('Removing image')
            myStartLocation = myLocation - 2
            myEndLocation = _findLinkEnd(pText, myLocation)
            if myEndLocation < 0:
                # Unbalanced brackets: leave the text alone.
                wikipedia.output('Skipping image link that is never closed')
                mySearchFrom = myNameEnd
                continue
            wikipedia.output(pText[myStartLocation:myEndLocation])
        else:
            # Unknown context; editing here would corrupt the page.
            wikipedia.output('Skipping usage that is neither a gallery line nor an image link')
            mySearchFrom = myNameEnd
            continue

        myHidden = '<!--' + pText[myStartLocation:myEndLocation] + '\n-->\n'
        # The replacement ends with its own newline, so swallow a newline
        # that directly follows the hidden text -- but never any other
        # character.
        myRestLocation = myEndLocation
        if pText[myEndLocation:myEndLocation + 1] == '\n':
            myRestLocation += 1
        pText = pText[:myStartLocation] + myHidden + pText[myRestLocation:]
        mySearchFrom = myStartLocation + len(myHidden)
    return pText
# Main program: for every title listed in pagelist.txt, check whether the
# file still exists; if it does not, comment out its usages on every
# main-namespace page that still uses it.
siteFamily = 'wikipedia'
siteLangCode = 'ml'
wikiSite = wikipedia.Site(code=siteLangCode, fam=siteFamily)
log = None
try:
    log = codecs.open('logs/remove-deleted-images.log', mode='at', encoding='utf-8')
    for myPage in pagegenerators.TextfilePageGenerator('pagelist.txt'):
        myImage = wikipedia.ImagePage(myPage.site(), myPage.titleWithoutNamespace())
        wikipedia.output('\n' + myImage.title())
        if myImage.exists():
            wikipedia.output('File exists')
            continue

        # The script relies on fileIsOnCommons() raising NoPage to detect
        # a file that is really gone.
        try:
            myOnCommons = myImage.fileIsOnCommons()
        except pywikibot.NoPage:
            wikipedia.output('File does not exists, removing from linked pages')
        else:
            if myOnCommons:
                wikipedia.output('File is on Commons')
            continue

        myImageTitle = myImage.titleWithoutNamespace()
        # The first letter of a title might be written in lower case.
        myImageTitleLower = myImageTitle[:1].lower() + myImageTitle[1:]
        for myLinkedPage in myImage.usingPages():
            if myLinkedPage.namespace() != 0:
                continue  # skip if not main namespace
            wikipedia.output('Removing from ' + myLinkedPage.title())
            # NOTE(review): get()/put() errors are not handled here and
            # will abort the run -- confirm that is acceptable.
            myOriginalText = myLinkedPage.get()
            myText = myOriginalText
            for myImageNamespace in imageNamespaces:
                mySearchStringNormal = getGenericText(myImageNamespace + ':' + myImageTitle)
                mySearchStringLower = getGenericText(myImageNamespace + ':' + myImageTitleLower)
                wikipedia.output('Checking for ' + mySearchStringNormal + ' or ' + mySearchStringLower)
                # Count both spellings before editing anything.
                myGenericText = getGenericText(myText)
                myOccuranceNormal = myGenericText.count(mySearchStringNormal)
                myOccuranceLower = 0
                if mySearchStringNormal != mySearchStringLower:
                    myOccuranceLower = myGenericText.count(mySearchStringLower)
                if myOccuranceNormal > 0:
                    myText = hideImageText(myText, mySearchStringNormal, myOccuranceNormal)
                if myOccuranceLower > 0:
                    myText = hideImageText(myText, mySearchStringLower, myOccuranceLower)
            # Save only when something was changed.  The summary uses the
            # file title as is: calling .title() on the string would
            # title-case it and misreport the file name.
            if myText != myOriginalText:
                myLinkedPage.put(myText, comment=u'യന്ത്രം: നിലവിലില്ലാത്ത ' + myImageTitle + u' എന്ന ചിത്രം ഒഴിവാക്കുന്നു')
finally:
    # Runs on errors as well as on normal completion.
    if log is not None:
        log.close()
    wikipedia.stopme()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment