Skip to content

Instantly share code, notes, and snippets.

@vssun
Created December 5, 2010 04:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vssun/728795 to your computer and use it in GitHub Desktop.
Save vssun/728795 to your computer and use it in GitHub Desktop.
Removes deleted images from pages on the Malayalam Wikipedia (mlwikipedia). The list of images to check must be supplied in "pagelist.txt".
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
നീക്കം ചെയ്ത പ്രമാണങ്ങൾ താളുകളിൽ നിന്നും ഒഴിവാക്കുന്നതിന്
നിർമ്മിച്ചത്: സുനിൽ വി.എസ്.
തിയതി: 2010-11-27
"""
import wikipedia
import pagegenerators
import unicodedata
import codecs
import re
import pywikibot
# Namespace prefixes under which an image may be referenced in wikitext.
# Both capitalisations of the English prefixes are listed because the
# search done with them is case-sensitive; the last two entries are the
# Malayalam prefixes. Plain u'' literals are used instead of ur'' (which
# is a syntax error on Python 3); the values are identical because none
# of them contains a backslash.
imageNamespaces = [u'file', u'File', u'image', u'Image', u'ചിത്രം', u'പ്രമാണം']
def getGenericText(pText):
    """Return pText with every space character turned into an underscore.

    Only the plain space (' ') is affected; tabs, newlines and other
    whitespace are left as they are. The result always has the same
    length as the input, so character offsets are valid in both strings.
    """
    myPieces = pText.split(' ')
    return '_'.join(myPieces)
def _findImageLinkEnd(pText, pStart):
    """Return the index just past the ']]' that closes an image link.

    Scanning starts at pStart, which must lie inside a link whose opening
    '[[' has already been consumed. Nested '[...]' / '[[...]]' pairs in the
    caption are balanced out. Returns -1 when the link is never closed.
    """
    myDepth = 0  # opening brackets minus closing brackets seen so far
    for myIndex in range(pStart, len(pText)):
        myChar = pText[myIndex]
        if myChar == '[':
            myDepth += 1
        elif myChar == ']':
            myDepth -= 1
            # Two more closers than openers: this is the link's own ']]'.
            if myDepth == -2:
                return myIndex + 1
    return -1


def hideImageText(pText, pImageText, pOccurances):
    """Comment out usages of an image in wikitext and return the new text.

    pText       -- wikitext of the page.
    pImageText  -- image reference to look for, e.g. 'File:Some_name.jpg',
                   already normalised with getGenericText (underscores
                   instead of spaces).
    pOccurances -- maximum number of occurrences to process.

    Two kinds of usage are handled:
      * a gallery line (the reference starts a line): the whole line is hidden;
      * an inline inclusion '[[File:...]]': the whole link, including any
        nested links in its caption, is hidden.
    Each hidden span is replaced by '<!--' + span + '\\n-->\\n'. Any other
    kind of usage (for example a template parameter) is left untouched.
    """
    if pOccurances <= 0 or not pImageText:
        return pText

    # Offset from which the next search starts. It is moved past every
    # occurrence already handled; otherwise the reference that now sits
    # inside the '<!-- -->' comment would be found again and again.
    mySearchFrom = 0
    for _ in range(pOccurances):
        # getGenericText() preserves length, so offsets found in the
        # normalised text can be used directly on pText.
        myLocation = getGenericText(pText).find(pImageText, mySearchFrom)
        if myLocation < 0:
            break

        myStartLocation = -1
        myEndLocation = -1
        #If it is from gallery
        if myLocation == 0 or pText[myLocation - 1] == '\n':
            wikipedia.output('Removing from Gallery')
            myStartLocation = myLocation
            myEndLocation = pText.find('\n', myLocation + 1)
            if myEndLocation < 0:
                # Last line of the page without a trailing newline.
                myEndLocation = len(pText)
        #If it is image inclusion
        elif pText[max(0, myLocation - 2):myLocation] == '[[':
            wikipedia.output ('Removing image')
            myStartLocation = myLocation - 2
            myEndLocation = _findImageLinkEnd(pText, myLocation)
            if myEndLocation >= 0:
                wikipedia.output(pText[myStartLocation:myEndLocation])

        if myEndLocation < 0:
            # Unknown kind of usage or an unclosed link: do not guess,
            # leave the text alone and continue after this occurrence.
            wikipedia.output('Skipping an occurrence that is neither a gallery line nor a closed image link')
            mySearchFrom = myLocation + len(pImageText)
            continue

        myString = pText[myStartLocation:myEndLocation]
        myReplacement = '<!--' + myString + '\n-->\n'
        # The replacement already ends with a newline, so swallow one
        # directly following newline -- but never any other character.
        myTailStart = myEndLocation
        if pText[myEndLocation:myEndLocation + 1] == '\n':
            myTailStart = myEndLocation + 1
        pText = pText[:myStartLocation] + myReplacement + pText[myTailStart:]
        mySearchFrom = myStartLocation + len(myReplacement)
    return pText
#main program starts here
def _hideImageInText(pText, pImageTitle):
    """Return pText with the usages of pImageTitle hidden.

    The title is looked for under every prefix in imageNamespaces, both as
    given and with its first letter lower-cased, and each hit is handed to
    hideImageText.
    """
    for myImageNamespace in imageNamespaces:
        mySearchStringNormal = getGenericText(myImageNamespace + ':' + pImageTitle)
        #first letter might be lower
        mySearchStringLower = getGenericText(
            myImageNamespace + ':' + pImageTitle[:1].lower() + pImageTitle[1:])
        wikipedia.output('Checking for ' + mySearchStringNormal + ' or ' + mySearchStringLower)
        myGenericText = getGenericText(pText)
        myOccuranceNormal = myGenericText.count(mySearchStringNormal)
        myOccuranceLower = 0
        if mySearchStringNormal != mySearchStringLower:
            myOccuranceLower = myGenericText.count(mySearchStringLower)
        if myOccuranceNormal > 0:
            pText = hideImageText(pText, mySearchStringNormal, myOccuranceNormal)
        if myOccuranceLower > 0:
            pText = hideImageText(pText, mySearchStringLower, myOccuranceLower)
    return pText


def _removeFromLinkedPages(pImage):
    """Hide pImage in every main-namespace page that uses it and save the page."""
    myImageTitle = pImage.titleWithoutNamespace()
    for myLinkedPage in pImage.usingPages():
        if myLinkedPage.namespace() != 0:
            continue #skip if not main namespace
        wikipedia.output('Removing from ' + myLinkedPage.title())
        myText = myLinkedPage.get()
        myNewText = _hideImageInText(myText, myImageTitle)
        if myNewText == myText:
            # Nothing was hidden, so there is nothing to save.
            wikipedia.output('No removable usage found, page left unchanged')
            continue
        # The summary uses the file name as is; calling .title() on the
        # string would title-case it ("foo bar.jpg" -> "Foo Bar.Jpg").
        myLinkedPage.put(myNewText, comment=u'യന്ത്രം: നിലവിലില്ലാത്ത ' + myImageTitle + u' എന്ന ചിത്രം ഒഴിവാക്കുന്നു')


def _processPage(pPage):
    """Check one entry of the page list and clean up after it if the file is gone."""
    myImage = wikipedia.ImagePage(pPage.site(), pPage.titleWithoutNamespace())
    wikipedia.output('\n' + myImage.title())
    if myImage.exists():
        wikipedia.output('File exists')
        return
    try:
        if myImage.fileIsOnCommons():
            wikipedia.output('File is on Commons')
    except pywikibot.NoPage:
        # NOTE(review): NoPage here is taken to mean the file exists neither
        # locally nor on Commons -- confirm against the pywikipedia version in use.
        wikipedia.output('File does not exists, removing from linked pages')
        _removeFromLinkedPages(myImage)


siteFamily = 'wikipedia'
siteLangCode = 'ml'
wikiSite = wikipedia.Site(code=siteLangCode, fam=siteFamily)
try:
    # The 'logs' directory must already exist.
    log = codecs.open('logs/remove-deleted-images.log', mode='at', encoding = 'utf-8')
    try:
        for myPage in pagegenerators.TextfilePageGenerator('pagelist.txt'):
            _processPage(myPage)
    finally:
        # Close the log even when a page raises.
        log.flush()
        log.close()
finally:
    # Always tell the framework that this bot has finished.
    wikipedia.stopme()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment