@DerfJagged
Created November 1, 2023 02:10
MediaWiki Delete Duplicate Files Script
#!/usr/bin/python3
import requests
import mwclient

################### CONFIGURE ME ###################
site = mwclient.Site('YOUR_WEBSITE.com', path='/wiki/')
site.login(username='USERNAME_HERE', password='PASSWORD_HERE')
URL = "https://YOUR_WEBSITE.com/wiki/api.php"
####################################################

S = requests.Session()

# Ask the API for the pages listed on Special:ListDuplicatedFiles
PARAMS = {
    "action": "query",
    "list": "querypage",
    "qppage": "ListDuplicatedFiles",
    "qplimit": "100",  # 500 max
    "format": "json"
}
R = S.get(url=URL, params=PARAMS)
pages_with_duplicates_json = R.json()

# Build a pipe-separated list of titles to query for their duplicates
title_list = ""
for result in pages_with_duplicates_json['query']['querypage']['results']:
    title = result['title']
    ### Remove the following two lines (and dedent the line below them) to proceed without asking
    print(f"Replace {title}? [Y/n]")
    if input("").strip().lower() in ('', 'y'):
        title_list += result['title'] + "|"
title_list = title_list[:-1]

if title_list == "":
    print("No pages with duplicates found.")
    quit()

print("\n" + title_list + "\n")

# Fetch the list of duplicates for each selected file
PARAMS = {
    "action": "query",
    "titles": title_list,
    "prop": "duplicatefiles",
    "format": "json"
}
R = S.get(url=URL, params=PARAMS)
data = R.json()

# Uncomment to see pages that will be replaced beforehand
#print(data)
#input("Press Enter to continue...")

# Loop through the JSON data and process duplicate files
for page_id, page_data in data['query']['pages'].items():
    original_file = page_data['title']
    try:
        duplicate_files = page_data['duplicatefiles']
    except KeyError:
        print("No more duplicate files found")
        quit()
    for duplicate_file in duplicate_files:
        duplicate_name = "File:" + duplicate_file['name']
        # Get a page object for the duplicate file
        duplicate_page = site.pages[duplicate_name]
        # Check if the duplicate file exists
        if duplicate_page.exists:
            # Delete the duplicate file
            duplicate_page.delete(reason='Duplicate file - redirecting to original')
            # Recreate the deleted page as a redirect to the original file
            redirect_text = f"#REDIRECT [[{original_file}]]"
            duplicate_page.save(redirect_text, summary='Redirecting duplicate file to original')
            print(f"Redirected {duplicate_name} to {original_file} and deleted the duplicate.")
        else:
            print(f"{duplicate_name} doesn't exist.")
@DerfJagged (Author):
This script locates duplicate files (pages in the File: namespace), deletes them, and then recreates each deleted page as a redirect to the original file.
By default it asks for confirmation on each file; remove the two confirmation-prompt lines marked in the script (and dedent the line below them) to have it delete all of them without asking.
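Note that the script's first query only fetches up to qplimit results (500 at most per request). For wikis with more duplicates than that, a rough, untested sketch of how the first query could be looped using the MediaWiki API's standard continuation mechanism (the 'continue'/'qpoffset' values returned by list=querypage) is shown below; it reuses the S, URL, and PARAMS names from the script and is an assumption about how you might extend it, not part of the original.

# Sketch: page through all ListDuplicatedFiles batches via API continuation
results = []
params = dict(PARAMS)                # copy of the first query's parameters
while True:
    resp = S.get(url=URL, params=params).json()
    results.extend(resp['query']['querypage']['results'])
    cont = resp.get('continue')
    if not cont:
        break                        # no more batches
    params.update(cont)              # carries 'qpoffset' (and 'continue') forward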
