Skip to content

Instantly share code, notes, and snippets.

@erictleung
Created January 30, 2024 04:01
Show Gist options
  • Save erictleung/18ba6e8160ac8f8a51bce55bdb0e70f5 to your computer and use it in GitHub Desktop.
Save erictleung/18ba6e8160ac8f8a51bce55bdb0e70f5 to your computer and use it in GitHub Desktop.
Help audit, remove, and update musician infoboxes on Wikipedia
#!/usr/bin/env python3
"""
Help audit, remove, and update musician infoboxes.
https://en.wikipedia.org/wiki/Category:Pages_using_infobox_musical_artist_with_associated_acts
"""
import re
import webbrowser
from urllib.parse import quote

import mwparserfromhell
import requests
from bs4 import BeautifulSoup as bs
# pylint: disable=line-too-long
# Download the category page that lists articles whose musician infobox uses
# the ``associated_acts`` parameter.
response = requests.get(
    url="https://en.wikipedia.org/wiki/Category:Pages_using_infobox_musical_artist_with_associated_acts",  # noqa: E501
    timeout=10,
)
# Fail loudly on an HTTP error instead of scraping an error page.
response.raise_for_status()
soup = bs(response.content, "html.parser")

# Extract only links from the "Pages in category" listing and not unrelated
# wiki links.  Collect the links of every ``mw-category-group`` div rather
# than only the first one that ``find`` would return.
# NOTE(review): assumes MediaWiki wraps the page listing in an element with
# id "mw-pages" -- confirm; when it is absent the whole document is searched.
listing = soup.find(id="mw-pages")
if listing is None:
    listing = soup
all_pages = []
for group in listing.find_all("div", class_="mw-category-group"):
    all_pages.extend(group.find_all("a"))

# Extract text that can be inserted into a Wikipedia URL to get the page.
# Everything after "/wiki/" (up to any query string or fragment) is captured
# so titles containing characters such as "-", ".", "'" or "/" stay whole
# instead of being cut off at the first unexpected character.
music_pages = []
p = re.compile(r"^/wiki/([^?#]+)")
for link in all_pages:
    href = link.get("href")
    if href is None:
        # Anchor without a target; nothing to extract.
        continue
    found = p.match(href)
    if found is not None:
        music_pages.append(found.group(1))
print(f"Parsed {len(music_pages)} pages.")
# Setup to extract the raw Wikitext
wiki_base = "https://en.wikipedia.org/w/index.php?title="
wiki_end = "&action=raw&ctype=text"

# Only the first parsed page is fetched here; stop with a readable message
# instead of an IndexError when the category yielded no pages.
if not music_pages:
    raise SystemExit("No pages were parsed from the category; nothing to audit.")

# The title is concatenated into the URL unchanged.
# NOTE(review): presumably it is already percent-encoded because it was taken
# from an href -- confirm before switching to ``params=``.
search = wiki_base + music_pages[0] + wiki_end
print(f"Searching for {search}")
response = requests.get(search, timeout=10)
# Avoid parsing an HTTP error body as if it were wikitext.
response.raise_for_status()
wikicode = mwparserfromhell.parse(response.content)
# Pseudocode:
# Loop through initial pages
# Extract associated_acts value
# Extract Wikilinks and find their pages
# Go and extract past members and former members from Infobox
# Open up page
# Print out copy-paste information for new values of:
# - current_member_of=
# - past_member_of=
# - spinoff_of=
# - spinoffs=
# Pause to make manual changes
# Walk every template in the fetched wikitext and, for the musician infobox,
# print the wikilinks found in its ``associated_acts`` parameter and open
# each linked article in the browser.
for template in wikicode.filter_templates():
    if not template.name.matches("Infobox musical artist"):
        continue
    # ``Template.get`` raises ValueError for a missing parameter, so skip
    # infoboxes that do not carry ``associated_acts``.
    if not template.has("associated_acts"):
        continue
    print("Getting the associated_acts= values:")
    ac = template.get("associated_acts").value.filter_wikilinks()
    print(ac)
    for link in ac:
        # Wikilink titles are written with spaces; page URLs use underscores.
        title = str(link.title).strip().replace(" ", "_")
        # ``webbrowser.open`` needs the URL to load.
        # NOTE(review): opening the linked act's article is an assumption
        # about the intended "Open up page" step -- confirm.
        webbrowser.open("https://en.wikipedia.org/wiki/" + quote(title, safe="/:"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment