Skip to content

Instantly share code, notes, and snippets.

@stuartlangridge
Created February 20, 2021 19:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stuartlangridge/bdcffe7201e12d0a9521b4cf89b7c1e0 to your computer and use it in GitHub Desktop.
Save stuartlangridge/bdcffe7201e12d0a9521b4cf89b7c1e0 to your computer and use it in GitHub Desktop.
Pub quiz question: which country has produced the most million-selling popstars per square km? https://twitter.com/dsquareddigest/status/1363092145301827586
#!/usr/bin/env python3
import requests
import requests_cache
import urllib.parse
import os.path
# Install a requests_cache cache named "million_sellers", located in the same
# directory as this script, before any HTTP request is made below.
# NOTE(review): presumably this is here so that re-running the script does not
# re-query Wikidata for every artist -- confirm against the requests_cache docs.
requests_cache.install_cache(os.path.join(os.path.dirname(__file__), 'million_sellers'))
# Artist credits used as the input data set, presumably transcribed from the
# Official Charts page linked below (TODO confirm the source and its date).
# The same credit appears more than once in this list (for example
# "THE BEATLES"), and collaborations are kept as one string such as
# "X FT Y" or "X FEATURING Y".
#https://www.officialcharts.com/chart-news/the-best-selling-singles-of-all-time-on-the-official-uk-chart__21298/
artists = \
["ELTON JOHN", "BAND AID", "QUEEN", "WINGS", "JOHN TRAVOLTA & OLIVIA NEWTON-JOHN",
"FRANKIE GOES TO HOLLYWOOD", "BONEY M", "PHARRELL WILLIAMS", "THE BEATLES",
"WHAM!", "WET WET WET", "BONEY M", "STEVIE WONDER", "ROBSON GREEN & JEROME FLYNN",
"BRYAN ADAMS", "AQUA", "CHER", "THE BEATLES", "WILL YOUNG", "JOHN LENNON",
"MARK RONSON FT BRUNO MARS", "SURVIVOR", "ROBIN THICKE/TI/PHARRELL",
"PUFF DADDY & FAITH EVANS", "ADELE", "WHITNEY HOUSTON", "HUMAN LEAGUE",
"JOHN TRAVOLTA & OLIVIA NEWTON-JOHN", "BADDIEL & SKINNER & LIGHTNING SEEDS",
"FRANKIE GOES TO HOLLYWOOD", "COOLIO FEATURING L.V.", "CELINE DION",
"BRITNEY SPEARS", "VARIOUS ARTISTS", "THE BEATLES",
"MAROON 5 FEATURING CHRISTINA AGUILERA", "GEORGE MICHAEL", "CULTURE CLUB",
"OASIS", "GOTYE FEATURING KIMBRA", "KEN DODD", "VILLAGE PEOPLE", "AVICII",
"BLACK EYED PEAS", "DAFT PUNK FT PHARRELL WILLIAMS", "FUGEES", "JENNIFER RUSH",
"DEXY'S MIDNIGHT RUNNERS", "BILL HALEY & HIS COMETS", "SHAGGY FEATURING RIKROK",
"RIHANNA FEATURING CALVIN HARRIS", "THE SEEKERS", "THE BEATLES",
"KINGS OF LEON", "THE BEATLES", "SPICE GIRLS", "ENGELBERT HUMPERDINCK",
"CELINE DION", "ALL SAINTS", "SOFT CELL", "CARLY RAE JEPSEN",
"TONY CHRISTIE FT PETER KAY", "BRUNO MARS", "GARETH GATES",
"RUN-D.M.C. VS JASON NEVINS", "BLONDIE", "DAVID GUETTA FT SIA",
"ALEXANDRA BURKE", "JAMES ARTHUR", "SLADE", "PSY", "JOHN LEGEND",
"CLEAN BANDIT FT JESS GLYNNE", "LMFAO FEATURING LAUREN BENNETT AND GOONROCK",
"EMINEM FEATURING RIHANNA", "ELVIS PRESLEY", "POGUES FT KIRSTY MACCOLL",
"TOM JONES", "NEW ORDER", "PASSENGER", "ED SHEERAN", "PAUL ANKA",
"ROBBIE WILLIAMS", "MARIAH CAREY", "JESSIE J FEATURING B.O.B",
"ART GARFUNKEL", "STEPS",
"MICHAEL JACKSON", "NATALIE IMBRUGLIA", "AEROSMITH", "KYLIE MINOGUE",
"WHIGFIELD", "SNOW PATROL", "LADY GAGA", "MR ACKER BILK AND HIS PARAMOUNT JAZZ BAND",
"HARRY BELAFONTE", "ENGELBERT HUMPERDINCK", "BAND AID 20",
"RAY PARKER JR.", "RIHANNA", "DAVID SOUL", "BABYLON ZOO", "BOYZONE",
"PINK FLOYD", "THE BEATLES", "FUN FT JANELLE MONAE", "ABBA",
"ONEREPUBLIC", "GARY GLITTER", "SPICE GIRLS", "GNARLS BARKLEY",
"EIFFEL 65", "DONNA SUMMER", "KATY PERRY", "IAN DURY AND THE BLOCKHEADS",
"ED SHEERAN", "JOURNEY", "KINGS OF LEON", "TAKE THAT", "ROBSON & JEROME",
"LEONA LEWIS", "SHAYNE WARD", "RIHANNA", "EMINEM",
"THE RIGHTEOUS BROTHERS", "KATY PERRY", "TELETUBBIES", "FRANK IFIELD",
"ADELE", "HEAR'SAY", "BLACK BOX", "GLORIA GAYNOR", "TAKE THAT",
"IRENE CARA", "UB40", "NAUGHTY BOY FT SAM SMITH", "BILLY JOEL",
"ROD STEWART", "CLIFF RICHARD & THE SHADOWS", "SWEDISH HOUSE MAFIA/MARTIN",
"ADAM AND THE ANTS", "PITBULL FEATURING NE-YO, AFROJACK AND NAYER",
"LADY GAGA",
"BING CROSBY WITH THE KEN DARBY SINGERS AND JOHN SCOTT TROTTER ORCHESTRA",
"TIGHT FIT", "PETER ANDRE FEATURING BUBBLER RANX", "ENRIQUE IGLESIAS",
"THE ARCHIES", "CHERYL COLE", "ELTON JOHN & KIKI DEE", "LOU BEGA", "BLACK EYED PEAS",
"JULIE COVINGTON", "CHRISTINA PERRI", "KILLERS", "BOB THE BUILDER", "BROTHERHOOD OF MAN",
"ATOMIC KITTEN", "NO DOUBT", "GERRY & THE PACEMAKERS", "NICKI MINAJ", "ADELE",
"ONE DIRECTION", "OASIS", "MATT CARDLE", "THE NEW SEEKERS", "SHOWADDYWADDY",
"DAWN FEATURING TONY ORLANDO", "RICK ASTLEY", "KYLIE MINOGUE & JASON DONOVAN",
"COLDPLAY", "BRUNO MARS", "THE SIMON PARK ORCHESTRA", "CEE LO GREEN", "FRANK SINATRA",
"LITTLE JIMMY OSMOND", "LMFAO", "GOO GOO DOLLS"]
# Words that separate the lead artist from guest artists in a credit.
# Only the text before " <word> " is kept when a credit is looked up.
SPLITTERS = [
    "FT",
    "FEATURING",
]

# Wikidata item ids accepted as the "instance of" (P31) value of a search
# hit, mapped to a human-readable label. A hit whose first P31 value is not
# one of these keys is rejected as not being a performer.
KINDS = {
    "Q5741069": "rock group",
    "Q5": "person",
    "Q215380": "musical group",
    "Q9212979": "musical duo",
    "Q641066": "girl group",
    "Q7623897": "all-female band",
    "Q71129815": "electronica duo",
    "Q216337": "boy band",
    "Q6619719": "fictional musical group",
}
def _fetch_entity(entityid):
    """Download one Wikidata entity document and return its entity dict.

    Returns None when the response does not contain an entry keyed by
    ``entityid`` (for example when the id is a redirect and the data is
    served under the target id).
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{entityid}.json"
    return requests.get(url).json().get("entities", {}).get(entityid)


def _claim_entity_id(claims, prop, index=0):
    """Return the item id stored in ``claims[prop][index]``, or None.

    None is returned when the property is absent, the statement has no
    ``datavalue`` (Wikidata "unknown value" / "no value" snaks), or the
    value is not an item reference.
    """
    try:
        return claims[prop][index]["mainsnak"]["datavalue"]["value"]["id"]
    except (KeyError, IndexError, TypeError):
        return None


def wikidata(q):
    """Resolve an artist name to its country and that country's area.

    The name is searched on Wikidata; the first hit is used unless its
    snippet mentions "album" or "department", in which case the second hit
    is used, or the third if the second snippet also mentions "album".
    The chosen entity must be an instance (P31) of one of ``KINDS``.

    The country is taken from, in order of preference: country of
    citizenship (P27), country of origin (P495), or the last country (P17)
    listed for the location of formation (P740).

    Args:
        q: Artist name to search for.

    Returns:
        ``(q, country_name, area)`` where ``country_name`` is the country's
        English label and ``area`` is the first P2046 amount truncated to an
        int, or None when any step of the lookup cannot be completed.
    """
    qs = urllib.parse.urlencode({
        "action": "query",
        "list": "search",
        "srsearch": q,
        "format": "json"
    })
    # Parse the search response once instead of on every access.
    hits = requests.get(f"https://www.wikidata.org/w/api.php?{qs}").json()["query"]["search"]
    if not hits:
        return None

    # Skip over hits that look like an album (or a "department") rather
    # than the performer.
    pick = 0
    if "album" in hits[0]["snippet"] or "department" in hits[0]["snippet"]:
        pick = 1
        if len(hits) > 1 and "album" in hits[1]["snippet"]:
            pick = 2
    if pick >= len(hits):
        # Not enough search results to fall back on.
        return None
    entityid = hits[pick]["title"]

    entity = _fetch_entity(entityid)
    if entity is None:
        return None
    claims = entity.get("claims", {})
    if _claim_entity_id(claims, "P31") not in KINDS:
        return None

    country_entityid = (
        _claim_entity_id(claims, "P27") or _claim_entity_id(claims, "P495")
    )
    if country_entityid is None:
        location_entityid = _claim_entity_id(claims, "P740")
        if location_entityid is not None:
            location = _fetch_entity(location_entityid)
            if location is not None:
                # The last P17 statement is used, as in the original code.
                # NOTE(review): presumably the most recent country for
                # places that changed hands -- confirm.
                country_entityid = _claim_entity_id(
                    location.get("claims", {}), "P17", -1
                )
    if country_entityid is None:
        # No usable country information; previously this raised
        # UnboundLocalError.
        return None

    country = _fetch_entity(country_entityid)
    if country is None:
        return None
    try:
        name = country["labels"]["en"]["value"]
        amount = country["claims"]["P2046"][0]["mainsnak"]["datavalue"]["value"]["amount"]
        # Amounts are strings such as "+242495"; float() accepts the sign.
        # NOTE(review): the unit of the quantity is not checked, so the
        # figure is only km^2 if Wikidata stores it that way -- confirm.
        area = int(float(amount))
    except (KeyError, IndexError, TypeError, ValueError):
        return None
    return (q, name, area)
def lead_artist(credit, splitters):
    """Return the part of ``credit`` before any " <splitter> " separator.

    Each splitter is applied in turn, so "A FT B" with splitter "FT"
    becomes "A"; a credit without any splitter is returned unchanged.
    """
    for item in splitters:
        credit = credit.split(f" {item} ")[0]
    return credit


def group_by_country(credits, lookup, splitters):
    """Group distinct lead artists by the country ``lookup`` reports.

    Args:
        credits: Iterable of artist credit strings.
        lookup: Callable taking an artist name and returning
            ``(artist, country, area)`` or a falsy value when unknown.
        splitters: Separator words passed to ``lead_artist``.

    Returns:
        Dict mapping country name to ``{"area": area, "artists": [...]}``.
        Each lead artist is looked up and counted once, even when the
        credit list repeats it or several credits reduce to the same name.
    """
    by_country = {}
    seen = set()
    for credit in credits:
        name = lead_artist(credit, splitters)
        if name in seen:
            # One artist with several entries must not inflate the count.
            continue
        seen.add(name)
        res = lookup(name)
        if not res:
            continue
        found, country, area = res
        entry = by_country.setdefault(country, {"area": area, "artists": []})
        entry["artists"].append(found)
    return by_country


def rank_countries(by_country):
    """Return ``(country, artists, area, density)`` rows, densest first.

    ``density`` is the number of artists per million units of area and is
    also stored on each entry as ``"artist_per_m"``. Countries whose area is
    not positive are left out because no density can be computed for them.
    """
    results = []
    for country, entry in by_country.items():
        if entry["area"] <= 0:
            continue
        density = 1000000.0 * len(entry["artists"]) / entry["area"]
        entry["artist_per_m"] = density
        results.append((country, entry["artists"], entry["area"], density))
    results.sort(key=lambda row: row[3], reverse=True)
    return results


def main():
    """Look up every artist and print the countries ranked by density."""
    import sys

    by_country = group_by_country(artists, wikidata, SPLITTERS)
    for country, entry in by_country.items():
        if entry["area"] <= 0:
            print(f"skipping {country}: area is {entry['area']}", file=sys.stderr)
    # The loop variable is deliberately not called "artists" so the
    # module-level input list is not rebound.
    for country, names, _area, artists_per_million_km in rank_countries(by_country):
        print(f"\033[1m{country}\033[0m ({artists_per_million_km:.2f} artists per mkm²)")
        if len(names) > 5:
            print(len(names), "artists")
        else:
            # Sorted so the output is the same on every run.
            print(", ".join(x.title() for x in sorted(names)))


if __name__ == "__main__":
    main()
@stuartlangridge
Copy link
Author

stuartlangridge commented Feb 20, 2021

This makes a bunch of very dubious assumptions -- artists called "FRED FEATURING BLOGGS" are treated as just "FRED", Wikidata's "place of formation" and "country of origin" entries are correct, anyone without a Wikidata entry under the band name doesn't exist, which penalises duos a bit, and so on -- but hey, now you, gentle reader, have the script and can tweak it to your heart's content to make it rigorous (or at least to show some semblance of rigour!)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment