Skip to content

Instantly share code, notes, and snippets.

@napsternxg
Created September 29, 2020 14:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save napsternxg/22b623a31487c0fc9a26babeb38cdf11 to your computer and use it in GitHub Desktop.
Save napsternxg/22b623a31487c0fc9a26babeb38cdf11 to your computer and use it in GitHub Desktop.
Get humans with sex or gender, ethnic group, and image from Wikidata: https://w.wiki/eLe
from pathlib import Path
import shutil
import requests
# Binding names that every result row must provide; parse_row raises
# KeyError if one of them is missing.
REQUIRED_COLS = [
    'human',
    'image',
    'sex_or_gender',
    'ethnic_group',
    'date_of_birth',
    'occupation',
    'loc_aid',
]


def parse_row(row):
    """Flatten one SPARQL JSON result binding into a plain dict.

    For every name in ``REQUIRED_COLS`` the binding's ``"value"`` is copied
    into the result. Bindings whose ``"type"`` is ``"uri"`` are shortened to
    the text after the last ``/`` (for an entity URI that is the bare id).

    The download URL is taken from ``row["url"]`` when that key exists
    (either a plain string or a ``{"value": ...}`` binding). When it does
    not -- the SPARQL query kept alongside this script projects no ``?url``
    variable -- the untrimmed value of the ``image`` binding is used.

    Args:
        row: Mapping of binding name to ``{"type": ..., "value": ...}``.

    Returns:
        dict with one entry per required column plus ``"url"`` (where to
        download the image from) and ``"local_path"`` (``<human id>`` plus
        the file extension found at the end of the URL).

    Raises:
        KeyError: If a required column is missing from ``row``.
    """
    data = {}
    for c in REQUIRED_COLS:
        value = row[c]["value"]
        if row[c]["type"] == "uri":
            # Keep only the trailing path segment of the URI.
            value = value.rsplit("/", 1)[-1]
        data[c] = value

    # Prefer an explicit "url" entry; otherwise fall back to the full image
    # URI, which would otherwise raise KeyError on raw query results.
    url = row["url"] if "url" in row else row["image"]
    if isinstance(url, dict):
        url = url["value"]

    extension = Path(url.rsplit('/', 1)[-1]).suffix
    local_path = f"{data['human']}{extension}"
    data["url"] = url
    data["local_path"] = local_path
    return data
# Directory that downloaded images are written to.
OUTPUT_DIR = Path("./data/wikidata_gender_ethnicity_images/")


def download_image(data, *, timeout=30):
    """Download ``data["url"]`` to ``OUTPUT_DIR / data["local_path"]``.

    Nothing is fetched when the target file already exists. The body is
    streamed into a sibling ``.part`` file and renamed only after the copy
    has finished, so an interrupted download never leaves a truncated file
    that a later run would mistake for a finished one.

    Args:
        data: Mapping with ``"url"`` and ``"local_path"`` entries, as
            produced by ``parse_row``.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the run forever.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the output file cannot be written.
    """
    url = data['url']
    local_path = OUTPUT_DIR / data["local_path"]
    if local_path.exists():
        print(f"{url} already saved at {local_path}")
        return

    # The output directory may not exist yet on a first run.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = local_path.with_name(local_path.name + ".part")

    # Code based on https://stackoverflow.com/a/13137873
    # The with-block releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print(f"Skipping {url}: HTTP status {r.status_code}")
            return
        # Let urllib3 undo gzip/deflate transfer encoding while copying.
        r.raw.decode_content = True
        with open(partial_path, 'wb') as fp:
            shutil.copyfileobj(r.raw, fp)

    # Publish the file under its final name only once it is complete.
    partial_path.replace(local_path)
    print(f"Saved {url} to {local_path}")
if __name__ == "__main__":
    # Walk the query results in order: parse one binding, then fetch its
    # image, before moving on to the next binding.
    # NOTE(review): wikidata_data is not defined in the visible part of this
    # file; presumably it is the decoded JSON response of the SPARQL query.
    bindings = wikidata_data["results"]["bindings"]
    for record in map(parse_row, bindings):
        download_image(record)
import pandas as pd

# One flattened record per result binding.
records = [parse_row(row) for row in wikidata_data["results"]["bindings"]]
df = pd.DataFrame(records)

# Mark which rows already have their image on disk under OUTPUT_DIR.
df["file_exists"] = [(OUTPUT_DIR / name).exists() for name in df["local_path"]]

# Number of rows per ethnic-group id.
print(df["ethnic_group"].value_counts())
#defaultView:ImageGrid
# Returns at most 5000 humans, one row each (see GROUP BY ?human below),
# together with an image, sex or gender, ethnic group, date of birth,
# occupation and the value of P244 (bound to ?loc_aid).
# Every column except ?human is wrapped in SAMPLE(), so when a human has
# several values for a property a single one of them is reported.
SELECT ?human
(SAMPLE(?humanLabel) AS ?humanLabel)
(SAMPLE(?image) AS ?image)
(SAMPLE(?sex_or_gender) AS ?sex_or_gender)
(SAMPLE(?sex_or_genderLabel) AS ?sex_or_genderLabel)
(SAMPLE(?ethnic_group) AS ?ethnic_group)
(SAMPLE(?ethnic_groupLabel) AS ?ethnic_groupLabel)
(SAMPLE(?date_of_birth) AS ?date_of_birth)
(SAMPLE(?occupation) AS ?occupation)
(SAMPLE(?occupationLabel) AS ?occupationLabel)
(SAMPLE(?loc_aid) AS ?loc_aid)
WHERE {
# Label service: supplies the ?...Label variables, using the language list
# "[AUTO_LANGUAGE],en".
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
# Required statements: a match must be an instance (P31) of Q5 and have a
# value for each of the properties listed here.
?human wdt:P31 wd:Q5;
wdt:P18 ?image;
wdt:P21 ?sex_or_gender;
wdt:P172 ?ethnic_group;
wdt:P569 ?date_of_birth;
wdt:P244 ?loc_aid
# Occupation (P106) is declared optional ...
OPTIONAL { ?human wdt:P106 ?occupation. }
# Keep only people born after 1 January 1950.
FILTER(?date_of_birth > "1950-01-01"^^xsd:dateTime)
# Restrict to three specific ethnic-group items and two sex-or-gender items.
# NOTE(review): presumably the latter two are "male" and "female" -- confirm
# the labels of all listed Q-ids on Wikidata.
FILTER(?ethnic_group IN(wd:Q49078, wd:Q49085, wd:Q58669))
FILTER(?sex_or_gender IN(wd:Q6581097, wd:Q6581072))
# NOTE(review): ... but this filter tests ?occupation directly. When the
# OPTIONAL did not bind it the comparison is an error and the row is dropped,
# so in practice only humans that have an occupation are returned. The test
# is also applied per occupation value: a human holding wd:Q488111 plus some
# other occupation still matches through the other value. If the intent is
# to exclude such humans entirely, FILTER NOT EXISTS may be what is wanted
# -- confirm.
FILTER(?occupation NOT IN(wd:Q488111))
}
GROUP BY ?human
LIMIT 5000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment