Get humans with sex or gender, ethnic group, and image from Wikidata: https://w.wiki/eLe
from pathlib import Path | |
import shutil | |
import requests | |
# Names of the query-result columns that parse_row copies out of every row.
REQUIRED_COLS = [
    'human',
    'image',
    'sex_or_gender',
    'ethnic_group',
    'date_of_birth',
    'occupation',
    'loc_aid',
]
def parse_row(row, columns=None):
    """Flatten one query-result row into a plain dict.

    For every requested column the binding's ``"value"`` is copied; when the
    binding's ``"type"`` is ``"uri"`` only the part after the last ``/`` is
    kept. Two extra keys are added: ``"url"`` (where the image is fetched
    from) and ``"local_path"`` (``<human><extension of the URL's file name>``).

    Args:
        row: Mapping of column name to a ``{"type": ..., "value": ...}``
            binding. It may also carry a ready-made ``"url"`` string.
        columns: Column names to copy; defaults to the module-level
            ``REQUIRED_COLS``. Must include ``"human"``.

    Returns:
        dict with one entry per column plus ``"url"`` and ``"local_path"``.

    Raises:
        KeyError: if a requested column is missing from *row*, or if *row*
            has neither a ``"url"`` nor an ``"image"`` entry.
    """
    if columns is None:
        columns = REQUIRED_COLS

    data = {}
    for column in columns:
        binding = row[column]
        value = binding["value"]
        if binding["type"] == "uri":
            # Keep only the trailing path segment of the URI.
            value = value.rsplit("/", 1)[-1]
        data[column] = value

    # Rows that carry no explicit "url" entry would otherwise raise KeyError;
    # fall back to the full, untrimmed value of the image binding.
    url = row["url"] if "url" in row else row["image"]["value"]
    extension = Path(url.rsplit("/", 1)[-1]).suffix

    data["url"] = url
    data["local_path"] = f"{data['human']}{extension}"
    return data
# Directory that download_image saves images into and checks for existing files.
OUTPUT_DIR = Path("./data/wikidata_gender_ethnicity_images/")
def download_image(data, output_dir=None, timeout=30):
    """Download the image described by *data* unless it is already on disk.

    Args:
        data: Mapping with a ``"url"`` entry (where to fetch the image) and a
            ``"local_path"`` entry (file name relative to the output
            directory).
        output_dir: Directory to save into; defaults to the module-level
            ``OUTPUT_DIR``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.RequestException: if the request itself fails (connection
            error, timeout, ...). Non-200 responses are reported and skipped
            rather than raised.
    """
    url = data['url']
    base_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
    local_path = base_dir / data["local_path"]
    if local_path.exists():
        print(f"{url} already saved at {local_path}")
        return

    # The target directory may not exist yet on a fresh checkout.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a temporary name first: an interrupted download must not
    # leave a truncated file that the exists() check above would later accept.
    partial_path = local_path.with_name(local_path.name + ".part")

    # Code based on https://stackoverflow.com/a/13137873
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print(f"Failed to download {url}: HTTP {r.status_code}")
            return
        # Let urllib3 undo any gzip/deflate transfer encoding while copying.
        r.raw.decode_content = True
        with open(partial_path, 'wb') as fp:
            shutil.copyfileobj(r.raw, fp)

    partial_path.replace(local_path)
    print(f"Saved {url} to {local_path}")
if __name__ == "__main__":
    # Imported up front so a missing dependency fails before any download.
    import pandas as pd

    # NOTE(review): `wikidata_data` is not defined in the visible code;
    # presumably it is the JSON result of the accompanying SPARQL query,
    # loaded elsewhere — confirm before running.
    records = []
    for row in wikidata_data["results"]["bindings"]:
        # Parse each row once and reuse it for both the download and the table.
        data = parse_row(row)
        download_image(data)
        records.append(data)

    df = pd.DataFrame(records)
    # Flag which rows actually have their image on disk after the download pass.
    df["file_exists"] = df["local_path"].apply(lambda x: (OUTPUT_DIR / x).exists())
    print(df["ethnic_group"].value_counts())
#defaultView:ImageGrid
# One result row per ?human; every other column is a single SAMPLE()d value,
# so humans with several values for a property still yield exactly one row.
SELECT
  ?human
  (SAMPLE(?humanLabel) AS ?humanLabel)
  (SAMPLE(?image) AS ?image)
  (SAMPLE(?sex_or_gender) AS ?sex_or_gender)
  (SAMPLE(?sex_or_genderLabel) AS ?sex_or_genderLabel)
  (SAMPLE(?ethnic_group) AS ?ethnic_group)
  (SAMPLE(?ethnic_groupLabel) AS ?ethnic_groupLabel)
  (SAMPLE(?date_of_birth) AS ?date_of_birth)
  (SAMPLE(?occupation) AS ?occupation)
  (SAMPLE(?occupationLabel) AS ?occupationLabel)
  (SAMPLE(?loc_aid) AS ?loc_aid)
WHERE {
  # Label service: supplies the ?...Label variables, falling back to English.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

  # Required statements: only humans having all of these are returned.
  ?human wdt:P31 wd:Q5;
         wdt:P18 ?image;
         wdt:P21 ?sex_or_gender;
         wdt:P172 ?ethnic_group;
         wdt:P569 ?date_of_birth;
         wdt:P244 ?loc_aid
  OPTIONAL { ?human wdt:P106 ?occupation. }

  # Keep only people born after 1950-01-01.
  FILTER(?date_of_birth > "1950-01-01"^^xsd:dateTime)
  # Restrict to three ethnic groups and two sex/gender values.
  # NOTE(review): the meaning of these Q-ids is not visible here — verify on Wikidata.
  FILTER(?ethnic_group IN(wd:Q49078, wd:Q49085, wd:Q58669))
  FILTER(?sex_or_gender IN(wd:Q6581097, wd:Q6581072))
  # Exclude one occupation.
  # NOTE(review): ?occupation comes from the OPTIONAL above; when it is unbound
  # this FILTER errors and the row is dropped, so an occupation is effectively
  # required — confirm that is intended.
  FILTER(?occupation NOT IN(wd:Q488111))
}
GROUP BY ?human
LIMIT 5000
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment