Skip to content

Instantly share code, notes, and snippets.

@MikeTrizna
Created August 17, 2022 15:22
Show Gist options
  • Save MikeTrizna/66eeabe2005213843bfe125ba070704d to your computer and use it in GitHub Desktop.
Save MikeTrizna/66eeabe2005213843bfe125ba070704d to your computer and use it in GitHub Desktop.
Downloading postcard images
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "40fcb4a7-1885-4a59-ba57-79acd0ac1540",
"metadata": {},
"outputs": [],
"source": [
"import concurrent.futures\n",
"import io\n",
"import os\n",
"import time\n",
"from concurrent.futures import ProcessPoolExecutor\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from PIL import Image\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d0c65af2-7ec3-4f71-8f33-629a136c9c04",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>object_id</th>\n",
" <th>title</th>\n",
" <th>guid</th>\n",
" <th>ids_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>edanmdm-nmah_1888444</td>\n",
" <td>Two Men and a Woman Congregating Downtown</td>\n",
" <td>http://n2t.net/ark:/65665/ng49ca746b3-d555-704...</td>\n",
" <td>NMAH-AHB2018q010241</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>edanmdm-nmah_1888444</td>\n",
" <td>Two Men and a Woman Congregating Downtown</td>\n",
" <td>http://n2t.net/ark:/65665/ng49ca746b3-d555-704...</td>\n",
" <td>NMAH-AHB2018q010242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>edanmdm-nmah_1888447</td>\n",
" <td>Portrait of Couple Standing Outside Cloth Tent</td>\n",
" <td>http://n2t.net/ark:/65665/ng49ca746b3-d558-704...</td>\n",
" <td>NMAH-AHB2018q010352</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>edanmdm-nmah_1888447</td>\n",
" <td>Portrait of Couple Standing Outside Cloth Tent</td>\n",
" <td>http://n2t.net/ark:/65665/ng49ca746b3-d558-704...</td>\n",
" <td>NMAH-AHB2018q010395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>edanmdm-nmah_1888447</td>\n",
" <td>Portrait of Couple Standing Outside Cloth Tent</td>\n",
" <td>http://n2t.net/ark:/65665/ng49ca746b3-d558-704...</td>\n",
" <td>NMAH-AHB2018q010396</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" object_id title \\\n",
"0 edanmdm-nmah_1888444 Two Men and a Woman Congregating Downtown \n",
"1 edanmdm-nmah_1888444 Two Men and a Woman Congregating Downtown \n",
"2 edanmdm-nmah_1888447 Portrait of Couple Standing Outside Cloth Tent \n",
"3 edanmdm-nmah_1888447 Portrait of Couple Standing Outside Cloth Tent \n",
"4 edanmdm-nmah_1888447 Portrait of Couple Standing Outside Cloth Tent \n",
"\n",
" guid ids_id \n",
"0 http://n2t.net/ark:/65665/ng49ca746b3-d555-704... NMAH-AHB2018q010241 \n",
"1 http://n2t.net/ark:/65665/ng49ca746b3-d555-704... NMAH-AHB2018q010242 \n",
"2 http://n2t.net/ark:/65665/ng49ca746b3-d558-704... NMAH-AHB2018q010352 \n",
"3 http://n2t.net/ark:/65665/ng49ca746b3-d558-704... NMAH-AHB2018q010395 \n",
"4 http://n2t.net/ark:/65665/ng49ca746b3-d558-704... NMAH-AHB2018q010396 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the tab-separated postcard metadata, then preview the first rows.\n",
"image_df = pd.read_csv('postcard_data.tsv', delimiter='\\t')\n",
"image_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "166ea557-b87a-4d8f-bddb-c4baa125d19b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['NMAH-AHB2018q010241',\n",
" 'NMAH-AHB2018q010242',\n",
" 'NMAH-AHB2018q010352',\n",
" 'NMAH-AHB2018q010395',\n",
" 'NMAH-AHB2018q010396']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collect the 'ids_id' column as a plain Python list; show the first five.\n",
"ids_list = image_df.loc[:, 'ids_id'].to_list()\n",
"ids_list[:5]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d429e89f-16be-491d-a961-a7b972a1d606",
"metadata": {},
"outputs": [],
"source": [
"def requests_PIL_download(ids_id):\n",
"    \"\"\"Download one image from ids.si.edu and save it as images/<ids_id>.jpg.\n",
"\n",
"    The request URL ends in '/500' (presumably the requested size; the\n",
"    service's documentation should confirm this).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    ids_id : str\n",
"        Identifier inserted into both the request URL and the output filename.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Keys 'width', 'height' and 'ids_id'. Width and height are np.nan when\n",
"        the response was not a JPEG, in which case nothing is written to disk.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.RequestException\n",
"        Propagated unchanged from requests.get (the call uses a 60 s timeout).\n",
"    \"\"\"\n",
"    width, height = np.nan, np.nan\n",
"    image_url = f'https://ids.si.edu/ids/deliveryService/id/{ids_id}/500'\n",
"    filename = 'images/{}.jpg'.format(ids_id)\n",
"    r = requests.get(image_url, timeout=60)\n",
"    # The header can be missing or carry parameters ('image/jpeg; ...'),\n",
"    # so compare only the normalised media type instead of the raw value.\n",
"    content_type = r.headers.get('Content-Type', '').split(';')[0].strip().lower()\n",
"    if content_type == 'image/jpeg':\n",
"        # Create the output folder on demand so a fresh run does not fail on save.\n",
"        os.makedirs(os.path.dirname(filename), exist_ok=True)\n",
"        with Image.open(io.BytesIO(r.content)) as im:\n",
"            width, height = im.size\n",
"            im.save(filename)\n",
"    return {'width': width, 'height': height, 'ids_id': ids_id}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "14282845-c0cb-452d-a486-f5f38bbf7564",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'width': 339, 'height': 499, 'ids_id': 'NMAH-AHB2018q010241'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Try the downloader on a single identifier before running the full batch.\n",
"requests_PIL_download(ids_id='NMAH-AHB2018q010241')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d47a5c93-325c-470d-9094-c68a57011b36",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████| 2863/2863 [01:24<00:00, 33.73it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded 2863 images in 84.97528930000044 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Time the whole batch.\n",
"start_time = time.perf_counter()\n",
"\n",
"# Each call waits on an HTTP request, so a small thread pool overlaps the waits.\n",
"# executor.map returns results in ids_list order; tqdm reports progress as\n",
"# the results are consumed.\n",
"with ThreadPoolExecutor(max_workers=6) as executor:\n",
"    dimension_list = list(tqdm(executor.map(requests_PIL_download, ids_list),\n",
"                               total=len(ids_list)))\n",
"\n",
"end_time = time.perf_counter()\n",
"elapsed_time = end_time - start_time\n",
"\n",
"# A NaN width marks a response that was not a JPEG and was therefore not saved,\n",
"# so report saved images separately from attempted ones.\n",
"n_saved = sum(pd.notna(d['width']) for d in dimension_list)\n",
"print(f\"Downloaded {n_saved} of {len(dimension_list)} images in {elapsed_time:.2f} s\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a48c55a0-9d93-4014-bb33-3464649b8085",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment