Last active
February 16, 2017 15:55
-
-
Save dodo5522/95bf80671adf94a309fceb7f2b029a68 to your computer and use it in GitHub Desktop.
MNISTデータをダウンロードしてgzip伸長するpythonコード
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import aiohttp | |
import gzip | |
from io import BytesIO | |
import numpy as np | |
import pandas as pd | |
import requests | |
class MnistWrapper(object):
    """Provide the MNIST training and test sets as pandas objects.

    The gzipped IDX files are downloaded the first time get_train() or
    get_test() is called, so that first call takes some time. The parsed
    result is cached on the instance and returned directly afterwards.
    """

    TRAIN_DATA_URLS = (
        "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
        "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
    TEST_DATA_URLS = (
        "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
        "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")

    def __init__(self):
        # Each cache holds (images, labels); the empty placeholders mean
        # "not downloaded yet".
        self._train = [[], []]
        self._test = [[], []]

    async def _get_raw_data(self, url):
        """Coroutine to download a gzipped file and return its expanded bytes.

        Raises whatever aiohttp raises for connection problems or for an
        HTTP error status, and OSError if the payload is not valid gzip.
        """
        # The context managers release the connection even when reading fails.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as res:
                # Fail on 4xx/5xx here instead of handing an error page to gzip.
                res.raise_for_status()
                gzip_data = await res.read()
        return gzip.decompress(gzip_data)

    async def _get_all_raw_data(self, urls):
        """Coroutine to download every URL concurrently, keeping the order."""
        return await asyncio.gather(*[self._get_raw_data(url) for url in urls])

    def _get_long_number(self, bytes_):
        """Return the big-endian unsigned 32 bit integer at the start of bytes_.

        bytes_ may be a bytes-like object or a sequence whose items are ints
        or bytes objects. Only the first four items are used.

        Raises IndexError if fewer than four items are given.
        """
        head = bytes_[:4]
        if len(head) < 4:
            raise IndexError("at least 4 bytes are required")
        number = 0
        for byte_ in head:
            if isinstance(byte_, bytes):
                byte_ = int.from_bytes(byte_, "little")
            # Most significant byte comes first.
            number = (number << 8) | byte_
        return number

    def _raw_to_df(self, image, label):
        """Convert raw IDX image and label files into pandas objects.

        image holds the number of images, rows and columns as 32 bit
        big-endian integers at offsets 4, 8 and 12, followed by one byte per
        pixel from offset 16. label holds one byte per label from offset 8.

        Returns a tuple (images, labels): images is a DataFrame with one
        flattened image per row, labels is a Series.

        Raises IndexError if image is shorter than its 16 byte header and
        ValueError if the pixel data does not match the header dimensions.
        """
        # No-op for bytes input; also lets a sequence of ints be parsed.
        image = bytes(image)
        label = bytes(label)

        # Read the header from 4 byte slices instead of copying the payload.
        num_of_images = self._get_long_number(image[4:8])
        num_of_rows = self._get_long_number(image[8:12])
        num_of_columns = self._get_long_number(image[12:16])

        # frombuffer reads the bytes without building a Python list; astype
        # then yields a writable array with the platform default integer
        # dtype, as building the array from a list of ints would.
        pixels = np.frombuffer(memoryview(image)[16:], dtype=np.uint8)
        images = pd.DataFrame(
            pixels.astype(int).reshape(
                num_of_images, num_of_rows * num_of_columns))
        labels = pd.Series(
            np.frombuffer(memoryview(label)[8:], dtype=np.uint8)
            .astype(np.int64))
        return (images, labels)

    def _download(self, urls):
        """Download the image and label file in urls and parse them."""
        # A private loop is created and closed per call. Closing the loop
        # returned by asyncio.get_event_loop() would leave every later call
        # with an already closed loop.
        loop = asyncio.new_event_loop()
        try:
            raw = loop.run_until_complete(self._get_all_raw_data(urls))
        finally:
            loop.close()
        return self._raw_to_df(*raw)

    def get_train(self):
        """Get the MNIST training data as a tuple (X, y).

        X is a pandas DataFrame with one flattened image per row and y is a
        pandas Series of labels. The data is downloaded on the first call
        and served from the instance cache afterwards.
        """
        if len(self._train[0]) and len(self._train[1]):
            return self._train
        self._train = self._download(self.TRAIN_DATA_URLS)
        return self._train

    def get_test(self):
        """Get the MNIST test data as a tuple (X, y).

        X is a pandas DataFrame with one flattened image per row and y is a
        pandas Series of labels. The data is downloaded on the first call
        and served from the instance cache afterwards.
        """
        if len(self._test[0]) and len(self._test[1]):
            return self._test
        self._test = self._download(self.TEST_DATA_URLS)
        return self._test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment