Skip to content

Instantly share code, notes, and snippets.

@dodo5522
Last active February 16, 2017 15:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dodo5522/95bf80671adf94a309fceb7f2b029a68 to your computer and use it in GitHub Desktop.
Save dodo5522/95bf80671adf94a309fceb7f2b029a68 to your computer and use it in GitHub Desktop.
MNISTデータをダウンロードしてgzip伸長するpythonコード
import asyncio
import aiohttp
import gzip
from io import BytesIO
import numpy as np
import pandas as pd
import requests
class MnistWrapper(object):
    """Download the MNIST database and expose it as pandas objects.

    The gzipped IDX files are fetched over HTTP the first time
    ``get_train()`` / ``get_test()`` is called, so the first call takes
    some time. The parsed result is cached on the instance and returned
    directly by later calls.
    """

    # (images URL, labels URL) for the training set.
    TRAIN_DATA_URLS = (
        "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
        "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
    # (images URL, labels URL) for the test set.
    TEST_DATA_URLS = (
        "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
        "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")

    def __init__(self):
        # Caches of (images, labels); None means "not downloaded yet".
        self._train = None
        self._test = None

    async def _get_raw_data(self, url):
        """Coroutine: download the gzipped file at *url* and return its
        decompressed content as ``bytes``.

        Raises whatever aiohttp raises for connection problems or an HTTP
        error status, and ``OSError`` if the payload is not valid gzip.
        """
        # Context managers guarantee the connection is released even when
        # reading the body fails.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as res:
                # Fail on 4xx/5xx here instead of feeding an error page
                # to the gzip decoder.
                res.raise_for_status()
                gzip_data = await res.read()
        return gzip.decompress(gzip_data)

    async def _fetch_all(self, urls):
        """Coroutine: download every URL in *urls* concurrently and return
        the decompressed payloads as a list in the same order."""
        return await asyncio.gather(
            *(self._get_raw_data(url) for url in urls))

    def _download(self, urls):
        """Run the downloads for *urls* to completion and return the list
        of decompressed payloads.

        A private event loop is created and closed for each call. Closing
        the loop returned by ``asyncio.get_event_loop()`` would make every
        later download in the same process fail with
        "Event loop is closed".
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(self._fetch_all(urls))
        finally:
            loop.close()

    def _get_long_number(self, bytes_):
        """Combine the first four items of *bytes_* into one unsigned
        integer, most significant item first.

        *bytes_* may be a ``bytes`` object or any indexable sequence whose
        items are ints or single ``bytes`` values. Raises ``IndexError``
        when fewer than four items are available.
        """
        def to_int(item):
            return (int.from_bytes(item, "little")
                    if isinstance(item, bytes) else item)

        value = 0
        for index in range(4):
            value = (value << 8) | to_int(bytes_[index])
        return value

    def _raw_to_df(self, image, label):
        """Convert raw IDX payloads into ``(images, labels)``.

        *image* and *label* are the decompressed, bytes-like contents of an
        image file and its label file. The image header is read as four
        4-byte fields (the first is skipped; then image count, rows,
        columns) followed by one byte per pixel. The label payload starts
        after an 8-byte header, one byte per label.

        Returns a ``pd.DataFrame`` with one row per image and
        ``rows * columns`` pixel columns, and a ``pd.Series`` of labels.
        Raises ``ValueError`` when the pixel data does not match the
        dimensions given in the header.
        """
        # Slice only the 4 header bytes needed for each field rather than
        # copying the rest of the file for every lookup.
        num_of_images = self._get_long_number(image[4:8])
        num_of_rows = self._get_long_number(image[8:12])
        num_of_columns = self._get_long_number(image[12:16])

        # frombuffer reads the payload without building a Python list; the
        # cast yields a writable array of the default integer dtype.
        pixels = np.frombuffer(image, dtype=np.uint8, offset=16).astype(int)
        images = pd.DataFrame(
            pixels.reshape(num_of_images, num_of_rows * num_of_columns))
        labels = pd.Series(
            np.frombuffer(label, dtype=np.uint8, offset=8).astype(int))
        return (images, labels)

    def get_train(self):
        """Return the MNIST training data as ``(X, y)``.

        ``X`` is a pandas DataFrame of pixel values (one row per image) and
        ``y`` is a pandas Series of labels. Downloaded on first use, then
        served from the instance cache.
        """
        if self._train is None:
            self._train = self._raw_to_df(
                *self._download(self.TRAIN_DATA_URLS))
        return self._train

    def get_test(self):
        """Return the MNIST test data as ``(X, y)``.

        ``X`` is a pandas DataFrame of pixel values (one row per image) and
        ``y`` is a pandas Series of labels. Downloaded on first use, then
        served from the instance cache.
        """
        if self._test is None:
            self._test = self._raw_to_df(
                *self._download(self.TEST_DATA_URLS))
        return self._test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment