Last active
July 17, 2017 13:13
-
-
Save yassineAlouini/2225de648c8337815418e287e306dfe5 to your computer and use it in GitHub Desktop.
Download a tar.gz archive from s3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
import tarfile

import pandas as pd
import s3fs
# Shared S3 filesystem handle used by the download helpers in this file.
s3 = s3fs.S3FileSystem()

# Name of the S3 bucket holding the archives (blank placeholder; set before use).
S3_BUCKET = ''

# Key template for objects inside the bucket. The helpers call
# ``BASE_KEY.format(<prefix>, <file_name>)`` with prefix 'data' or 'models',
# so it presumably needs two positional ``{}`` fields -- blank placeholder here.
BASE_KEY = ''
def get_data_from_s3(file_name):
    """Download a gzipped tarball of CSV files from S3 and load it as one DataFrame.

    The archive is fetched from ``s3://S3_BUCKET/BASE_KEY.format('data', file_name)``
    and saved to ``/tmp/<file_name>`` (the local copy is left in place).
    Every archive member whose name contains ``'.csv'`` is read with
    ``pandas.read_csv`` and tagged with a ``floor`` column.

    Args:
        file_name: Name of the archive; used both in the S3 key and as the
            local file name under ``/tmp``.

    Returns:
        A single DataFrame: the concatenation of all CSV members, each with
        an added ``floor`` column holding the first (optionally negative)
        integer found in the member name, kept as a string.

    Raises:
        IndexError: If a CSV member name contains no integer.
        ValueError: If the archive contains no CSV member (raised by
            ``pandas.concat`` on an empty list).
    """
    key = BASE_KEY.format('data', file_name)
    local_path = '/tmp/{}'.format(file_name)
    s3.get('s3://{}/{}'.format(S3_BUCKET, key), local_path)

    dfs = []
    # Read the tarball into Pandas
    with tarfile.open(local_path, "r:gz") as tar:
        for member in tar.getmembers():
            # Filter on the name first so non-CSV members are never extracted.
            if '.csv' not in member.name:
                continue
            f = tar.extractfile(member)
            if f is None:
                # extractfile() returns None for members that are not regular
                # files or links (e.g. a directory), so there is nothing to read.
                continue
            # First signed integer in the member name, kept as a string.
            floor = re.findall(r'(-?\d+)', member.name)[0]
            with f:
                tmp_df = pd.read_csv(f).assign(floor=floor)
            dfs.append(tmp_df)
    return pd.concat(dfs)
def get_features_from_s3(file_name):
    """Download a gzipped tarball of JSON files from S3 and return their contents.

    The archive is fetched from ``s3://S3_BUCKET/BASE_KEY.format('models', file_name)``
    and saved to ``/tmp/<file_name>`` (the local copy is left in place).
    Every archive member whose name contains ``'.json'`` is parsed with
    ``json.load``.

    Args:
        file_name: Name of the archive; used both in the S3 key and as the
            local file name under ``/tmp``.

    Returns:
        A list with one parsed JSON document per matching member, in archive
        order. The list is empty when the archive has no JSON member.
    """
    key = BASE_KEY.format('models', file_name)
    local_path = '/tmp/{}'.format(file_name)
    s3.get('s3://{}/{}'.format(S3_BUCKET, key), local_path)

    features = []
    with tarfile.open(local_path, "r:gz") as tar:
        for member in tar.getmembers():
            # Filter on the name first so non-JSON members are never extracted.
            if '.json' not in member.name:
                continue
            f = tar.extractfile(member)
            if f is None:
                # extractfile() returns None for members that are not regular
                # files or links (e.g. a directory), so there is nothing to parse.
                continue
            with f:
                features.append(json.load(f))
    return features
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment