iamaziz/read_csv_files_in_tar_gz_from_s3_bucket.py

## read_csv_files_in_tar_gz_from_s3_bucket.py
# -- read csv files from tar.gz in S3 with S3FS and tarfile (https://s3fs.readthedocs.io/en/latest/)

bucket = 'mybucket'
key = 'mycompressed_csv_files.tar.gz'

import s3fs
import tarfile
import io
import pandas as pd

fs = s3fs.S3FileSystem()

f = fs.open(f'{bucket}/{key}', 'rb')
tar = tarfile.open(f, 'r:gz')
csv_files = [f.name for f in tar.getmembers() if f.name.endswith('.csv')]
csv_file = csv_files[0] # here we read first csv file only
csv_contents = tar.extractfile(csv_file).read()
df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
f.close()
	# -- read csv files from tar.gz in S3 with S3FS and tarfile (https://s3fs.readthedocs.io/en/latest/)

	bucket = 'mybucket'
	key = 'mycompressed_csv_files.tar.gz'

	import s3fs
	import tarfile
	import io
	import pandas as pd

	fs = s3fs.S3FileSystem()

	f = fs.open(f'{bucket}/{key}', 'rb')
	tar = tarfile.open(f, 'r:gz')
	csv_files = [f.name for f in tar.getmembers() if f.name.endswith('.csv')]
	csv_file = csv_files[0] # here we read first csv file only
	csv_contents = tar.extractfile(csv_file).read()
	df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
	f.close()