@turtlemonvh
Created January 22, 2019 22:28
Counts for nested data in AWS S3
import boto3
from collections import Counter

"""
Expand a directory-like listing of S3 keys that use "/" as a separator.
Similar to `tree -L 2 prefix/` in *nix.
"""
s3 = boto3.client('s3')
bucket_name = "XXX" # s3 bucket name
starting_prefix = "YYY" # prefix to look under in the bucket
# Get the prefixes on the first level
prefixes = (
    key['Prefix']
    for key in s3.list_objects_v2(
        Bucket=bucket_name, Delimiter="/", Prefix=starting_prefix
    )['CommonPrefixes']
)
# Expand the list of prefixes with the next level into a flattened list
# Note that you can use something similar to this to continue to expand your prefixes more levels
expanded_prefixes = (
    key['Prefix']
    for prefix in prefixes
    for key in s3.list_objects_v2(
        Bucket=bucket_name, Delimiter="/", Prefix=prefix
    )['CommonPrefixes']
)
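The comment above notes that the same pattern can be repeated to expand prefixes additional levels. A minimal sketch of that generalization, assuming a boto3 S3 client is passed in (the `expand_prefixes` helper and its signature are my own, not part of the gist):

```python
# Hypothetical helper (not in the original gist): repeat the CommonPrefixes
# listing to walk `levels` further levels below the given prefixes.
def expand_prefixes(s3, bucket_name, prefixes, levels=1):
    for _ in range(levels):
        prefixes = [
            key['Prefix']
            for prefix in prefixes
            for key in s3.list_objects_v2(
                Bucket=bucket_name, Delimiter="/", Prefix=prefix
            ).get('CommonPrefixes', [])  # key is absent when a prefix has no sub-prefixes
        ]
    return prefixes
```

With `levels=1` this produces the same values as `expanded_prefixes` above, but materialized as a list so each pass can be re-iterated.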
# Count the number of times each 2nd-level value shows up.
# Helpful if your data is laid out like "{PREFIX}/{UUID}/{DATE}" and you want to see
# how many distinct UUIDs exist for each DATE.
Counter(p.split("/")[-2] for p in expanded_prefixes)
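One caveat: `list_objects_v2` returns at most 1000 results per call, so `CommonPrefixes` can be truncated for buckets with many prefixes at one level. boto3's paginator follows the continuation tokens automatically; a sketch of a paginated variant of the first-level listing (the `list_prefixes_paginated` name is mine):

```python
# Sketch using boto3's standard paginator API; collects all prefixes at one
# level even when there are more than 1000 of them.
def list_prefixes_paginated(s3, bucket_name, prefix):
    paginator = s3.get_paginator('list_objects_v2')
    return [
        key['Prefix']
        for page in paginator.paginate(Bucket=bucket_name, Delimiter="/", Prefix=prefix)
        for key in page.get('CommonPrefixes', [])
    ]
```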