Update blob content types - use a search index to generate a list of az storage cli cmds
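The script below queries the search index for blobs still carrying the default 'application/octet-stream' content type, works out the correct MIME type from each file extension, and writes one `az storage blob update` command per blob to an output file intended to be fed to GNU parallel. It expects the search service API key in the SEARCH_API_KEY environment variable.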
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from urllib.parse import urlparse
from progress.bar import Bar
from sys import platform
import subprocess
import mimetypes
import sys
import os
import base64
# ensure we have our mime types db
mimetypes.init()
# list of file extensions to skip processing for...we don't care about these
skip_extensions_list = [".fz", ".fits"]
# set up our base64 decode shell cmd (GNU base64 uses -d, BSD/macOS uses -D)
if platform == "linux" or platform == "linux2":
    base_64_cmd = "echo {} | base64 -d"
elif platform == "darwin":
    base_64_cmd = "echo {} | base64 -D"
# not used since we fixed base64 decoding via the `fix_base64_encoded_string` function
def shell_base64_decode(encoded):
    decode_shell_cmd = base_64_cmd.format(encoded)
    decode_proc = subprocess.Popen(
        decode_shell_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, stderr = decode_proc.communicate()
    blob_file_path = stdout.decode("utf-8")
    return blob_file_path
# convert the azure 'special' base64 encoding back to correctly padded base64:
# the search index strips the '=' padding chars and appends a digit recording
# how many were removed
def fix_base64_encoded_string(base64_encoded):
    base64_prefix = base64_encoded[:-1]
    last_digit = base64_encoded[-1]
    padding_suffix = '=' * int(last_digit)
    return base64_prefix + padding_suffix
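# Worked example (hypothetical URL, not from the real index): the path
# "https://example/a.pn" base64-encodes to "aHR0cHM6Ly9leGFtcGxlL2EucG4=";
# Azure Search stores it as "aHR0cHM6Ly9leGFtcGxlL2EucG41" (padding stripped,
# trailing '1' = one '=' removed). This function restores the '=' so the
# standard base64 module can decode it again.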
# We'll connect to our Azure Cognitive Search service and send queries to the
# custom "azureblob-index" index built over the gz subjects blob storage.
index_name = "azureblob-index"
# get this API key from the azure portal
api_key = os.environ['SEARCH_API_KEY']
# set up an az CLI cmd prefix to update the storage blob content type
az_cmd_prefix = "az storage blob update --auth-mode login --account-name galaxyzoosubjects"
# set up the output file of az CLI commands to feed into GNU parallel
out_file_name = "www_gz_subjects_az_cli_fix_mime_type_cmds.txt"
out_file = open(out_file_name, "w")
# Create a SearchClient to send queries to our custom made gz subjects search index
endpoint = "https://gz-blob-searcher-take-2.search.windows.net/"
credential = AzureKeyCredential(api_key)
client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=credential)
# search index query setup
skip = 0 # used for paging combined with page_size
page_size = 50 # default value for $top
# search index query filter: only find the default content type on non-FITS files
search_filter = "metadata_storage_content_type eq 'application/octet-stream' and metadata_storage_file_extension ne '.fz' and metadata_storage_file_extension ne '.fits'"
total_index_count = client.search(search_text='*', include_total_count=True, filter=search_filter).get_count()
print("Found {} records in the index. Processing....".format(total_index_count))
# keep a count of the skipped files (.fits, etc)
skipped_file_count = 0
# setup our progress bar
bar = Bar('Processing the blob index', max=total_index_count)
# loop over all the docs in the index
while skip < total_index_count:
    # fetch the next page of docs that are filtered to the 'invalid' content type
    # NOTE: for paging to work we must provide the top & skip params; without them you get an error:
    # azure.core.exceptions.HttpResponseError: Operation returned an invalid status 'Forbidden'
    # per https://docs.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python
    # "The number of search results to retrieve. This can be used in conjunction with $skip to
    # implement client-side paging of search results. If results are truncated due to server-side
    # paging, the response will include a continuation token that can be used to issue another
    # Search request for the next page of results."
    results = client.search(search_text='*', top=page_size, skip=skip, filter=search_filter)
    for result in results:
        # update the progress bar
        bar.next()
        # python's base64 module can't decode these encoded path strings as-is:
        # Azure Search returns document keys in its own URL-safe base64 variant, explained here:
        # https://robertoprevato.github.io/Things-I-would-have-liked-to-know-about-Azure-Search/
        real_base_64_string = fix_base64_encoded_string(
            result["metadata_storage_path"])
        # use urlsafe_b64decode as the encoded paths use the URL-safe alphabet ('-' and '_')
        blob_file_path = base64.urlsafe_b64decode(real_base_64_string).decode("utf-8")
        file_extension = result["metadata_storage_file_extension"]
        # skip the fits files as they aren't for use in a browser
        if file_extension in skip_extensions_list:
            skipped_file_count += 1
            continue
        try:
            correct_mime_type = mimetypes.types_map[file_extension]
        except KeyError:
            error_msg = "Can't find the MIME type for the file path: {}\n with file extension: {}".format(
                blob_file_path, file_extension)
            # stop now and report the error
            sys.exit(error_msg)
        if correct_mime_type == result["metadata_storage_content_type"]:
            # skip this entry as the content type wouldn't change
            continue
        # we've got two options here:
        # 1. use the python SDK to update each blob iteratively as we loop over the search results (slow)
        # 2. use something like GNU parallel and our az storage cmd to update the content types
        # i'm going to pursue GNU parallel short term and see how it goes; a sketch of option 1 follows
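        # For reference, option 1 would look roughly like this (untested sketch;
        # assumes the azure-storage-blob v12 package and a suitable hypothetical
        # `blob_credential` for the storage account, neither used elsewhere here):
        #   from azure.storage.blob import BlobClient, ContentSettings
        #   blob = BlobClient.from_blob_url(blob_file_path, credential=blob_credential)
        #   blob.set_http_headers(content_settings=ContentSettings(content_type=correct_mime_type))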
        blob_url_parts = urlparse(blob_file_path)
        blob_url_path_parts = blob_url_parts.path.split("/")
        container_name = blob_url_path_parts[1]
        # get rid of the leading '' and the container name in the list
        del blob_url_path_parts[0:2]
        blob_container_path = '/'.join(blob_url_path_parts)
        # a bash cmd like this updates the gz blob at the -n path with the correct MIME type:
        # $ az storage blob update --auth-mode login --account-name galaxyzoosubjects -c '$web' -n subjects/decals/thumbnail/J211326.08+005811.6_thumbnail.jpeg --content-type 'image/jpeg'
        az_cmd_suffix = "-c '{}' -n {} --content-type '{}'".format(
            container_name, blob_container_path, correct_mime_type)
        blob_update_az_cmd = "{} {}".format(az_cmd_prefix, az_cmd_suffix)
        # write the az cli cmd to the output file
        print(blob_update_az_cmd, file=out_file)
    # we've processed this result page; set up the next page (of page_size docs)
    skip += page_size
# print out the final tally
print("Found {} records in the index. Skipped {} files. Expecting at most {} unique rows in the output file {}".format(
    total_index_count, skipped_file_count, (total_index_count - skipped_file_count), out_file_name))
# clean up our output file handle
out_file.close()
# finish the progress bar
bar.finish()
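# To apply the generated commands afterwards, one option (a sketch; assumes
# GNU parallel is installed and `az login` has already been run) is:
#   parallel --eta < www_gz_subjects_az_cli_fix_mime_type_cmds.txt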