Update blob content types - use a search index to generate a list of az storage cli cmds
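The script below queries the search index for blobs still carrying the default 'application/octet-stream' content type, works out the correct MIME type from each file extension, and writes one `az storage blob update` command per blob to an output file intended to be fed to GNU parallel. It expects the search service API key in the SEARCH_API_KEY environment variable.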
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from urllib.parse import urlparse
from progress.bar import Bar
from sys import platform
import subprocess
import mimetypes
import sys
import os
import base64
# ensure we have our mime types db
mimetypes.init()
# list of file extensions to skip processing for...we don't care about these
skip_extensions_list = [".fz", ".fits"]
# set up our base64 decode shell cmd (GNU base64 uses -d, BSD/macOS uses -D)
if platform == "linux" or platform == "linux2":
    base_64_cmd = "echo {} | base64 -d"
elif platform == "darwin":
    base_64_cmd = "echo {} | base64 -D"
# not used since we fixed base64 decoding via the `fix_base64_encoded_string` function
def shell_base64_decode(encoded):
    decode_shell_cmd = base_64_cmd.format(encoded)
    decode_proc = subprocess.Popen(
        decode_shell_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, stderr = decode_proc.communicate()
    blob_file_path = stdout.decode("utf-8")
    return blob_file_path
# convert the azure 'special' base64 encoding back to correctly padded base64:
# the search index strips the '=' padding chars and appends a digit recording
# how many were removed
def fix_base64_encoded_string(base64_encoded):
    base64_prefix = base64_encoded[:-1]
    last_digit = base64_encoded[-1]
    padding_suffix = '=' * int(last_digit)
    return base64_prefix + padding_suffix
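# Worked example (hypothetical URL, not from the real index): the path
# "https://example/a.pn" base64-encodes to "aHR0cHM6Ly9leGFtcGxlL2EucG4=";
# Azure Search stores it as "aHR0cHM6Ly9leGFtcGxlL2EucG41" (padding stripped,
# trailing '1' = one '=' removed). This function restores the '=' so the
# standard base64 module can decode it again.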
# We'll connect to our Azure Cognitive Search service and send queries to the
# custom "azureblob-index" index built over the gz subjects blob storage.
index_name = "azureblob-index"
# get this API key from the azure portal
api_key = os.environ['SEARCH_API_KEY']
# set up an az CLI cmd prefix to update the storage blob content type
az_cmd_prefix = "az storage blob update --auth-mode login --account-name galaxyzoosubjects"
# set up the output file of az CLI commands to feed into GNU parallel
out_file_name = "www_gz_subjects_az_cli_fix_mime_type_cmds.txt"
out_file = open(out_file_name, "w")
# Create a SearchClient to send queries to our custom made gz subjects search index
endpoint = "https://gz-blob-searcher-take-2.search.windows.net/"
credential = AzureKeyCredential(api_key)
client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=credential)
# search index query setup
skip = 0 # used for paging combined with page_size
page_size = 50 # default value for $top
# search index query filter: only find the default content type on non-FITS files
search_filter = "metadata_storage_content_type eq 'application/octet-stream' and metadata_storage_file_extension ne '.fz' and metadata_storage_file_extension ne '.fits'"
total_index_count = client.search(search_text='*', include_total_count=True, filter=search_filter).get_count()
print("Found {} records in the index. Processing....".format(total_index_count))
# keep a count of the skipped files (.fits, etc)
skipped_file_count = 0
# setup our progress bar
bar = Bar('Processing the blob index', max=total_index_count)
# loop over all the docs in the index
while skip < total_index_count:
    # fetch the next page of docs that are filtered to the 'invalid' content type
    # NOTE: for paging to work we must provide the top & skip params; without them you get an error:
    # azure.core.exceptions.HttpResponseError: Operation returned an invalid status 'Forbidden'
    # per https://docs.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python
    # "The number of search results to retrieve. This can be used in conjunction with $skip to
    # implement client-side paging of search results. If results are truncated due to server-side
    # paging, the response will include a continuation token that can be used to issue another
    # Search request for the next page of results."
    results = client.search(search_text='*', top=page_size, skip=skip, filter=search_filter)
    for result in results:
        # update the progress bar
        bar.next()
        # python's base64 module can't decode these encoded path strings as-is:
        # Azure Search returns document keys in its own URL-safe base64 variant, explained here:
        # https://robertoprevato.github.io/Things-I-would-have-liked-to-know-about-Azure-Search/
        real_base_64_string = fix_base64_encoded_string(
            result["metadata_storage_path"])
        # use urlsafe_b64decode as the encoded paths use the URL-safe alphabet ('-' and '_')
        blob_file_path = base64.urlsafe_b64decode(real_base_64_string).decode("utf-8")
        file_extension = result["metadata_storage_file_extension"]
        # skip the fits files as they aren't for use in a browser
        if file_extension in skip_extensions_list:
            skipped_file_count += 1
            continue
        try:
            correct_mime_type = mimetypes.types_map[file_extension]
        except KeyError:
            error_msg = "Can't find the MIME type for the file path: {}\n with file extension: {}".format(
                blob_file_path, file_extension)
            # stop now and report the error
            sys.exit(error_msg)
        if correct_mime_type == result["metadata_storage_content_type"]:
            # skip this entry as the content type wouldn't change
            continue
        # we've got two options here:
        # 1. use the python SDK to update each blob iteratively as we loop over the search results (slow)
        # 2. use something like GNU parallel and our az storage cmd to update the content types
        # i'm going to pursue GNU parallel short term and see how it goes; a sketch of option 1 follows
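        # For reference, option 1 would look roughly like this (untested sketch;
        # assumes the azure-storage-blob v12 package and a suitable hypothetical
        # `blob_credential` for the storage account, neither used elsewhere here):
        #   from azure.storage.blob import BlobClient, ContentSettings
        #   blob = BlobClient.from_blob_url(blob_file_path, credential=blob_credential)
        #   blob.set_http_headers(content_settings=ContentSettings(content_type=correct_mime_type))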
        blob_url_parts = urlparse(blob_file_path)
        blob_url_path_parts = blob_url_parts.path.split("/")
        container_name = blob_url_path_parts[1]
        # get rid of the leading '' and the container name in the list
        del blob_url_path_parts[0:2]
        blob_container_path = '/'.join(blob_url_path_parts)
        # a bash cmd like this updates the gz blob at the -n path with the correct MIME type:
        # $ az storage blob update --auth-mode login --account-name galaxyzoosubjects -c '$web' -n subjects/decals/thumbnail/J211326.08+005811.6_thumbnail.jpeg --content-type 'image/jpeg'
        az_cmd_suffix = "-c '{}' -n {} --content-type '{}'".format(
            container_name, blob_container_path, correct_mime_type)
        blob_update_az_cmd = "{} {}".format(az_cmd_prefix, az_cmd_suffix)
        # write the az cli cmd to the output file
        print(blob_update_az_cmd, file=out_file)
    # we've processed this result page; set up the next page (of page_size docs)
    skip += page_size
# print out the final tally
print("Found {} records in the index. Skipped {} files. Expecting at most {} unique rows in the output file {}".format(
    total_index_count, skipped_file_count, (total_index_count - skipped_file_count), out_file_name))
# clean up our output file handle
out_file.close()
# finish the progress bar
bar.finish()
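# To apply the generated commands afterwards, one option (a sketch; assumes
# GNU parallel is installed and `az login` has already been run) is:
#   parallel --eta < www_gz_subjects_az_cli_fix_mime_type_cmds.txt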