Last active
February 6, 2024 18:45
-
-
Save ChrisRomp/f038cc233d99eaf578065723bade2a26 to your computer and use it in GitHub Desktop.
APIM Load Balancing Policy - Round Robin with Azure OpenAI (AOAI)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- This shows the policy as implemented with references to {{named values}} and fragments -->
<!-- Named values: https://learn.microsoft.com/en-us/azure/api-management/api-management-howto-properties -->
<!-- Policy fragments: https://learn.microsoft.com/en-us/azure/api-management/policy-fragments -->
<!--
    Purpose: round-robin load balancing across three Azure OpenAI (AOAI) backends.
    A counter kept in the APIM cache under the key "backend-counter" records which
    backend to use next; the "aoai-select-backend-cached" fragment reads it, sets
    the backend URL and stores the next counter value.
-->
<policies>
    <inbound>
        <base />
        <!-- This requires enabling the managed identity on APIM, and granting it access to AOAI -->
        <!-- The acquired token is written to the "msi-access-token" context variable -->
        <authentication-managed-identity resource="https://cognitiveservices.azure.com"
                                         output-token-variable-name="msi-access-token"
                                         ignore-error="false" />
        <!-- Replace any caller-supplied Authorization header with the managed identity bearer token -->
        <set-header name="Authorization" exists-action="override">
            <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
        </set-header>
        <!-- Get a set of backend URLs from named values -->
        <set-variable name="backend0" value="{{aoai-backend-0}}" />
        <set-variable name="backend1" value="{{aoai-backend-1}}" />
        <set-variable name="backend2" value="{{aoai-backend-2}}" />
        <!-- Check the cache for a counter; on a miss the variable is left unset -->
        <cache-lookup-value key="backend-counter" variable-name="backend-counter" />
        <choose>
            <!-- Cache miss, so initialize the value and cache it (duration is in seconds) -->
            <when condition="@(!context.Variables.ContainsKey("backend-counter"))">
                <set-variable name="backend-counter" value="0" />
                <cache-store-value key="backend-counter" value="0" duration="100" />
            </when>
        </choose>
        <!-- Policy fragment to choose a backend based on the cached index, then update cache. See effective policy. -->
        <include-fragment fragment-id="aoai-select-backend-cached" />
    </inbound>
    <backend>
        <!-- This retry policy will fire on any response code >= 400 (including 429), calling the next AOAI instance -->
        <!-- Up to 3 retries, 5 seconds apart, except that the first retry is issued immediately (first-fast-retry) -->
        <!--
            NOTE(review): retry executes its child policies once before it evaluates the
            condition, so the fragment below also runs on the first attempt and replaces
            the backend that was selected in inbound. Confirm that this is intended.
        -->
        <retry condition="@(context.Response.StatusCode >= 400)" count="3" interval="5" first-fast-retry="true">
            <!-- Re-read the counter so that updates cached since the inbound lookup are picked up -->
            <cache-lookup-value key="backend-counter" variable-name="backend-counter" />
            <!-- Same policy fragment as before -->
            <include-fragment fragment-id="aoai-select-backend-cached" />
            <!-- The request body is buffered so that it can be sent again on a retry -->
            <forward-request buffer-request-body="true" />
        </retry>
    </backend>
    <outbound>
        <base />
    </outbound>
    <on-error>
        <base />
    </on-error>
</policies>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- This shows the effective policy, including inheritance and the outputs of named values and policy fragments -->
<!--
    Reading guide: the counter stored under the cache key "backend-counter" picks the
    backend. The expanded fragment maps 0 to backend0, 1 to backend1 and any other
    value to backend2, and each branch stores the next value in the cycle 0, 1, 2, 0.
-->
<policies>
    <inbound>
        <!-- Acquire a managed identity token for AOAI and send it as the bearer token -->
        <authentication-managed-identity resource="https://cognitiveservices.azure.com"
                                         output-token-variable-name="msi-access-token"
                                         ignore-error="false" />
        <set-header name="Authorization" exists-action="override">
            <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
        </set-header>
        <!-- Backend base URLs, shown here with the named values already resolved -->
        <set-variable name="backend0" value="https://cr1-openai-ncus1.openai.azure.com/openai/" />
        <set-variable name="backend1" value="https://cr1-openai-eastus-02.openai.azure.com/openai/" />
        <set-variable name="backend2" value="https://cr1-openai-canadaeast1.openai.azure.com/openai/" />
        <!-- Load the cached counter; when it is absent, start at 0 and cache that value (duration is in seconds) -->
        <cache-lookup-value key="backend-counter" variable-name="backend-counter" />
        <choose>
            <when condition="@(!context.Variables.ContainsKey("backend-counter"))">
                <set-variable name="backend-counter" value="0" />
                <cache-store-value key="backend-counter" value="0" duration="100" />
            </when>
        </choose>
        <!--include-fragment: Begin aoai-select-backend-cached policy fragment scope-->
        <choose>
            <!-- Counter 0: use backend0, next counter value is 1 -->
            <when condition="@(Convert.ToInt32(context.Variables["backend-counter"]) == 0)">
                <set-backend-service base-url="@((string)context.Variables["backend0"])" />
                <set-variable name="backend-counter" value="1" />
                <cache-store-value key="backend-counter" value="1" duration="100" />
            </when>
            <!-- Counter 1: use backend1, next counter value is 2 -->
            <when condition="@(Convert.ToInt32(context.Variables["backend-counter"]) == 1)">
                <set-backend-service base-url="@((string)context.Variables["backend1"])" />
                <set-variable name="backend-counter" value="2" />
                <cache-store-value key="backend-counter" value="2" duration="100" />
            </when>
            <!-- Any other counter value: use backend2 and wrap the counter back to 0 -->
            <otherwise>
                <set-backend-service base-url="@((string)context.Variables["backend2"])" />
                <set-variable name="backend-counter" value="0" />
                <cache-store-value key="backend-counter" value="0" duration="100" />
            </otherwise>
        </choose>
        <!--include-fragment: End aoai-select-backend-cached policy fragment scope-->
    </inbound>
    <backend>
        <!-- Retry while the response status is 400 or above: up to 3 retries, 5 seconds apart, first retry immediate -->
        <!--
            NOTE(review): retry executes its child policies once before it evaluates the
            condition, so the selection below also runs on the first attempt and replaces
            the backend that was selected in inbound. Confirm that this is intended.
        -->
        <retry condition="@(context.Response.StatusCode >= 400)" count="3" interval="5" first-fast-retry="true">
            <!-- Re-read the counter so that updates cached since the inbound lookup are picked up -->
            <cache-lookup-value key="backend-counter" variable-name="backend-counter" />
            <!--include-fragment: Begin aoai-select-backend-cached policy fragment scope-->
            <choose>
                <!-- Counter 0: use backend0, next counter value is 1 -->
                <when condition="@(Convert.ToInt32(context.Variables["backend-counter"]) == 0)">
                    <set-backend-service base-url="@((string)context.Variables["backend0"])" />
                    <set-variable name="backend-counter" value="1" />
                    <cache-store-value key="backend-counter" value="1" duration="100" />
                </when>
                <!-- Counter 1: use backend1, next counter value is 2 -->
                <when condition="@(Convert.ToInt32(context.Variables["backend-counter"]) == 1)">
                    <set-backend-service base-url="@((string)context.Variables["backend1"])" />
                    <set-variable name="backend-counter" value="2" />
                    <cache-store-value key="backend-counter" value="2" duration="100" />
                </when>
                <!-- Any other counter value: use backend2 and wrap the counter back to 0 -->
                <otherwise>
                    <set-backend-service base-url="@((string)context.Variables["backend2"])" />
                    <set-variable name="backend-counter" value="0" />
                    <cache-store-value key="backend-counter" value="0" duration="100" />
                </otherwise>
            </choose>
            <!--include-fragment: End aoai-select-backend-cached policy fragment scope-->
            <!-- The request body is buffered so that it can be sent again on a retry -->
            <forward-request buffer-request-body="true" />
        </retry>
    </backend>
    <outbound />
    <on-error />
</policies>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Credit to: https://github.com/ian-t-adams/azure-openai-api-m-retry
Another load-balancing option, using an active/passive(/passive) priority scheme instead of round robin: https://techcommunity.microsoft.com/t5/fasttrack-for-azure/smart-load-balancing-for-openai-endpoints-and-azure-api/ba-p/3991616