diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 1ee49f64d072..f2750ff49b78 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -179,7 +179,7 @@ local function timer_create_checker() if not res_conf then goto continue end - local upstream + local ok, upstream, err local plugin_name = get_plugin_name(resource_path) if plugin_name and plugin_name ~= "" then local _, sub_path = config_util.parse_path(resource_path) @@ -189,7 +189,14 @@ local function timer_create_checker() --- callback construct_upstream to create an upstream dynamically local upstream_constructor_config = jp.value(res_conf.value, json_path) local plugin = require("apisix.plugins." .. plugin_name) - upstream = plugin.construct_upstream(upstream_constructor_config) + ok, upstream, err = pcall(plugin.construct_upstream, upstream_constructor_config) + if not ok or not upstream then + err = err or upstream + core.log.error("[creating checker] unable to construct upstream", + " for plugin: ", plugin_name, ", resource path: ", resource_path, + ", json path: ", json_path, ", error: ", err) + goto continue + end upstream.resource_key = resource_path else upstream = res_conf.value.upstream or res_conf.value @@ -237,7 +244,7 @@ local function timer_working_pool_check() local res_conf = resource.fetch_latest_conf(resource_path) local need_destroy = true if res_conf and res_conf.value then - local upstream + local ok, upstream, err local plugin_name = get_plugin_name(resource_path) if plugin_name and plugin_name ~= "" then local _, sub_path = config_util.parse_path(resource_path) @@ -247,17 +254,37 @@ local function timer_working_pool_check() --- callback construct_upstream to create an upstream dynamically local upstream_constructor_config = jp.value(res_conf.value, json_path) local plugin = require("apisix.plugins." .. plugin_name) - upstream = plugin.construct_upstream(upstream_constructor_config) - upstream.resource_key = resource_path + ok, upstream, err = pcall(plugin.construct_upstream, upstream_constructor_config) + if not ok or not upstream then + -- a nil constructor config means the instance was removed, so let + -- the checker be destroyed; otherwise keep it through a transient failure + if upstream_constructor_config ~= nil then + need_destroy = false + end + err = err or upstream or "unknown error" + upstream = nil + local err_msg = "[checking checker] unable to construct upstream for plugin: " + .. plugin_name .. ", resource path: " .. resource_path + .. ", json path: " .. json_path .. ", error: " .. err + if not ok then + core.log.error(err_msg) + else + core.log.warn(err_msg) + end + else + upstream.resource_key = resource_path + end else upstream = res_conf.value.upstream or res_conf.value end - local current_ver = upstream_utils.version(res_conf.modifiedIndex, - upstream._nodes_ver) - core.log.info("checking working pool for resource: ", resource_path, - " current version: ", current_ver, " item version: ", item.version) - if item.version == current_ver then - need_destroy = false + if upstream then + local current_ver = upstream_utils.version(res_conf.modifiedIndex, + upstream._nodes_ver) + core.log.info("checking working pool for resource: ", resource_path, + " current version: ", current_ver, " item version: ", item.version) + if item.version == current_ver then + need_destroy = false + end end end diff --git a/t/plugin/ai-proxy-multi-construct-upstream-panic.t b/t/plugin/ai-proxy-multi-construct-upstream-panic.t new file mode 100644 index 000000000000..6a8db2787379 --- /dev/null +++ b/t/plugin/ai-proxy-multi-construct-upstream-panic.t @@ -0,0 +1,628 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $user_yaml_config = <<_EOC_; +plugins: + - ai-proxy-multi +_EOC_ + $block->set_value("extra_yaml_config", $user_yaml_config); + + my $http_config = $block->http_config // <<_EOC_; + server { + server_name openai; + listen 127.0.0.1:16724; + + default_type 'application/json'; + + location /v1/chat/completions { + content_by_lua_block { + ngx.say([[{"choices":[{"message":{"content":"ok","role":"assistant"}}]}]]) + } + } + + location /status { + content_by_lua_block { + ngx.say("ok") + } + } + } +_EOC_ + + $block->set_value("http_config", $http_config); +}); + +run_tests(); + +__DATA__ + +=== TEST 1: panic in construct_upstream during checker creation must not crash the timer +# construct_upstream is overridden to raise a Lua error on every call. The +# healthcheck manager creation timer wraps it in pcall, logs the failure and +# skips the bad resource instead of aborting checker creation for the whole +# worker. Proxying is unaffected. +--- extra_init_worker_by_lua + local plugin = require "apisix.plugins.ai-proxy-multi" + plugin.construct_upstream = function(instance) + local panic_check + panic_check.cnt = 1 + end +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("cjson.safe") + + local checks = { + active = { + type = "http", + http_path = "/status", + healthy = { + interval = 1, + successes = 1, + }, + unhealthy = { + interval = 1, + http_failures = 2, + }, + }, + } + + local route = { + uri = "/ai", + plugins = { + ["ai-proxy-multi"] = { + fallback_strategy = "instance_health_and_rate_limiting", + instances = { + { + name = "ai-instance", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-4", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + checks = checks, + }, + { + name = "ai-instance-2", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-3", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + }, + }, + ssl_verify = false, + }, + }, + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(route) + ) + assert(code < 300, body) + + local function send_ai() + local code = t("/ai", + ngx.HTTP_POST, + json.encode({messages = {{role = "user", content = "hi"}}}), + nil, + { + ["Content-Type"] = "application/json", + } + ) + return code + end + + -- queues the checker creation, which the timer picks up next second + assert(send_ai() == 200, "first request should succeed") + ngx.sleep(2) + assert(send_ai() == 200, "second request should succeed") + ngx.say("passed") + } + } +--- timeout: 10 +--- response_body +passed +--- error_log +[creating checker] unable to construct upstream for plugin: ai-proxy-multi, resource path: /apisix/routes/1#plugins['ai-proxy-multi'].instances[0], json path: $.plugins['ai-proxy-multi'].instances[0] +'panic_check' (a nil value) + + + +=== TEST 2: panic in construct_upstream during working pool check must not crash the timer +# The first call (creation timer) succeeds so a checker lands in the working +# pool; later calls (the working pool check timer) raise a Lua error. The +# check timer wraps construct_upstream in pcall, logs and keeps the entry +# instead of crashing the worker. +--- extra_init_worker_by_lua + local plugin = require "apisix.plugins.ai-proxy-multi" + local old_func = plugin.construct_upstream + local cnt = 0 + plugin.construct_upstream = function(instance) + cnt = cnt + 1 + if cnt <= 1 then + return old_func(instance) + end + local panic_check + panic_check.cnt = cnt + end +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("cjson.safe") + + local checks = { + active = { + type = "http", + http_path = "/status", + healthy = { + interval = 1, + successes = 1, + }, + unhealthy = { + interval = 1, + http_failures = 2, + }, + }, + } + + local route = { + uri = "/ai", + plugins = { + ["ai-proxy-multi"] = { + fallback_strategy = "instance_health_and_rate_limiting", + instances = { + { + name = "ai-instance", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-4", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + checks = checks, + }, + { + name = "ai-instance-2", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-3", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + }, + }, + ssl_verify = false, + }, + }, + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(route) + ) + assert(code < 300, body) + + local function send_ai() + local code = t("/ai", + ngx.HTTP_POST, + json.encode({messages = {{role = "user", content = "hi"}}}), + nil, + { + ["Content-Type"] = "application/json", + } + ) + return code + end + + assert(send_ai() == 200, "first request should succeed") + ngx.sleep(2) + assert(send_ai() == 200, "second request should succeed") + ngx.say("passed") + } + } +--- timeout: 10 +--- response_body +passed +--- error_log +[checking checker] unable to construct upstream for plugin: ai-proxy-multi, resource path: /apisix/routes/1#plugins['ai-proxy-multi'].instances[0], json path: $.plugins['ai-proxy-multi'].instances[0] +'panic_check' (a nil value) + + + +=== TEST 3: construct_upstream returning nil, err during checker creation must not crash the timer +# construct_upstream returns its normal failure tuple (nil, err) instead of +# throwing. The creation timer logs the error and skips the bad resource. +--- extra_init_worker_by_lua + local plugin = require "apisix.plugins.ai-proxy-multi" + plugin.construct_upstream = function(instance) + return nil, "boom" + end +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("cjson.safe") + + local checks = { + active = { + type = "http", + http_path = "/status", + healthy = { + interval = 1, + successes = 1, + }, + unhealthy = { + interval = 1, + http_failures = 2, + }, + }, + } + + local route = { + uri = "/ai", + plugins = { + ["ai-proxy-multi"] = { + fallback_strategy = "instance_health_and_rate_limiting", + instances = { + { + name = "ai-instance", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-4", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + checks = checks, + }, + { + name = "ai-instance-2", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-3", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + }, + }, + ssl_verify = false, + }, + }, + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(route) + ) + assert(code < 300, body) + + local function send_ai() + local code = t("/ai", + ngx.HTTP_POST, + json.encode({messages = {{role = "user", content = "hi"}}}), + nil, + { + ["Content-Type"] = "application/json", + } + ) + return code + end + + assert(send_ai() == 200, "first request should succeed") + ngx.sleep(2) + assert(send_ai() == 200, "second request should succeed") + ngx.say("passed") + } + } +--- timeout: 10 +--- response_body +passed +--- error_log +[creating checker] unable to construct upstream for plugin: ai-proxy-multi, resource path: /apisix/routes/1#plugins['ai-proxy-multi'].instances[0], json path: $.plugins['ai-proxy-multi'].instances[0], error: boom + + + +=== TEST 4: construct_upstream returning a bare nil during working pool check must not crash the timer +# The first call (creation timer) succeeds; later calls return a bare nil with +# no error string. The check timer must not crash on the nil error and must +# keep the existing checker instead of destroying it. +--- extra_init_worker_by_lua + local plugin = require "apisix.plugins.ai-proxy-multi" + local old_func = plugin.construct_upstream + local cnt = 0 + plugin.construct_upstream = function(instance) + cnt = cnt + 1 + if cnt <= 1 then + return old_func(instance) + end + return nil + end +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("cjson.safe") + + local checks = { + active = { + type = "http", + http_path = "/status", + healthy = { + interval = 1, + successes = 1, + }, + unhealthy = { + interval = 1, + http_failures = 2, + }, + }, + } + + local route = { + uri = "/ai", + plugins = { + ["ai-proxy-multi"] = { + fallback_strategy = "instance_health_and_rate_limiting", + instances = { + { + name = "ai-instance", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-4", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + checks = checks, + }, + { + name = "ai-instance-2", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-3", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + }, + }, + ssl_verify = false, + }, + }, + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(route) + ) + assert(code < 300, body) + + local function send_ai() + local code = t("/ai", + ngx.HTTP_POST, + json.encode({messages = {{role = "user", content = "hi"}}}), + nil, + { + ["Content-Type"] = "application/json", + } + ) + return code + end + + assert(send_ai() == 200, "first request should succeed") + ngx.sleep(2) + assert(send_ai() == 200, "second request should succeed") + ngx.say("passed") + } + } +--- timeout: 10 +--- response_body +passed +--- error_log +[checking checker] unable to construct upstream for plugin: ai-proxy-multi, resource path: /apisix/routes/1#plugins['ai-proxy-multi'].instances[0], json path: $.plugins['ai-proxy-multi'].instances[0], error: unknown error + + + +=== TEST 5: removed instance must destroy its checker instead of leaking it +# A checker is created for instances[0], then the ai-proxy-multi plugin is +# removed from the route. The json path no longer resolves, so +# construct_upstream gets a nil config; the working pool check must treat that +# as a real removal and release the stale checker. +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("cjson.safe") + + local checks = { + active = { + type = "http", + http_path = "/status", + healthy = { + interval = 1, + successes = 1, + }, + unhealthy = { + interval = 1, + http_failures = 2, + }, + }, + } + + local route = { + uri = "/ai", + plugins = { + ["ai-proxy-multi"] = { + fallback_strategy = "instance_health_and_rate_limiting", + instances = { + { + name = "ai-instance", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-4", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + checks = checks, + }, + { + name = "ai-instance-2", + provider = "openai", + weight = 1, + auth = { + header = { + Authorization = "Bearer token", + }, + }, + options = { + model = "gpt-3", + }, + override = { + endpoint = "http://127.0.0.1:16724", + }, + }, + }, + ssl_verify = false, + }, + }, + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(route) + ) + assert(code < 300, body) + + local function send_ai() + local code = t("/ai", + ngx.HTTP_POST, + json.encode({messages = {{role = "user", content = "hi"}}}), + nil, + { + ["Content-Type"] = "application/json", + } + ) + return code + end + + -- create the checker for instances[0] + assert(send_ai() == 200, "request should succeed") + ngx.sleep(2) + + -- drop ai-proxy-multi so the instance's json path resolves to nil + local plain = { + uri = "/ai", + upstream = { + type = "roundrobin", + nodes = { + ["127.0.0.1:16724"] = 1, + }, + }, + } + code, body = t('/apisix/admin/routes/1', ngx.HTTP_PUT, json.encode(plain)) + assert(code < 300, body) + ngx.sleep(2) + ngx.say("passed") + } + } +--- timeout: 10 +--- response_body +passed +--- error_log +try to release checker: +for resource: /apisix/routes/1#plugins['ai-proxy-multi'].instances[0] and version