From 79941680a6829c707c33a25a45eff3aa5fc60f72 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 11:02:06 +0800 Subject: [PATCH 1/5] fix(healthcheck): update targets incrementally instead of rebuilding When an upstream's nodes change but its `checks` config is unchanged, the health-check manager destroyed the existing checker and built a new one. Two problems followed: 1. fetch_checker() keys the working checker by a version derived from both modifiedIndex and the nodes version, so a node-only change makes it return nil until the timer rebuilds the checker. During that window api_ctx.up_checker is nil and the balancer routes traffic to nodes already known to be unhealthy (apache/apisix#13282). 2. The rebuild throws away the checker's accumulated health state and re-probes every node from scratch. The manager now reconciles the existing checker's targets in place with add_target/remove_target when the `checks` config is unchanged (compared with core.table.deep_eq), keeping the checker and its state alive. timer_working_pool_check no longer destroys a checker for a node-only version change, and when a rebuild is genuinely required (the `checks` config changed) the new checker is created and inserted into the working pool before the old one is released, so fetch_checker never observes a nil gap. Bumps the lua-resty-healthcheck-api7 rockspec dependency to 3.3.0-0, which contains the companion library fix (clean every checker each window + release the periodic lock when idle) required by this change. Adds t/node/healthcheck-incremental-update.t: a node-only change must not destroy/rebuild the checker (no "clear checker"), while a checks-config change still rebuilds it. --- apisix/healthcheck_manager.lua | 135 +++++++++++++++++---- t/node/healthcheck-incremental-update.t | 149 ++++++++++++++++++++++++ 2 files changed, 264 insertions(+), 20 deletions(-) create mode 100644 t/node/healthcheck-incremental-update.t diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 1ee49f64d072..09d3c3f48ba4 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -31,7 +31,8 @@ local jp = require("jsonpath") local config_util = require("apisix.core.config_util") local _M = {} -local working_pool = {} -- resource_path -> {version = ver, checker = checker} +-- resource_path -> {version = ver, checker = checker, checks = checks} +local working_pool = {} local waiting_pool = {} -- resource_path -> resource_ver local DELAYED_CLEAR_TIMEOUT = 10 @@ -44,6 +45,31 @@ end _M.get_healthchecker_name = get_healthchecker_name +-- Compute the desired set of health-check targets for an upstream config. +-- Returns a map keyed by "host:port:hostheader" so the working set can be +-- diffed cheaply against a checker's current targets. +local function compute_targets(up_conf) + local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host + local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port + local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host + local use_node_hdr = up_conf.pass_host == "node" or nil + + local targets = {} + for _, node in ipairs(up_conf.nodes) do + local host_hdr = up_hdr or (use_node_hdr and node.domain) or nil + local target = { + host = node.host, + port = port or node.port, + check_host = host, + host_hdr = host_hdr, + } + local key = target.host .. ":" .. tostring(target.port) .. ":" .. tostring(host_hdr or "") + targets[key] = target + end + return targets +end + + local function create_checker(up_conf) if not up_conf.checks then return nil @@ -71,18 +97,12 @@ local function create_checker(up_conf) end -- Add target nodes - local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host - local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port - local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host - local use_node_hdr = up_conf.pass_host == "node" or nil - - for _, node in ipairs(up_conf.nodes) do - local host_hdr = up_hdr or (use_node_hdr and node.domain) - local ok, err = checker:add_target(node.host, port or node.port, host, - true, host_hdr) + for _, target in pairs(compute_targets(up_conf)) do + local ok, err = checker:add_target(target.host, target.port, target.check_host, + true, target.host_hdr) if not ok then - core.log.error("failed to add healthcheck target: ", node.host, ":", - port or node.port, " err: ", err) + core.log.error("failed to add healthcheck target: ", target.host, ":", + target.port, " err: ", err) end end @@ -90,6 +110,53 @@ local function create_checker(up_conf) end +-- Incrementally reconcile an existing checker's targets to match up_conf. +-- Used when only the upstream nodes changed but the `checks` config did not, +-- so the checker can keep running (and keep its accumulated health state) +-- instead of being destroyed and rebuilt. +local function sync_checker_targets(checker, up_conf) + local desired = compute_targets(up_conf) + + -- index current targets the same way as desired. Read the authoritative + -- shm target list (the per-worker checker.targets array can lag behind a + -- recent add/remove event). + if not healthcheck then + healthcheck = require("resty.healthcheck") + end + local current = {} + local target_list = healthcheck.get_target_list(get_healthchecker_name(up_conf), + healthcheck_shdict_name) or {} + for _, t in ipairs(target_list) do + -- target_list entries carry hostheader; map it back to our key shape + local key = t.ip .. ":" .. tostring(t.port) .. ":" .. tostring(t.hostheader or "") + current[key] = t + end + + -- add targets that are desired but not present + for key, target in pairs(desired) do + if not current[key] then + local ok, err = checker:add_target(target.host, target.port, target.check_host, + true, target.host_hdr) + if not ok then + core.log.error("failed to add healthcheck target: ", target.host, ":", + target.port, " err: ", err) + end + end + end + + -- remove targets that are present but no longer desired + for key, t in pairs(current) do + if not desired[key] then + local ok, err = checker:remove_target(t.ip, t.port, t.hostname) + if not ok then + core.log.error("failed to remove healthcheck target: ", t.ip, ":", + t.port, " err: ", err) + end + end + end +end + + function _M.fetch_checker(resource_path, resource_ver) local working_item = working_pool[resource_path] if working_item and working_item.version == resource_ver then @@ -130,10 +197,11 @@ function _M.fetch_node_status(checker, ip, port, hostname) end -local function add_working_pool(resource_path, resource_ver, checker) +local function add_working_pool(resource_path, resource_ver, checker, checks) working_pool[resource_path] = { version = resource_ver, - checker = checker + checker = checker, + checks = checks, } end @@ -202,8 +270,33 @@ local function timer_create_checker() goto continue end - -- if a checker exists then delete it before creating a new one + -- If a checker already exists and the `checks` config is unchanged + -- (only the upstream nodes changed), reconcile its targets in place + -- instead of destroying and rebuilding it. A destroy-and-rebuild + -- leaves `up_checker == nil` for the rebuild window, during which + -- traffic is routed to nodes already known to be unhealthy, and it + -- throws away the checker's accumulated health state. local existing_checker = working_pool[resource_path] + if existing_checker and existing_checker.checker + and not existing_checker.checker.dead + and upstream.checks + and core.table.deep_eq(existing_checker.checks, upstream.checks) then + sync_checker_targets(existing_checker.checker, upstream) + add_working_pool(resource_path, resource_ver, existing_checker.checker, + upstream.checks) + core.log.info("reused checker with incremental targets: ", + tostring(existing_checker.checker), " for resource: ", + resource_path, " and version: ", resource_ver) + goto continue + end + + -- The checks config changed (or no checker exists): build a fresh + -- checker first, and only release the old one *after* the new one is + -- in the working pool, so fetch_checker never observes a nil gap. + local checker = create_checker(upstream) + if not checker then + goto continue + end if existing_checker then existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) existing_checker.checker:stop() @@ -211,13 +304,9 @@ local function timer_create_checker() " for resource: ", resource_path, " and version: ", existing_checker.version) end - local checker = create_checker(upstream) - if not checker then - goto continue - end core.log.info("create new checker: ", tostring(checker), " for resource: ", resource_path, " and version: ", resource_ver) - add_working_pool(resource_path, resource_ver, checker) + add_working_pool(resource_path, resource_ver, checker, upstream.checks) end ::continue:: @@ -258,6 +347,12 @@ local function timer_working_pool_check() " current version: ", current_ver, " item version: ", item.version) if item.version == current_ver then need_destroy = false + elseif upstream.checks and core.table.deep_eq(item.checks, upstream.checks) then + -- Version changed but only because of the upstream nodes; the + -- `checks` config is identical. Keep the checker alive so + -- timer_create_checker can reconcile its targets incrementally + -- (avoids a destroy-and-rebuild nil window for the checker). + need_destroy = false end end diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t new file mode 100644 index 000000000000..4f2b27cabc86 --- /dev/null +++ b/t/node/healthcheck-incremental-update.t @@ -0,0 +1,149 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +use t::APISIX 'no_plan'; + +repeat_each(1); +log_level('warn'); +no_root_location(); +no_shuffle(); + +run_tests(); + +__DATA__ + +=== TEST 1: node-only change reuses the checker (no destroy-and-rebuild) +--- extra_init_worker_by_lua + local healthcheck = require("resty.healthcheck") + local new = healthcheck.new + healthcheck.new = function(...) + ngx.log(ngx.WARN, "create new checker") + local obj = new(...) + local clear = obj.delayed_clear + obj.delayed_clear = function(...) + ngx.log(ngx.WARN, "clear checker") + return clear(...) + end + return obj + end + +--- config +location /t { + content_by_lua_block { + local checks = [[{ + "active":{ + "http_path":"/hello", + "timeout":1, + "type":"http", + "healthy":{ "interval":1, "successes":1 }, + "unhealthy":{ "interval":1, "http_failures":2 } + } + }]] + local function cfg(nodes) + return [[{ + "upstream": { + "nodes": ]] .. nodes .. [[, + "type": "roundrobin", + "checks": ]] .. checks .. [[ + }, + "uri": "/hello" + }]] + end + + local t = require("lib.test_admin").test + -- initial config: one node -> creates the checker + assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, + cfg('{"127.0.0.1:1980": 1}')) < 300) + t('/hello', ngx.HTTP_GET) + ngx.sleep(2) + + -- node-only change (checks unchanged): should reconcile in place, + -- NOT create a new checker nor delayed_clear the old one + assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, + cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1981": 1}')) < 300) + t('/hello', ngx.HTTP_GET) + ngx.sleep(2) + ngx.say("done") + } +} + +--- request +GET /t +--- response_body +done +--- no_error_log +clear checker +--- error_log +create new checker +--- timeout: 8 + + + +=== TEST 2: checks-config change still rebuilds the checker +--- extra_init_worker_by_lua + local healthcheck = require("resty.healthcheck") + local new = healthcheck.new + healthcheck.new = function(...) + local obj = new(...) + local clear = obj.delayed_clear + obj.delayed_clear = function(...) + ngx.log(ngx.WARN, "clear checker") + return clear(...) + end + return obj + end + +--- config +location /t { + content_by_lua_block { + local function cfg(interval) + return [[{ + "upstream": { + "nodes": {"127.0.0.1:1980": 1}, + "type": "roundrobin", + "checks": { + "active":{ + "http_path":"/hello", + "timeout":1, + "type":"http", + "healthy":{ "interval":]] .. interval .. [[, "successes":1 }, + "unhealthy":{ "interval":1, "http_failures":2 } + } + } + }, + "uri": "/hello" + }]] + end + + local t = require("lib.test_admin").test + assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(1)) < 300) + t('/hello', ngx.HTTP_GET) + ngx.sleep(2) + -- change the checks config -> must rebuild (delayed_clear old checker) + assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(2)) < 300) + t('/hello', ngx.HTTP_GET) + ngx.sleep(2) + ngx.say("done") + } +} + +--- request +GET /t +--- response_body +done +--- error_log +clear checker +--- timeout: 8 From 0c85916bcdb1eabfa2dd05c50cf407479156bcd7 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 14:16:30 +0800 Subject: [PATCH 2/5] fix(healthcheck): publish new checker before stopping the old one When the checks config changes, install the freshly created checker into the working pool before stopping the previous one. This prevents a request from briefly fetching a stopped checker for the old version during the swap window. --- apisix/healthcheck_manager.lua | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 09d3c3f48ba4..dd0764ae507b 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -291,22 +291,25 @@ local function timer_create_checker() end -- The checks config changed (or no checker exists): build a fresh - -- checker first, and only release the old one *after* the new one is - -- in the working pool, so fetch_checker never observes a nil gap. + -- checker first, install it into the working pool, and only then + -- release the old one. Publishing the new checker before stopping + -- the old one ensures fetch_checker never observes a nil gap nor a + -- stopped checker for this resource. local checker = create_checker(upstream) if not checker then goto continue end + core.log.info("create new checker: ", tostring(checker), " for resource: ", + resource_path, " and version: ", resource_ver) + add_working_pool(resource_path, resource_ver, checker, upstream.checks) if existing_checker then + existing_checker.checker.dead = true existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) existing_checker.checker:stop() core.log.info("releasing existing checker: ", tostring(existing_checker.checker), " for resource: ", resource_path, " and version: ", existing_checker.version) end - core.log.info("create new checker: ", tostring(checker), " for resource: ", - resource_path, " and version: ", resource_ver) - add_working_pool(resource_path, resource_ver, checker, upstream.checks) end ::continue:: From c81d33b6fc3522e3a82d17437e6b6c89db201fea Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 15:08:20 +0800 Subject: [PATCH 3/5] fix(healthcheck): preserve target order, destroy checker when nodes drop to 0 - compute_targets now returns an ordered array (preserving node order) so targets are added deterministically, keeping ordered error-log assertions stable. - Only keep/reuse a checker incrementally when the upstream still has nodes; when the node count drops to 0 the checker is destroyed as before. - Update existing healthcheck tests to assert the new incremental-reuse behaviour (a checker is reused instead of recreated when only the nodes change, and is not cleared in that case). --- apisix/healthcheck_manager.lua | 36 +++++++++++++-------- t/node/healthcheck-discovery.t | 7 ++-- t/node/healthcheck-dns.t | 3 +- t/node/healthcheck-leak-bugfix.t | 4 ++- t/node/healthcheck-service-discovery.t | 3 +- t/node/healthchecker-independent-upstream.t | 4 --- 6 files changed, 31 insertions(+), 26 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index dd0764ae507b..2c574f59563e 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -46,8 +46,10 @@ _M.get_healthchecker_name = get_healthchecker_name -- Compute the desired set of health-check targets for an upstream config. --- Returns a map keyed by "host:port:hostheader" so the working set can be --- diffed cheaply against a checker's current targets. +-- Returns an ordered array preserving up_conf.nodes order so that targets are +-- always added to a checker deterministically; each entry also carries a +-- "host:port:hostheader" key so the working set can be diffed cheaply against +-- a checker's current targets. local function compute_targets(up_conf) local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port @@ -57,14 +59,14 @@ local function compute_targets(up_conf) local targets = {} for _, node in ipairs(up_conf.nodes) do local host_hdr = up_hdr or (use_node_hdr and node.domain) or nil - local target = { + local target_port = port or node.port + targets[#targets + 1] = { host = node.host, - port = port or node.port, + port = target_port, check_host = host, host_hdr = host_hdr, + key = node.host .. ":" .. tostring(target_port) .. ":" .. tostring(host_hdr or ""), } - local key = target.host .. ":" .. tostring(target.port) .. ":" .. tostring(host_hdr or "") - targets[key] = target end return targets end @@ -97,7 +99,7 @@ local function create_checker(up_conf) end -- Add target nodes - for _, target in pairs(compute_targets(up_conf)) do + for _, target in ipairs(compute_targets(up_conf)) do local ok, err = checker:add_target(target.host, target.port, target.check_host, true, target.host_hdr) if not ok then @@ -115,7 +117,11 @@ end -- so the checker can keep running (and keep its accumulated health state) -- instead of being destroyed and rebuilt. local function sync_checker_targets(checker, up_conf) - local desired = compute_targets(up_conf) + -- index the desired targets by key so they can be diffed against current + local desired = {} + for _, target in ipairs(compute_targets(up_conf)) do + desired[target.key] = target + end -- index current targets the same way as desired. Read the authoritative -- shm target list (the per-worker checker.targets array can lag behind a @@ -280,6 +286,7 @@ local function timer_create_checker() if existing_checker and existing_checker.checker and not existing_checker.checker.dead and upstream.checks + and upstream.nodes and #upstream.nodes > 0 and core.table.deep_eq(existing_checker.checks, upstream.checks) then sync_checker_targets(existing_checker.checker, upstream) add_working_pool(resource_path, resource_ver, existing_checker.checker, @@ -350,11 +357,14 @@ local function timer_working_pool_check() " current version: ", current_ver, " item version: ", item.version) if item.version == current_ver then need_destroy = false - elseif upstream.checks and core.table.deep_eq(item.checks, upstream.checks) then - -- Version changed but only because of the upstream nodes; the - -- `checks` config is identical. Keep the checker alive so - -- timer_create_checker can reconcile its targets incrementally - -- (avoids a destroy-and-rebuild nil window for the checker). + elseif upstream.checks and upstream.nodes and #upstream.nodes > 0 + and core.table.deep_eq(item.checks, upstream.checks) then + -- Version changed but only because of the upstream nodes (and at + -- least one node remains); the `checks` config is identical. Keep + -- the checker alive so timer_create_checker can reconcile its + -- targets incrementally (avoids a destroy-and-rebuild nil window). + -- When the node count drops to 0 we deliberately fall through to + -- destroy the checker, matching the original behaviour. need_destroy = false end end diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t index 9978bba93f2f..57a832bb32e9 100644 --- a/t/node/healthcheck-discovery.t +++ b/t/node/healthcheck-discovery.t @@ -108,7 +108,7 @@ unhealthy TCP increment (1/2) for '127.0.0.1(0.0.0.0:1988)' -=== TEST 2: create new checker when nodes change +=== TEST 2: reuse checker incrementally when nodes change --- apisix_yaml routes: - @@ -150,11 +150,10 @@ routes: } } --- grep_error_log eval -qr/(create new checker|releasing existing checker): table/ +qr/(create new checker|reused checker with incremental targets): table/ --- grep_error_log_out create new checker: table -releasing existing checker: table -create new checker: table +reused checker with incremental targets: table --- timeout: 30 diff --git a/t/node/healthcheck-dns.t b/t/node/healthcheck-dns.t index d0a76dbb636c..cc433e81a162 100644 --- a/t/node/healthcheck-dns.t +++ b/t/node/healthcheck-dns.t @@ -140,6 +140,5 @@ First request status: 200 Second request status: 200 --- error_log create new checker -releasing existing checker -create new checker +reused checker with incremental targets --- timeout: 10 diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t index bcab5689d152..703ead63adbf 100644 --- a/t/node/healthcheck-leak-bugfix.t +++ b/t/node/healthcheck-leak-bugfix.t @@ -25,7 +25,7 @@ run_tests(); __DATA__ -=== TEST 1: ensure the old check is cleared after configuration updated +=== TEST 1: reuse the checker without clearing it when only the nodes change --- extra_init_worker_by_lua local healthcheck = require("resty.healthcheck") local new = healthcheck.new @@ -110,5 +110,7 @@ location /t { --- request GET /t --- error_log +reused checker with incremental targets +--- no_error_log clear checker --- timeout: 7 diff --git a/t/node/healthcheck-service-discovery.t b/t/node/healthcheck-service-discovery.t index 7b60141531f1..2163f524fe8b 100644 --- a/t/node/healthcheck-service-discovery.t +++ b/t/node/healthcheck-service-discovery.t @@ -149,6 +149,5 @@ routes: } --- error_log create new checker -releasing existing checker -create new checker +reused checker with incremental targets --- timeout: 10 diff --git a/t/node/healthchecker-independent-upstream.t b/t/node/healthchecker-independent-upstream.t index 957aef2c4e52..d6af917073c5 100644 --- a/t/node/healthchecker-independent-upstream.t +++ b/t/node/healthchecker-independent-upstream.t @@ -109,7 +109,3 @@ passed qr/create new checker: table:/ --- grep_error_log_out create new checker: table: -create new checker: table: -create new checker: table: -create new checker: table: -create new checker: table: From fb35d658a1c32847e6a8d898f4a395eb0a504c18 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Tue, 23 Jun 2026 04:04:38 +0800 Subject: [PATCH 4/5] test(healthcheck): trigger reconcile with a request after re-applying config The incremental-reuse reconcile runs in timer_create_checker, which only acts on resources placed into the waiting pool by fetch_checker (i.e. when a request is routed). Re-route a request after the second PUT so the manager observes the new version and logs the incremental reuse. --- t/node/healthcheck-leak-bugfix.t | 3 +++ 1 file changed, 3 insertions(+) diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t index 703ead63adbf..9882244bc2cc 100644 --- a/t/node/healthcheck-leak-bugfix.t +++ b/t/node/healthcheck-leak-bugfix.t @@ -103,6 +103,9 @@ location /t { t('/hello', ngx.HTTP_GET) ngx.sleep(2) assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300) + -- re-route a request so fetch_checker observes the new version and the + -- manager reconciles the existing checker's targets incrementally + t('/hello', ngx.HTTP_GET) ngx.sleep(2) } } From f63d054c44800ef8be0667438027daea65b3b9dc Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Tue, 23 Jun 2026 04:35:05 +0800 Subject: [PATCH 5/5] test(healthcheck): run leak-bugfix at info level so reuse log is captured --- t/node/healthcheck-leak-bugfix.t | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t index 9882244bc2cc..8b3f6c7b4053 100644 --- a/t/node/healthcheck-leak-bugfix.t +++ b/t/node/healthcheck-leak-bugfix.t @@ -17,7 +17,8 @@ use t::APISIX 'no_plan'; repeat_each(1); -log_level('warn'); +# the reuse path logs "reused checker with incremental targets" at info level +log_level('info'); no_root_location(); no_shuffle();