From ef92b254bf0fbbf36223db14d35b769ac0aaa980 Mon Sep 17 00:00:00 2001 From: Elio Di Nino <52972357+ElioDiNino@users.noreply.github.com> Date: Wed, 18 Feb 2026 10:42:58 -0800 Subject: [PATCH] fix(agent): Retry Docker check on non-200 HTTP response (#1754) The previous behavior only caught some errors including inaccessible hosts, but not others like failed authentication or service unavailability. This largely applies when using a socket proxy and having the retry mitigates some erroneous behavior. --- agent/docker.go | 10 ++-- agent/docker_test.go | 115 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 3 deletions(-) diff --git a/agent/docker.go b/agent/docker.go index f89e7497..13e867c8 100644 --- a/agent/docker.go +++ b/agent/docker.go @@ -594,18 +594,22 @@ func (dm *dockerManager) checkDockerVersion() { const versionMaxTries = 2 for i := 1; i <= versionMaxTries; i++ { resp, err = dm.client.Get("http://localhost/version") - if err == nil { + if err == nil && resp.StatusCode == http.StatusOK { break } if resp != nil { resp.Body.Close() } if i < versionMaxTries { - slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err) + if err != nil { + slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err) + } else { + slog.Debug("Failed to get Docker version; retrying", "attempt", i, "status code", resp.StatusCode) + } time.Sleep(5 * time.Second) } } - if err != nil { + if err != nil || resp.StatusCode != http.StatusOK { return } if err := dm.decode(resp, &versionInfo); err != nil { diff --git a/agent/docker_test.go b/agent/docker_test.go index 11a1945a..16a8890c 100644 --- a/agent/docker_test.go +++ b/agent/docker_test.go @@ -5,7 +5,13 @@ package agent import ( "bytes" + "context" "encoding/json" + "errors" + "fmt" + "net" + "net/http" + "net/http/httptest" "os" "strings" "testing" @@ -379,6 +385,115 @@ func TestDockerManagerCreation(t *testing.T) { assert.NotNil(t, dm.networkRecvTrackers) } +func TestCheckDockerVersion(t *testing.T) { + tests := []struct { + name string + responses []struct { + statusCode int + body string + } + expectedGood bool + expectedRequests int + }{ + { + name: "200 with good version on first try", + responses: []struct { + statusCode int + body string + }{ + {http.StatusOK, `{"Version":"25.0.1"}`}, + }, + expectedGood: true, + expectedRequests: 1, + }, + { + name: "200 with old version on first try", + responses: []struct { + statusCode int + body string + }{ + {http.StatusOK, `{"Version":"24.0.7"}`}, + }, + expectedGood: false, + expectedRequests: 1, + }, + { + name: "non-200 then 200 with good version", + responses: []struct { + statusCode int + body string + }{ + {http.StatusServiceUnavailable, `"not ready"`}, + {http.StatusOK, `{"Version":"25.1.0"}`}, + }, + expectedGood: true, + expectedRequests: 2, + }, + { + name: "non-200 on all retries", + responses: []struct { + statusCode int + body string + }{ + {http.StatusInternalServerError, `"error"`}, + {http.StatusUnauthorized, `"error"`}, + }, + expectedGood: false, + expectedRequests: 2, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + requestCount := 0 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + idx := requestCount + requestCount++ + if idx >= len(tt.responses) { + idx = len(tt.responses) - 1 + } + w.WriteHeader(tt.responses[idx].statusCode) + fmt.Fprint(w, tt.responses[idx].body) + })) + defer server.Close() + + dm := &dockerManager{ + client: &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, network, _ string) (net.Conn, error) { + return net.Dial(network, server.Listener.Addr().String()) + }, + }, + }, + } + + dm.checkDockerVersion() + + assert.Equal(t, tt.expectedGood, dm.goodDockerVersion) + assert.Equal(t, tt.expectedRequests, requestCount) + }) + } + + t.Run("request error on all retries", func(t *testing.T) { + requestCount := 0 + dm := &dockerManager{ + client: &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + requestCount++ + return nil, errors.New("connection refused") + }, + }, + }, + } + + dm.checkDockerVersion() + + assert.False(t, dm.goodDockerVersion) + assert.Equal(t, 2, requestCount) + }) +} + func TestCycleCpuDeltas(t *testing.T) { dm := &dockerManager{ lastCpuContainer: map[uint16]map[string]uint64{