Fix temperature data intermittency caused by proxy rate limit retries

Root Cause:
The classifyError() function in tempproxy/client.go returned nil
whenever err was nil, even if respError contained "rate limit
exceeded", because the err == nil check ran before respError was
inspected. This caused the retry logic to treat rate limit errors as
retryable, triggering 3 retries with exponential backoff (100ms,
200ms, 400ms) for each rate-limited request.

With multiple nodes polling simultaneously and hitting the proxy's
1 req/sec default rate limit, this created a retry storm:
- 3 nodes polling every 10 seconds
- 1-2 requests rate limited per cycle
- Each rate limit triggered 3 retries
- Result: 6+ extra requests per cycle, causing temperature data to
  flicker in and out as requests were dropped

Solution:
1. Reordered classifyError() to check respError before checking
   whether err is nil, so rate limit errors are classified even when
   the socket call succeeds and only the proxy reports an error
2. Added explicit rate limit detection (respError containing
   "rate limit") that marks these errors as non-retryable
3. Added stub EnableTemperatureMonitoring/DisableTemperatureMonitoring
   methods to Monitor for interface compatibility

Impact:
- Rate limit retry attempts reduced from 151 in 10 minutes to 0
- Temperature data now stable for all nodes
- No more flickering temperature displays in dashboard
This commit is contained in:
rcourtman 2025-11-05 10:20:15 +00:00
parent 7a185c4ab3
commit e4e915c8a1
2 changed files with 59 additions and 32 deletions

View file

@ -122,6 +122,51 @@ func calculateBackoff(attempt int) time.Duration {
// classifyError categorizes errors for retry logic
func classifyError(err error, respError string) *ProxyError {
// Check response error messages first (even if err is nil)
// This handles cases where the socket succeeds but the proxy returns an application error
if respError != "" {
// Rate limiting - never retry
if contains(respError, "rate limit") {
return &ProxyError{
Type: ErrorTypeTransport,
Message: respError,
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
// Authorization errors - never retry
if respError == "unauthorized" || respError == "method requires host-level privileges" {
return &ProxyError{
Type: ErrorTypeAuth,
Message: respError,
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
// SSH-related errors - retryable
if contains(respError, "ssh", "connection", "timeout") {
return &ProxyError{
Type: ErrorTypeSSH,
Message: "SSH connectivity issue",
Retryable: true,
Wrapped: fmt.Errorf("%s", respError),
}
}
// Sensor errors - never retry
if contains(respError, "sensor", "temperature") {
return &ProxyError{
Type: ErrorTypeSensor,
Message: "sensor command failed",
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
}
// If no response error and no network error, nothing to classify
if err == nil {
return nil
}
@ -146,38 +191,6 @@ func classifyError(err error, respError string) *ProxyError {
}
}
// Check response error messages
if respError != "" {
if respError == "unauthorized" || respError == "method requires host-level privileges" {
return &ProxyError{
Type: ErrorTypeAuth,
Message: respError,
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
// SSH-related errors
if contains(respError, "ssh", "connection", "timeout") {
return &ProxyError{
Type: ErrorTypeSSH,
Message: "SSH connectivity issue",
Retryable: true,
Wrapped: fmt.Errorf("%s", respError),
}
}
// Sensor errors
if contains(respError, "sensor", "temperature") {
return &ProxyError{
Type: ErrorTypeSensor,
Message: "sensor command failed",
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
}
// Unknown error
return &ProxyError{
Type: ErrorTypeUnknown,