Mirror of https://github.com/safing/portbase, synced 2025-04-23 02:39:09 +00:00

Merge pull request #224 from safing/fix/metrics-modules-api
Fix metrics, modules, api

Commit 83b709526e
10 changed files with 133 additions and 116 deletions

@@ -151,7 +151,7 @@ func authenticateRequest(w http.ResponseWriter, r *http.Request, targetHandler h
 	switch requiredPermission { //nolint:exhaustive
 	case NotFound:
 		// Not found.
-		tracer.Trace("api: authenticated handler reported: not found")
+		tracer.Debug("api: no API endpoint registered for this path")
 		http.Error(w, "Not found.", http.StatusNotFound)
 		return nil
 	case NotSupported:

@@ -235,6 +235,7 @@ func (mh *mainHandler) handle(w http.ResponseWriter, r *http.Request) error {
 		http.Error(lrw, "Method not allowed.", http.StatusMethodNotAllowed)
 		return nil
 	default:
+		tracer.Debug("api: no handler registered for this path")
 		http.Error(lrw, "Not found.", http.StatusNotFound)
 		return nil
 	}

@@ -11,7 +11,6 @@ import (
 	"github.com/safing/portbase/api"
 	"github.com/safing/portbase/config"
 	"github.com/safing/portbase/log"
-	"github.com/safing/portbase/utils"
 )

 func registerAPI() error {

@@ -140,11 +139,7 @@ func writeMetricsTo(ctx context.Context, url string) error {
 	)
 }

-var metricsPusherDone = utils.NewBroadcastFlag()
-
 func metricsWriter(ctx context.Context) error {
-	defer metricsPusherDone.NotifyAndReset()
-
 	pushURL := pushOption()
 	ticker := module.NewSleepyTicker(1*time.Minute, 0)
 	defer ticker.Stop()

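The pusher wakes up once a minute and pushes the collected metrics to the configured URL; module.NewSleepyTicker is portbase's module-aware ticker that additionally pauses while the module sleeps. As a rough, self-contained sketch of the same loop shape using only the standard library (the function and parameter names here are illustrative, not part of portbase):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// pushLoop is a stand-in for metricsWriter's push loop, built on the standard
// library time.Ticker instead of module.NewSleepyTicker.
func pushLoop(ctx context.Context, interval time.Duration, push func() error) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			if err := push(); err != nil {
				return err
			}
		case <-ctx.Done():
			return nil
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()

	_ = pushLoop(ctx, 100*time.Millisecond, func() error {
		fmt.Println("pushing metrics")
		return nil
	})
}
```
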
@@ -16,7 +16,7 @@ import (

 const hostStatTTL = 1 * time.Second

-func registeHostMetrics() (err error) {
+func registerHostMetrics() (err error) {
 	// Register load average metrics.
 	_, err = NewGauge("host/load/avg/1", nil, getFloat64HostStat(LoadAvg1), &Options{Name: "Host Load Avg 1min", Permission: api.PermitUser})
 	if err != nil {

@@ -3,10 +3,13 @@ package metrics
 import (
 	"runtime"
 	"strings"
+	"sync/atomic"

 	"github.com/safing/portbase/info"
 )

+var reportedStart atomic.Bool
+
 func registerInfoMetric() error {
 	meta := info.GetInfo()
 	_, err := NewGauge(

@@ -26,6 +29,10 @@ func registerInfoMetric() error {
 			"comment": commentOption(),
 		},
 		func() float64 {
+			// Report as 0 the first time in order to detect (re)starts.
+			if reportedStart.CompareAndSwap(false, true) {
+				return 0
+			}
 			return 1
 		},
 		nil,

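The gauge now encodes restart detection in its value: the very first report after a (re)start is 0, every later report is 1, so a scrape that sees a 0 knows the process restarted since the previous scrape. A minimal stand-alone sketch of the same trick, using only the standard library rather than the metrics registry:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// reportedStart flips to true on the first report, so the first value after a
// (re)start is 0 and every later value is 1.
var reportedStart atomic.Bool

func infoValue() float64 {
	// Report as 0 the first time in order to detect (re)starts.
	if reportedStart.CompareAndSwap(false, true) {
		return 0
	}
	return 1
}

func main() {
	fmt.Println(infoValue()) // 0 - first report after start
	fmt.Println(infoValue()) // 1
	fmt.Println(infoValue()) // 1
}
```
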
@@ -5,7 +5,7 @@ import (
 	"github.com/safing/portbase/log"
 )

-func registeLogMetrics() (err error) {
+func registerLogMetrics() (err error) {
 	_, err = NewFetchingCounter(
 		"logs/warning/total",
 		nil,

@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"sort"
 	"sync"
-	"time"

 	"github.com/safing/portbase/modules"
 )

@@ -59,11 +58,11 @@ func start() error {
 		return err
 	}

-	if err := registeHostMetrics(); err != nil {
+	if err := registerHostMetrics(); err != nil {
 		return err
 	}

-	if err := registeLogMetrics(); err != nil {
+	if err := registerLogMetrics(); err != nil {
 		return err
 	}

@@ -82,16 +81,13 @@ func stop() error {
 	// Wait until the metrics pusher is done, as it may have started reporting
 	// and may report a higher number than we store to disk. For persistent
 	// metrics it can then happen that the first report is lower than the
-	// previous report, making prometheus think that al that happened since the
+	// previous report, making prometheus think that all that happened since the
 	// last report, due to the automatic restart detection.
-	done := metricsPusherDone.NewFlag()
-	done.Refresh()
-	if !done.IsSet() {
-		select {
-		case <-done.Signal():
-		case <-time.After(10 * time.Second):
-		}
-	}

+	// The registry is read locked when writing metrics.
+	// Write lock the registry to make sure all writes are finished.
+	registryLock.Lock()
+	registryLock.Unlock() //nolint:staticcheck
+
 	storePersistentMetrics()

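The Lock/Unlock pair in stop() is not protecting a critical section; it is a barrier. Metric writers hold the registry's read lock while reporting, so acquiring and immediately releasing the write lock guarantees that every in-flight report has finished before the persistent values are stored. A small, self-contained sketch of that pattern (only the registryLock name is taken from the code above, the rest is illustrative):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

var registryLock sync.RWMutex

// reportMetrics stands in for the metrics pusher: it holds a read lock while
// it writes out the current values.
func reportMetrics(id int) {
	registryLock.RLock()
	defer registryLock.RUnlock()
	time.Sleep(100 * time.Millisecond)
	fmt.Println("report", id, "done")
}

func main() {
	for i := 0; i < 3; i++ {
		go reportMetrics(i)
	}
	time.Sleep(10 * time.Millisecond)

	// Write-lock the registry to make sure all in-flight reports are
	// finished, then release it immediately - the lock is only a barrier.
	registryLock.Lock()
	registryLock.Unlock() //nolint:staticcheck
	fmt.Println("all reports finished, safe to persist metrics")
}
```
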
@@ -120,6 +116,10 @@ func register(m Metric) error {
 	// Set flag that first metric is now registered.
 	firstMetricRegistered = true

+	if module.Status() < modules.StatusStarting {
+		return fmt.Errorf("registering metric %q too early", m.ID())
+	}
+
 	return nil
 }

@@ -25,7 +25,7 @@ var (
 	})

 	// ErrAlreadyInitialized is returned when trying to initialize an option
-	// more than once.
+	// more than once or if the time window for initializing is over.
 	ErrAlreadyInitialized = errors.New("already initialized")
 )

@@ -55,7 +55,7 @@ func EnableMetricPersistence(key string) error {

 	// Load metrics from storage.
 	var err error
-	storage, err = getMetricsStorage(key)
+	storage, err = getMetricsStorage(storageKey)
 	switch {
 	case err == nil:
 		// Continue.

@@ -57,10 +57,11 @@ type Module struct { //nolint:maligned
 	// start
 	startComplete chan struct{}
 	// stop
-	Ctx          context.Context
-	cancelCtx    func()
-	stopFlag     *abool.AtomicBool
-	stopComplete chan struct{}
+	Ctx           context.Context
+	cancelCtx     func()
+	stopFlag      *abool.AtomicBool
+	stopCompleted *abool.AtomicBool
+	stopComplete  chan struct{}

 	// workers/tasks
 	ctrlFuncRunning *abool.AtomicBool

@@ -255,12 +256,10 @@ func (m *Module) checkIfStopComplete() {
 		atomic.LoadInt32(m.taskCnt) == 0 &&
 		atomic.LoadInt32(m.microTaskCnt) == 0 {
-
-		m.Lock()
-		defer m.Unlock()
-
-		if m.stopComplete != nil {
+		if m.stopCompleted.SetToIf(false, true) {
+			m.Lock()
+			defer m.Unlock()
 			close(m.stopComplete)
 			m.stopComplete = nil
 		}
 	}
 }

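checkIfStopComplete can now be reached from several places (the stop function's goroutine, finishing workers and tasks) without racing to close the channel twice: the atomic stopCompleted flag lets exactly one caller through. The same idea in a minimal, standard-library-only sketch, with CompareAndSwap standing in for abool's SetToIf:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// closeOnce closes done exactly once, no matter how many goroutines race to
// call it - the CompareAndSwap plays the role of stopCompleted.SetToIf above.
func closeOnce(closed *atomic.Bool, done chan struct{}) {
	if closed.CompareAndSwap(false, true) {
		close(done)
	}
}

func main() {
	var closed atomic.Bool
	done := make(chan struct{})

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			closeOnce(&closed, done) // only one caller actually closes
		}()
	}
	wg.Wait()

	<-done
	fmt.Println("done channel closed exactly once")
}
```
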
@@ -283,60 +282,56 @@ func (m *Module) stop(reports chan *report) {
 	// Reset start/stop signal channels.
 	m.startComplete = make(chan struct{})
 	m.stopComplete = make(chan struct{})
+	m.stopCompleted.SetTo(false)

-	// Make a copy of the stop channel.
-	stopComplete := m.stopComplete
-
-	// Set status and cancel context.
+	// Set status.
 	m.status = StatusStopping
-	m.stopFlag.Set()
-	m.cancelCtx()

-	go m.stopAllTasks(reports, stopComplete)
+	go m.stopAllTasks(reports)
 }

-func (m *Module) stopAllTasks(reports chan *report, stopComplete chan struct{}) {
-	// start shutdown function
-	var stopFnError error
-	stopFuncRunning := abool.New()
-	if m.stopFn != nil {
-		stopFuncRunning.Set()
-		go func() {
-			stopFnError = m.runCtrlFn("stop module", m.stopFn)
-			stopFuncRunning.UnSet()
-			m.checkIfStopComplete()
-		}()
-	} else {
-		m.checkIfStopComplete()
-	}
+func (m *Module) stopAllTasks(reports chan *report) {
+	// Manually set the control function flag in order to stop completion by race
+	// condition before stop function has even started.
+	m.ctrlFuncRunning.Set()
+
+	// Set stop flag for everyone checking this flag before we activate any stop trigger.
+	m.stopFlag.Set()
+
+	// Cancel the context to notify all workers and tasks.
+	m.cancelCtx()
+
+	// Start stop function.
+	stopFnError := m.startCtrlFn("stop module", m.stopFn)

 	// wait for results
 	select {
-	case <-stopComplete:
-	// case <-time.After(moduleStopTimeout):
+	case <-m.stopComplete:
+		// Complete!
 	case <-time.After(moduleStopTimeout):
 		log.Warningf(
-			"%s: timed out while waiting for stopfn/workers/tasks to finish: stopFn=%v/%v workers=%d tasks=%d microtasks=%d, continuing shutdown...",
+			"%s: timed out while waiting for stopfn/workers/tasks to finish: stopFn=%v workers=%d tasks=%d microtasks=%d, continuing shutdown...",
 			m.Name,
-			stopFuncRunning.IsSet(), m.ctrlFuncRunning.IsSet(),
+			m.ctrlFuncRunning.IsSet(),
 			atomic.LoadInt32(m.workerCnt),
 			atomic.LoadInt32(m.taskCnt),
 			atomic.LoadInt32(m.microTaskCnt),
 		)
 	}

-	// collect error
+	// Check for stop fn status.
 	var err error
-	if stopFuncRunning.IsNotSet() && stopFnError != nil {
-		err = stopFnError
-	}
-	// set status
-	if err != nil {
-		m.Error(
-			fmt.Sprintf("%s:stop-failed", m.Name),
-			fmt.Sprintf("Stopping module %s failed", m.Name),
-			fmt.Sprintf("Failed to stop module: %s", err.Error()),
-		)
+	select {
+	case err = <-stopFnError:
+		if err != nil {
+			// Set error as module error.
+			m.Error(
+				fmt.Sprintf("%s:stop-failed", m.Name),
+				fmt.Sprintf("Stopping module %s failed", m.Name),
+				fmt.Sprintf("Failed to stop module: %s", err.Error()),
+			)
+		}
+	default:
 	}

 	// Always set to offline in order to let other modules shutdown in order.

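After waiting for completion (or hitting the timeout), the new code drains the stop function's result with a non-blocking select: if the stop function is still running after moduleStopTimeout, shutdown simply continues instead of blocking on the channel. A stand-alone sketch of that select/default pattern, with illustrative values:

```go
package main

import (
	"errors"
	"fmt"
)

func main() {
	// stopFnError is buffered, like the channel returned by startCtrlFn, so
	// the stop function can report its result even if nobody is waiting yet.
	stopFnError := make(chan error, 1)
	stopFnError <- errors.New("stop failed: example error")

	// Non-blocking check: if the stop function has not reported yet (for
	// example after a timeout), shutdown simply continues.
	select {
	case err := <-stopFnError:
		if err != nil {
			fmt.Println("stop function reported:", err)
		}
	default:
		fmt.Println("stop function still running, continuing shutdown")
	}
}
```
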
@@ -384,7 +379,7 @@ func initNewModule(name string, prep, start, stop func() error, dependencies ...
 		Name:                name,
 		enabled:             abool.NewBool(false),
 		enabledAsDependency: abool.NewBool(false),
-		sleepMode:           abool.NewBool(true),
+		sleepMode:           abool.NewBool(true), // Change (for init) is triggered below.
 		sleepWaitingChannel: make(chan time.Time),
 		prepFn:              prep,
 		startFn:             start,

@@ -393,6 +388,7 @@ func initNewModule(name string, prep, start, stop func() error, dependencies ...
 		Ctx:             ctx,
 		cancelCtx:       cancelCtx,
 		stopFlag:        abool.NewBool(false),
+		stopCompleted:   abool.NewBool(true),
 		ctrlFuncRunning: abool.NewBool(false),
 		workerCnt:       &workerCnt,
 		taskCnt:         &taskCnt,

@@ -401,7 +397,7 @@ func initNewModule(name string, prep, start, stop func() error, dependencies ...
 		depNames:        dependencies,
 	}

-	// Sleep mode is disabled by default
+	// Sleep mode is disabled by default.
 	newModule.Sleep(false)

 	return newModule

@@ -53,6 +53,7 @@ func (m *Module) RunWorker(name string, fn func(context.Context) error) error {
 }

 // StartServiceWorker starts a generic worker, which is automatically restarted in case of an error. A call to StartServiceWorker runs the service-worker in a new goroutine and returns immediately. `backoffDuration` specifies how to long to wait before restarts, multiplied by the number of failed attempts. Pass `0` for the default backoff duration. For custom error remediation functionality, build your own error handling procedure using calls to RunWorker.
+// Returning nil error or context.Canceled will stop the service worker.
 func (m *Module) StartServiceWorker(name string, backoffDuration time.Duration, fn func(context.Context) error) {
 	if m == nil {
 		log.Errorf(`modules: cannot start service worker "%s" with nil module`, name)

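The added doc line pins down the contract: a nil return or context.Canceled ends the service worker, anything else restarts it with backoff. A hypothetical usage sketch under those rules (cleanCache and startWorkers are made up for the example; only the StartServiceWorker signature is taken from the code above):

```go
package example

import (
	"context"
	"time"

	"github.com/safing/portbase/modules"
)

// cleanCache is a hypothetical helper used only for this example.
func cleanCache() error { return nil }

// startWorkers is a hypothetical caller; it assumes a *modules.Module that
// was obtained elsewhere.
func startWorkers(module *modules.Module) {
	// Pass 0 to use the default backoff duration. Returning nil or
	// context.Canceled ends the service worker; any other error makes the
	// module restart it after the backoff.
	module.StartServiceWorker("cache cleaner", 0, func(ctx context.Context) error {
		select {
		case <-time.After(time.Minute):
			return cleanCache()
		case <-ctx.Done():
			return nil
		}
	})
}
```
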
@@ -81,34 +82,36 @@ func (m *Module) runServiceWorker(name string, backoffDuration time.Duration, fn
 		}

 		err := m.runWorker(name, fn)
-		if err != nil {
-			if !errors.Is(err, ErrRestartNow) {
-				// reset fail counter if running without error for some time
-				if time.Now().Add(-5 * time.Minute).After(lastFail) {
-					failCnt = 0
-				}
-				// increase fail counter and set last failed time
-				failCnt++
-				lastFail = time.Now()
-				// log error
-				sleepFor := time.Duration(failCnt) * backoffDuration
-				if errors.Is(err, context.Canceled) {
-					log.Debugf("%s: service-worker %s was canceled (%d): %s - restarting in %s", m.Name, name, failCnt, err, sleepFor)
-				} else {
-					log.Errorf("%s: service-worker %s failed (%d): %s - restarting in %s", m.Name, name, failCnt, err, sleepFor)
-				}
-				select {
-				case <-time.After(sleepFor):
-				case <-m.Ctx.Done():
-					return
-				}
-				// loop to restart
-			} else {
-				log.Infof("%s: service-worker %s %s - restarting now", m.Name, name, err)
-			}
-		} else {
-			// finish
+		switch {
+		case err == nil:
+			// No error means that the worker is finished.
 			return
+
+		case errors.Is(err, context.Canceled):
+			// A canceled context also means that the worker is finished.
+			return
+
+		case errors.Is(err, ErrRestartNow):
+			// Worker requested a restart - silently continue with loop.
+
+		default:
+			// Any other errors triggers a restart with backoff.
+
+			// Reset fail counter if running without error for some time.
+			if time.Now().Add(-5 * time.Minute).After(lastFail) {
+				failCnt = 0
+			}
+			// Increase fail counter and set last failed time.
+			failCnt++
+			lastFail = time.Now()
+			// Log error and back off for some time.
+			sleepFor := time.Duration(failCnt) * backoffDuration
+			log.Errorf("%s: service-worker %s failed (%d): %s - restarting in %s", m.Name, name, failCnt, err, sleepFor)
+			select {
+			case <-time.After(sleepFor):
+			case <-m.Ctx.Done():
+				return
+			}
+		}
 	}
 }

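The backoff in the default case stays linear: sleepFor is the failure count multiplied by backoffDuration, and the counter resets once the worker has run for five minutes without failing. A tiny sketch of how the wait grows, assuming an illustrative 5-second base:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Linear backoff as in runServiceWorker: the wait grows with the number
	// of consecutive failures (here with an assumed 5s base).
	backoffDuration := 5 * time.Second
	for failCnt := 1; failCnt <= 4; failCnt++ {
		sleepFor := time.Duration(failCnt) * backoffDuration
		fmt.Printf("failure %d: restarting in %s\n", failCnt, sleepFor)
	}
	// Prints 5s, 10s, 15s, 20s - and the counter resets after five minutes
	// without an error.
}
```
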
@@ -132,10 +135,7 @@ func (m *Module) runWorker(name string, fn func(context.Context) error) (err err
 }

 func (m *Module) runCtrlFnWithTimeout(name string, timeout time.Duration, fn func() error) error {
-	stopFnError := make(chan error)
-	go func() {
-		stopFnError <- m.runCtrlFn(name, fn)
-	}()
+	stopFnError := m.startCtrlFn(name, fn)

 	// wait for results
 	select {

@@ -146,26 +146,44 @@ func (m *Module) runCtrlFnWithTimeout(name string, timeout time.Duration, fn fun
 	}
 }

-func (m *Module) runCtrlFn(name string, fn func() error) (err error) {
+func (m *Module) startCtrlFn(name string, fn func() error) chan error {
+	ctrlFnError := make(chan error, 1)
+
+	// If no function is given, still act as if it was run.
 	if fn == nil {
-		return
+		// Signal finish.
+		m.ctrlFuncRunning.UnSet()
+		m.checkIfStopComplete()
+
+		// Report nil error and return.
+		ctrlFnError <- nil
+		return ctrlFnError
 	}

-	if m.ctrlFuncRunning.SetToIf(false, true) {
-		defer m.ctrlFuncRunning.SetToIf(true, false)
-	}
+	// Signal that a control function is running.
+	m.ctrlFuncRunning.Set()

-	defer func() {
-		// recover from panic
-		panicVal := recover()
-		if panicVal != nil {
-			me := m.NewPanicError(name, "module-control", panicVal)
-			me.Report()
-			err = me
-		}
+	// Start control function in goroutine.
+	go func() {
+		// Recover from panic and reset control function signal.
+		defer func() {
+			// recover from panic
+			panicVal := recover()
+			if panicVal != nil {
+				me := m.NewPanicError(name, "module-control", panicVal)
+				me.Report()
+				ctrlFnError <- fmt.Errorf("panic: %s", panicVal)
+			}
+
+			// Signal finish.
+			m.ctrlFuncRunning.UnSet()
+			m.checkIfStopComplete()
+		}()
+
+		// Run control function and report error.
+		err := fn()
+		ctrlFnError <- err
 	}()

-	// run
-	err = fn()
-	return
+	return ctrlFnError
 }

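startCtrlFn replaces the synchronous runCtrlFn with a fire-and-report pattern: the control function runs in its own goroutine and delivers its result on a channel with a buffer of one, so the goroutine can always send and exit even if the caller stopped waiting after a timeout. A simplified, stand-alone sketch of that shape (not the actual module plumbing around it):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// startCtrlFnSketch mirrors the shape of startCtrlFn above: it runs fn in a
// goroutine and reports the result on a buffered channel. The buffer of 1 is
// what lets the goroutine exit even if the caller stopped listening.
func startCtrlFnSketch(fn func() error) chan error {
	errs := make(chan error, 1)
	go func() {
		errs <- fn()
	}()
	return errs
}

func main() {
	errs := startCtrlFnSketch(func() error {
		time.Sleep(200 * time.Millisecond)
		return errors.New("slow control function failed")
	})

	select {
	case err := <-errs:
		fmt.Println("finished in time:", err)
	case <-time.After(50 * time.Millisecond):
		fmt.Println("timed out, continuing - the goroutine can still send and exit")
	}

	// Later (or never): the buffered result is still available.
	time.Sleep(300 * time.Millisecond)
	fmt.Println("late result:", <-errs)
}
```
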