Use module counters instead of waitgroup when shutting down

Daniel 2021-09-26 13:42:26 +02:00
parent 92169d826d
commit 483cbad600
4 changed files with 57 additions and 38 deletions
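In short, this commit drops the module's sync.WaitGroup: every place that previously did waitGroup.Add/Done now relies on the existing atomic workerCnt/taskCnt/microTaskCnt counters, and each decrement calls the new checkIfStopComplete(), which closes a per-stop stopComplete channel once the stop flag is set, no ctrl function is running, and all three counters are zero. stopAllTasks() then selects on that channel instead of on WaitGroup.Wait(). A minimal, self-contained sketch of the pattern with a single counter (illustrative names, not the real modules API):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// tracker is an illustrative, stripped-down version of the pattern this
// commit introduces: atomic counters plus a close-once channel instead
// of a sync.WaitGroup.
type tracker struct {
	sync.Mutex
	workerCnt    int32
	stopping     int32         // 1 once shutdown has begun
	stopComplete chan struct{} // closed when everything has finished
}

func (t *tracker) startWorker() { atomic.AddInt32(&t.workerCnt, 1) }

func (t *tracker) finishWorker() {
	atomic.AddInt32(&t.workerCnt, -1)
	t.checkIfStopComplete() // every decrement may be the last one
}

// checkIfStopComplete mirrors Module.checkIfStopComplete below: once
// shutdown has begun and the counter is zero, close the channel exactly once.
func (t *tracker) checkIfStopComplete() {
	if atomic.LoadInt32(&t.stopping) == 1 &&
		atomic.LoadInt32(&t.workerCnt) == 0 {
		t.Lock()
		defer t.Unlock()
		if t.stopComplete != nil {
			close(t.stopComplete)
			t.stopComplete = nil
		}
	}
}

func main() {
	t := &tracker{stopComplete: make(chan struct{})}
	stopComplete := t.stopComplete // snapshot before it is nil-ed out

	for i := 0; i < 3; i++ {
		t.startWorker()
		go func() {
			defer t.finishWorker()
			time.Sleep(10 * time.Millisecond) // simulated work
		}()
	}

	atomic.StoreInt32(&t.stopping, 1)
	t.checkIfStopComplete() // in case all workers already finished

	select {
	case <-stopComplete:
		fmt.Println("all workers done")
	case <-time.After(time.Second):
		fmt.Println("timed out, continuing shutdown anyway")
	}
}

The mutex plus nil-check makes the close idempotent, which matters because every finishing worker, task, and microtask calls checkIfStopComplete and several may observe zero counters at once.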

View file

@@ -130,7 +130,6 @@ func (m *Module) runMicroTask(name *string, fn func(context.Context) error) (err
 	// start for module
 	// hint: only microTasks global var is important for scheduling, others can be set here
 	atomic.AddInt32(m.microTaskCnt, 1)
-	m.waitGroup.Add(1)
 
 	// set up recovery
 	defer func() {
@@ -145,7 +144,7 @@ func (m *Module) runMicroTask(name *string, fn func(context.Context) error) (err
 	// finish for module
 	atomic.AddInt32(m.microTaskCnt, -1)
-	m.waitGroup.Done()
+	m.checkIfStopComplete()
 
 	// finish and possibly trigger next task
 	atomic.AddInt32(microTasks, -1)

View file

@@ -52,15 +52,16 @@ type Module struct { //nolint:maligned // not worth the effort
 	// start
 	startComplete chan struct{}
 	// stop
-	Ctx       context.Context
-	cancelCtx func()
-	stopFlag  *abool.AtomicBool
+	Ctx          context.Context
+	cancelCtx    func()
+	stopFlag     *abool.AtomicBool
+	stopComplete chan struct{}
 	// workers/tasks
-	workerCnt    *int32
-	taskCnt      *int32
-	microTaskCnt *int32
-	waitGroup    sync.WaitGroup
+	ctrlFuncRunning *abool.AtomicBool
+	workerCnt       *int32
+	taskCnt         *int32
+	microTaskCnt    *int32
 	// events
 	eventHooks map[string]*eventHooks
@@ -205,6 +206,23 @@ func (m *Module) start(reports chan *report) {
 	}()
 }
 
+func (m *Module) checkIfStopComplete() {
+	if m.stopFlag.IsSet() &&
+		m.ctrlFuncRunning.IsNotSet() &&
+		atomic.LoadInt32(m.workerCnt) == 0 &&
+		atomic.LoadInt32(m.taskCnt) == 0 &&
+		atomic.LoadInt32(m.microTaskCnt) == 0 {
+
+		m.Lock()
+		defer m.Unlock()
+		if m.stopComplete != nil {
+			close(m.stopComplete)
+			m.stopComplete = nil
+		}
+	}
+}
+
 func (m *Module) stop(reports chan *report) {
 	// check and set intermediate status
 	m.Lock()
@@ -218,47 +236,48 @@ func (m *Module) stop(reports chan *report) {
 		}()
 		return
 	}
-	m.status = StatusStopping
 
-	// reset start management
+	// Reset start/stop signal channels.
 	m.startComplete = make(chan struct{})
+	m.stopComplete = make(chan struct{})
 
-	// init stop management
-	m.cancelCtx()
+	// Make a copy of the stop channel.
+	stopComplete := m.stopComplete
+
+	// Set status and cancel context.
+	m.status = StatusStopping
 	m.stopFlag.Set()
+	m.cancelCtx()
 	m.Unlock()
 
-	go m.stopAllTasks(reports)
+	go m.stopAllTasks(reports, stopComplete)
 }
 
-func (m *Module) stopAllTasks(reports chan *report) {
+func (m *Module) stopAllTasks(reports chan *report, stopComplete chan struct{}) {
 	// start shutdown function
-	stopFnFinished := abool.NewBool(false)
 	var stopFnError error
+	stopFuncRunning := abool.New()
 	if m.stopFn != nil {
-		m.waitGroup.Add(1)
+		stopFuncRunning.Set()
 		go func() {
 			stopFnError = m.runCtrlFn("stop module", m.stopFn)
-			stopFnFinished.Set()
-			m.waitGroup.Done()
+			stopFuncRunning.UnSet()
+			m.checkIfStopComplete()
 		}()
+	} else {
+		m.checkIfStopComplete()
 	}
 
-	// wait for workers and stop fn
-	done := make(chan struct{})
-	go func() {
-		m.waitGroup.Wait()
-		close(done)
-	}()
-
 	// wait for results
 	select {
-	case <-done:
-	case <-time.After(moduleStopTimeout):
+	case <-stopComplete:
+	// case <-time.After(moduleStopTimeout):
+	case <-time.After(3 * time.Second):
 		log.Warningf(
-			"%s: timed out while waiting for stopfn/workers/tasks to finish: stopFn=%v workers=%d tasks=%d microtasks=%d, continuing shutdown...",
+			"%s: timed out while waiting for stopfn/workers/tasks to finish: stopFn=%v/%v workers=%d tasks=%d microtasks=%d, continuing shutdown...",
 			m.Name,
-			stopFnFinished.IsSet(),
+			stopFuncRunning.IsSet(), m.ctrlFuncRunning.IsSet(),
 			atomic.LoadInt32(m.workerCnt),
 			atomic.LoadInt32(m.taskCnt),
 			atomic.LoadInt32(m.microTaskCnt),
@@ -267,7 +286,7 @@ func (m *Module) stopAllTasks(reports chan *report) {
 	// collect error
 	var err error
-	if stopFnFinished.IsSet() && stopFnError != nil {
+	if stopFuncRunning.IsNotSet() && stopFnError != nil {
 		err = stopFnError
 	}
 
 	// set status
@@ -328,10 +347,10 @@ func initNewModule(name string, prep, start, stop func() error, dependencies ...
 		Ctx:             ctx,
 		cancelCtx:       cancelCtx,
 		stopFlag:        abool.NewBool(false),
+		ctrlFuncRunning: abool.NewBool(false),
 		workerCnt:       &workerCnt,
 		taskCnt:         &taskCnt,
 		microTaskCnt:    &microTaskCnt,
-		waitGroup:       sync.WaitGroup{},
 		eventHooks:      make(map[string]*eventHooks),
 		depNames:        dependencies,
 	}
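A detail worth noting in stop() above: the new stopComplete channel is snapshotted into a local variable while the module lock is still held, and that snapshot is what gets passed to stopAllTasks(). This is necessary because checkIfStopComplete() nils out m.stopComplete after closing it, and a later start/stop cycle replaces the channel, so waiting on the field itself would race. A hypothetical stand-in type to illustrate just that detail:

package main

import (
	"fmt"
	"sync"
)

// module is a hypothetical stand-in for the real Module type, only to
// illustrate the snapshot detail in stop() above.
type module struct {
	sync.Mutex
	stopComplete chan struct{}
}

// beginStop creates the channel and returns a snapshot of it while the
// lock is still held, as stop() does above.
func (m *module) beginStop() chan struct{} {
	m.Lock()
	defer m.Unlock()
	m.stopComplete = make(chan struct{})
	return m.stopComplete // snapshot taken while still locked
}

// finishStop mirrors checkIfStopComplete: it closes the channel once and
// nils the field, so later reads of m.stopComplete see nil.
func (m *module) finishStop() {
	m.Lock()
	defer m.Unlock()
	if m.stopComplete != nil {
		close(m.stopComplete)
		m.stopComplete = nil
	}
}

func main() {
	m := &module{}
	snapshot := m.beginStop()
	m.finishStop()

	<-snapshot // returns immediately: snapshot still points at the closed channel
	fmt.Println(m.stopComplete == nil) // true; <-m.stopComplete would block forever
}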

View file

@@ -330,7 +330,6 @@ func (t *Task) executeWithLocking() {
 	// start for module
 	// hint: only queueWg global var is important for scheduling, others can be set here
 	atomic.AddInt32(t.module.taskCnt, 1)
-	t.module.waitGroup.Add(1)
 
 	defer func() {
 		// recover from panic
@@ -343,7 +342,7 @@ func (t *Task) executeWithLocking() {
 	// finish for module
 	atomic.AddInt32(t.module.taskCnt, -1)
-	t.module.waitGroup.Done()
+	t.module.checkIfStopComplete()
 
 	t.lock.Lock()

View file

@@ -39,10 +39,9 @@ func (m *Module) RunWorker(name string, fn func(context.Context) error) error {
 	}
 
 	atomic.AddInt32(m.workerCnt, 1)
-	m.waitGroup.Add(1)
 	defer func() {
 		atomic.AddInt32(m.workerCnt, -1)
-		m.waitGroup.Done()
+		m.checkIfStopComplete()
 	}()
 
 	return m.runWorker(name, fn)
@@ -60,10 +59,9 @@ func (m *Module) StartServiceWorker(name string, backoffDuration time.Duration,
 func (m *Module) runServiceWorker(name string, backoffDuration time.Duration, fn func(context.Context) error) {
 	atomic.AddInt32(m.workerCnt, 1)
-	m.waitGroup.Add(1)
 	defer func() {
 		atomic.AddInt32(m.workerCnt, -1)
-		m.waitGroup.Done()
+		m.checkIfStopComplete()
 	}()
 
 	if backoffDuration == 0 {
@@ -143,6 +141,10 @@ func (m *Module) runCtrlFn(name string, fn func() error) (err error) {
 		return
 	}
 
+	if m.ctrlFuncRunning.SetToIf(false, true) {
+		defer m.ctrlFuncRunning.SetToIf(true, false)
+	}
+
 	defer func() {
 		// recover from panic
 		panicVal := recover()
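The guard added to runCtrlFn() above is a compare-and-swap: abool's SetToIf(false, true) flips the flag only if it was unset and reports whether it did, so only the outermost ctrl function call arms ctrlFuncRunning, and only that call's deferred SetToIf(true, false) releases it. A tiny sketch of the idiom, assuming the github.com/tevino/abool package that the abool identifiers in this diff appear to come from:

package main

import (
	"fmt"

	"github.com/tevino/abool"
)

func main() {
	running := abool.New()

	// The first caller wins the CAS and owns the flag...
	if running.SetToIf(false, true) {
		defer running.SetToIf(true, false) // ...and releases it on return
	}

	// A nested or concurrent caller fails the CAS and leaves the flag alone.
	fmt.Println(running.SetToIf(false, true)) // false: already set
	fmt.Println(running.IsSet())              // true: still owned by the first caller
}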