Fix netenv resolver interaction

Also fix endless loop in tcp resolver when network is down
Daniel 2020-06-26 22:50:35 +02:00
parent 111e324d26
commit 881a757667
4 changed files with 71 additions and 11 deletions

View file

@@ -106,12 +106,6 @@ func handleRequestAsWorker(w dns.ResponseWriter, query *dns.Msg) {
}
func handleRequest(ctx context.Context, w dns.ResponseWriter, query *dns.Msg) error { //nolint:gocognit // TODO
// return with server failure if offline
if netenv.GetOnlineStatus() == netenv.StatusOffline {
returnServerFailure(w, query)
return nil
}
// only process first question, that's how everyone does it.
question := query.Question[0]
q := &resolver.Query{
@@ -119,6 +113,14 @@ func handleRequest(ctx context.Context, w dns.ResponseWriter, query *dns.Msg) er
QType: dns.Type(question.Qtype),
}
// return with server failure if offline
if netenv.GetOnlineStatus() == netenv.StatusOffline &&
!netenv.IsOnlineStatusTestDomain(q.FQDN) {
log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN)
returnServerFailure(w, query)
return nil
}
// check class
if question.Qclass != dns.ClassINET {
// we only serve IN records, return nxdomain
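
The point of the reordering above: the offline check now runs after the Query struct is built, so the FQDN is available and netenv's own connectivity-test domains can still resolve while the device is considered offline. A minimal sketch of the combined condition follows; the helper name shouldRefuseQuery is ours for illustration only, the real handler inlines the check.

package main

import (
	"fmt"

	"github.com/safing/portmaster/netenv"
)

// shouldRefuseQuery mirrors the reordered check: refuse only when the device
// is offline AND the queried domain is not one of the domains netenv itself
// uses to probe connectivity. Those probes must go through, otherwise the
// online status could never recover from StatusOffline.
func shouldRefuseQuery(fqdn string) bool {
	return netenv.GetOnlineStatus() == netenv.StatusOffline &&
		!netenv.IsOnlineStatusTestDomain(fqdn)
}

func main() {
	// "example.com." stands in for an ordinary query; which domains count as
	// test domains is decided by netenv, not by this sketch.
	fmt.Println(shouldRefuseQuery("example.com."))
}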

View file

@@ -106,6 +106,8 @@ var (
captivePortalURL string
captivePortalLock sync.Mutex
waitForever = make(chan time.Time)
)
func init() {
@@ -200,12 +202,14 @@ func triggerOnlineStatusInvestigation() {
}
func monitorOnlineStatus(ctx context.Context) error {
triggerOnlineStatusInvestigation()
for {
// wait for trigger
select {
case <-ctx.Done():
return nil
case <-onlineStatusInvestigationTrigger:
case <-getDynamicStatusTrigger():
}
// enable waiting
@@ -221,6 +225,21 @@ func monitorOnlineStatus(ctx context.Context) error {
}
}
func getDynamicStatusTrigger() <-chan time.Time {
switch GetOnlineStatus() {
case StatusOffline:
return time.After(10 * time.Second)
case StatusLimited, StatusPortal:
return time.After(1 * time.Minute)
case StatusSemiOnline:
return time.After(5 * time.Minute)
case StatusOnline:
return waitForever
default: // unknown status
return time.After(5 * time.Minute)
}
}
func checkOnlineStatus(ctx context.Context) {
// TODO: implement more methods
/*status, err := getConnectivityStateFromDbus()
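
The waitForever channel above is never written to or closed, so a select case receiving from it never fires; that is how periodic re-checks are effectively disabled while the status is StatusOnline, leaving only the explicit investigation trigger. A standalone sketch of the pattern, with the project's status constants replaced by plain strings:

package main

import (
	"fmt"
	"time"
)

// waitForever is never sent to or closed, so receiving from it blocks forever.
// A select case on it is therefore effectively disabled.
var waitForever = make(chan time.Time)

// nextCheck returns a timer channel using the intervals from the diff:
// re-check quickly while offline, slowly while limited, never while online.
func nextCheck(status string) <-chan time.Time {
	switch status {
	case "offline":
		return time.After(10 * time.Second)
	case "limited", "portal":
		return time.After(1 * time.Minute)
	case "semi-online":
		return time.After(5 * time.Minute)
	case "online":
		return waitForever // never fires
	default:
		return time.After(5 * time.Minute)
	}
}

func main() {
	trigger := make(chan struct{}, 1)
	trigger <- struct{}{} // simulate an explicit investigation trigger

	select {
	case <-trigger:
		fmt.Println("woken by the explicit trigger")
	case <-nextCheck("online"):
		fmt.Println("woken by the timer (cannot happen while online)")
	}
}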

View file

@@ -7,6 +7,8 @@ import (
"sync"
"time"
"github.com/safing/portmaster/netenv"
"github.com/miekg/dns"
"github.com/safing/portbase/database"
@@ -24,6 +26,10 @@ var (
ErrLocalhost = errors.New("query for localhost")
// ErrTimeout is returned when a query times out
ErrTimeout = errors.New("query timed out")
// ErrOffline is returned when no network connection is detected
ErrOffline = errors.New("device is offline")
// ErrFailure is returned when the type of failure is unclear
ErrFailure = errors.New("query failed")
// detailed errors
@@ -213,13 +219,24 @@ func deduplicateRequest(ctx context.Context, q *Query) (finishRequest func()) {
}
}
func resolveAndCache(ctx context.Context, q *Query) (rrCache *RRCache, err error) {
func resolveAndCache(ctx context.Context, q *Query) (rrCache *RRCache, err error) { //nolint:gocognit
// get resolvers
resolvers := GetResolversInScope(ctx, q)
if len(resolvers) == 0 {
return nil, ErrNoCompliance
}
// check if we are online
if netenv.GetOnlineStatus() == netenv.StatusOffline {
if netenv.IsOnlineStatusTestDomain(q.FQDN) {
log.Tracer(ctx).Debugf("resolver: permitting online status test domain %s to resolve even though offline", q.FQDN)
} else {
log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN)
// we are offline and this is not an online check query
return nil, ErrOffline
}
}
// start resolving
var i int
@@ -246,6 +263,11 @@ resolveLoop:
case errors.Is(err, ErrBlocked):
// some resolvers might also block
return nil, err
case netenv.GetOnlineStatus() == netenv.StatusOffline &&
!netenv.IsOnlineStatusTestDomain(q.FQDN):
log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN)
// we are offline and this is not an online check query
return nil, ErrOffline
}
} else {
// no error
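
ErrOffline and ErrFailure are plain sentinel errors, so callers outside the package can tell them apart with errors.Is. A sketch of such a caller; the function and messages are ours, and the import path is assumed to be github.com/safing/portmaster/resolver.

package main

import (
	"errors"
	"fmt"

	"github.com/safing/portmaster/resolver"
)

// describeResolveError is illustrative only: it shows how the new sentinel
// errors can be distinguished from other resolver errors by a caller.
func describeResolveError(err error) string {
	switch {
	case errors.Is(err, resolver.ErrOffline):
		return "device is offline, retry once connectivity returns"
	case errors.Is(err, resolver.ErrFailure):
		return "query failed for an unclear reason"
	default:
		return fmt.Sprintf("other resolver error: %v", err)
	}
}

func main() {
	fmt.Println(describeResolveError(resolver.ErrOffline))
}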

View file

@@ -93,11 +93,9 @@ func (tr *TCPResolver) client(workerCtx context.Context) error { //nolint:gocogn
var cancelConnCtx func()
var recycleConn bool
var shuttingDown bool
var failCnt int
var incoming = make(chan *dns.Msg, 100)
// enable client restarting after crash
defer tr.clientStarted.UnSet()
connMgmt:
for {
// cleanup old connection
@@ -111,7 +109,7 @@ connMgmt:
}
// check if we are shutting down or failing
if shuttingDown || tr.IsFailing() {
if shuttingDown || failCnt >= FailThreshold || tr.IsFailing() {
// reply to all waiting queries
tr.Lock()
for id, inFlight := range tr.inFlightQueries {
@@ -181,7 +179,12 @@ connMgmt:
c, err := tr.dnsClient.Dial(tr.resolver.ServerAddress)
if err != nil {
tr.ReportFailure()
failCnt++
if tr.IsFailing() {
shuttingDown = true
}
log.Debugf("resolver: failed to connect to %s (%s)", tr.resolver.Name, tr.resolver.ServerAddress)
netenv.ReportFailedConnection()
continue connMgmt
}
tr.dnsConnection = c
@@ -208,6 +211,10 @@ connMgmt:
if connClosing.SetToIf(false, true) {
cancelConnCtx()
tr.ReportFailure()
failCnt++
if tr.IsFailing() {
shuttingDown = true
}
log.Warningf("resolver: read error from %s (%s): %s", tr.resolver.Name, tr.dnsConnection.RemoteAddr(), err)
}
return nil
@@ -244,6 +251,10 @@ connMgmt:
if connClosing.SetToIf(false, true) {
cancelConnCtx()
tr.ReportFailure()
failCnt++
if tr.IsFailing() {
shuttingDown = true
}
log.Warningf("resolver: write error to %s (%s): %s", tr.resolver.Name, tr.dnsConnection.RemoteAddr(), err)
}
continue connMgmt
@@ -263,6 +274,7 @@ connMgmt:
if ok {
select {
case inFlight.Response <- msg:
failCnt = 0 // reset fail counter
// responded!
default:
// save to cache, if enabled
@@ -351,6 +363,11 @@ func (tr *TCPResolver) Query(ctx context.Context, q *Query) (*RRCache, error) {
return nil, ErrTimeout
}
if reply == nil {
// Resolver is shutting down, could be server failure or we are offline
return nil, ErrFailure
}
if tr.resolver.IsBlockedUpstream(reply) {
return nil, &BlockedUpstreamError{tr.resolver.GetName()}
}
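
The failCnt counter is what breaks the endless loop named in the commit message: every failed dial or read/write error increments it, a delivered response resets it, and once it reaches FailThreshold the client shuts down instead of reconnecting forever while the network is down. A simplified, self-contained sketch of that shape; connect, clientLoop and the threshold value are stand-ins, not the actual TCPResolver code.

package main

import (
	"errors"
	"fmt"
	"time"
)

// failThreshold plays the role of FailThreshold in the diff: after this many
// consecutive connection failures the client gives up instead of spinning.
const failThreshold = 3

// connect stands in for dnsClient.Dial; while the network is down it always fails.
func connect() error { return errors.New("network is unreachable") }

func clientLoop() error {
	var failCnt int
	for {
		if failCnt >= failThreshold {
			// Shut down; in the real code, waiting queries are then answered
			// with a failure (ErrFailure) instead of hanging until timeout.
			return fmt.Errorf("giving up after %d failed connection attempts", failCnt)
		}
		if err := connect(); err != nil {
			failCnt++
			time.Sleep(10 * time.Millisecond) // small pause between attempts (sketch only)
			continue
		}
		failCnt = 0 // in the diff, the counter is reset when a response is delivered
		// ... serve queries over the connection ...
		return nil
	}
}

func main() {
	fmt.Println(clientLoop())
}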