# DockFlare/app.py
import os
import sys
import logging
import re
import json
import threading
import time
from datetime import datetime, timedelta, timezone
import random
import docker
from docker.errors import NotFound, APIError
from flask import Flask, jsonify, render_template, redirect, url_for, request
from dotenv import load_dotenv
import requests
# --- Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] [%(threadName)s] %(message)s')
load_dotenv()
# Retry Config for CF PUT Tunnel Config
MAX_CF_UPDATE_RETRIES = 3
CF_UPDATE_RETRY_DELAY = 2
CF_UPDATE_BACKOFF_FACTOR = 2
# Cloudflare Config
CF_API_TOKEN = os.getenv('CF_API_TOKEN')
TUNNEL_NAME = os.getenv('TUNNEL_NAME')
CF_ACCOUNT_ID = os.getenv('CF_ACCOUNT_ID')
CF_ZONE_ID = os.getenv('CF_ZONE_ID')  # DNS zone that hosts the public hostnames
CF_API_BASE_URL = "https://api.cloudflare.com/client/v4"
CF_HEADERS = {
"Authorization": f"Bearer {CF_API_TOKEN}",
"Content-Type": "application/json",
}
# Debug aid: confirm the auth header was built, logging only a short token prefix
logging.info(f"[DEBUG] CF_HEADERS created: Authorization Header starts with 'Bearer {str(CF_API_TOKEN)[:5]}...'")
# App Config
LABEL_PREFIX = os.getenv('LABEL_PREFIX', 'cloudflare.tunnel')
GRACE_PERIOD_SECONDS = int(os.getenv('GRACE_PERIOD_SECONDS', 28800))        # default: 8 hours
CLEANUP_INTERVAL_SECONDS = int(os.getenv('CLEANUP_INTERVAL_SECONDS', 300))  # default: 5 minutes
STATE_FILE_PATH = os.getenv('STATE_FILE_PATH', '/app/data/state.json')
# Cloudflared Agent Config
CLOUDFLARED_CONTAINER_NAME = os.getenv('CLOUDFLARED_CONTAINER_NAME', f"cloudflared-agent-{TUNNEL_NAME}")
CLOUDFLARED_IMAGE = "cloudflare/cloudflared:latest"
CLOUDFLARED_NETWORK_NAME = os.getenv('CLOUDFLARED_NETWORK_NAME', 'cloudflare-net')
# Environment Variable Checks
if not CF_API_TOKEN or not TUNNEL_NAME or not CF_ACCOUNT_ID or not CF_ZONE_ID:
    logging.error("FATAL: Missing required environment variables (CF_API_TOKEN, TUNNEL_NAME, CF_ACCOUNT_ID, CF_ZONE_ID)")
    sys.exit(1)
# Docker Client Setup
try:
    docker_client = docker.from_env(timeout=10)
    docker_client.ping()
    logging.info("Successfully connected to Docker daemon.")
except Exception as e:
    logging.error(f"FATAL: Failed to connect to Docker daemon: {e}")
    docker_client = None
# Global State
tunnel_state = { "name": TUNNEL_NAME, "id": None, "token": None, "status_message": "Initializing...", "error": None }
cloudflared_agent_state = { "container_status": "unknown", "last_action_status": None }
managed_rules = {}
state_lock = threading.Lock()
stop_event = threading.Event()
# --- load_state ---
def load_state():
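    """Load managed rules from STATE_FILE_PATH into the global managed_rules.

    delete_at values are stored on disk as ISO 8601 strings and converted
    back to timezone-aware UTC datetimes here; unparsable values become
    None. Any load failure results in an empty starting state.
    """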
    global managed_rules
    state_dir = os.path.dirname(STATE_FILE_PATH)
    if not os.path.exists(state_dir):
        try:
            os.makedirs(state_dir, exist_ok=True)
            logging.info(f"Created directory for state file: {state_dir}")
        except OSError as e:
            logging.error(f"FATAL: Could not create directory for state file {state_dir}: {e}. State persistence will fail.")
            managed_rules = {}
            return
    if not os.path.exists(STATE_FILE_PATH):
        logging.info(f"State file '{STATE_FILE_PATH}' not found, starting fresh.")
        managed_rules = {}
        return
    try:
        with open(STATE_FILE_PATH, 'r') as f:
            loaded_data = json.load(f)
        for hostname, rule in loaded_data.items():
            # Convert delete_at back into a timezone-aware datetime
            if rule.get("delete_at") and isinstance(rule.get("delete_at"), str):
                try:
                    # Handle ISO 8601 with and without a trailing 'Z'
                    if rule["delete_at"].endswith('Z'):
                        rule["delete_at"] = datetime.fromisoformat(rule["delete_at"].replace('Z', '+00:00'))
                    else:
                        # Parse a potentially offset-naive string; assume UTC if naive
                        dt = datetime.fromisoformat(rule["delete_at"])
                        rule["delete_at"] = dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt.astimezone(timezone.utc)
                except ValueError as date_err:
                    logging.warning(f"Could not parse delete_at for {hostname}: {rule['delete_at']} Error: {date_err}. Setting to None.")
                    rule["delete_at"] = None
            elif not isinstance(rule.get("delete_at"), datetime):
                rule["delete_at"] = None  # Not a valid string or datetime
        managed_rules = loaded_data
        logging.info(f"Loaded state for {len(managed_rules)} rules from {STATE_FILE_PATH}")
    except (json.JSONDecodeError, IOError, OSError) as e:
        logging.error(f"Error loading state from {STATE_FILE_PATH}: {e}. Starting fresh.", exc_info=True)
        managed_rules = {}
# --- save_state ---
def save_state():
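    """Persist managed_rules to STATE_FILE_PATH as JSON, atomically.

    datetime values are serialized as UTC ISO 8601 strings (with a 'Z'
    suffix); the file is written to a temporary path and swapped in with
    os.replace() so a crash cannot leave a half-written state file.
    """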
    serializable_state = {}
    for hostname, rule in managed_rules.items():
        rule_copy = rule.copy()
        # Convert datetime to an ISO 8601 UTC string with a 'Z' suffix
        if rule_copy.get("delete_at") and isinstance(rule_copy["delete_at"], datetime):
            dt_utc = rule_copy["delete_at"].astimezone(timezone.utc)
            # Drop microseconds for cleaner output
            rule_copy["delete_at"] = dt_utc.strftime('%Y-%m-%dT%H:%M:%SZ')
        serializable_state[hostname] = rule_copy
    try:
        state_dir = os.path.dirname(STATE_FILE_PATH)
        if not os.path.exists(state_dir):
            try:
                os.makedirs(state_dir, exist_ok=True)
                logging.info(f"Created directory {state_dir} before saving state.")
            except OSError as e:
                logging.error(f"Could not create directory {state_dir} for state file: {e}. Save failed.")
                return
        temp_file_path = STATE_FILE_PATH + ".tmp"
        with open(temp_file_path, 'w') as f:
            json.dump(serializable_state, f, indent=2)
        os.replace(temp_file_path, STATE_FILE_PATH)
        logging.debug(f"Saved state for {len(managed_rules)} rules to {STATE_FILE_PATH}")
    except (IOError, OSError) as e:
        logging.error(f"Error saving state to {STATE_FILE_PATH}: {e}", exc_info=True)
# --- cf_api_request ---
def cf_api_request(method, endpoint, json_data=None, params=None):
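    """Perform an authenticated Cloudflare v4 API request and return parsed JSON.

    A 204 or empty body is returned as {"success": True, "result": None}.
    On any failure a requests.exceptions.RequestException is raised carrying
    the most specific error message available (Cloudflare error details when
    the response includes them).
    """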
    url = f"{CF_API_BASE_URL}{endpoint}"
    error_msg = None
    try:
        # CF_HEADERS is constant after init; copy defensively anyway
        request_headers = CF_HEADERS.copy()
        logging.info(f"API Request: {method} {url} Params: {params} Data: {json_data}")
        response = requests.request(method, url, headers=request_headers, json=json_data, params=params, timeout=30)
        response.raise_for_status()
        logging.info(f"API Response Status: {response.status_code}")
        # Handle 204 No Content (and other empty bodies)
        if response.status_code == 204 or not response.content:
            return {"success": True, "result": None}
        try:
            response_data = response.json()
            logging.debug(f"API Response Body (first 500 chars): {str(response_data)[:500]}")
            if isinstance(response_data, dict) and 'success' in response_data:
                if response_data['success']:
                    return response_data
                else:
                    # Extract a specific error message if available
                    cf_errors = response_data.get('errors', [])
                    if cf_errors and isinstance(cf_errors, list) and len(cf_errors) > 0 and isinstance(cf_errors[0], dict):
                        error_msg = f"API Error: {cf_errors[0].get('message', 'Unknown error')}"
                        logging.error(f"API Request Failed ({method} {url}): {error_msg} - Full Errors: {cf_errors}")
                    else:
                        error_msg = f"API reported failure but no error details provided. Response: {response_data}"
                        logging.error(f"API Request Failed ({method} {url}): {error_msg}")
                    raise requests.exceptions.RequestException(error_msg, response=response)
            else:
                # Valid JSON, but not the expected Cloudflare envelope
                logging.warning(f"API response for {method} {url} was valid JSON but missing 'success' field. Status: {response.status_code}. Body: {str(response_data)[:200]}")
                raise requests.exceptions.RequestException(f"Unexpected JSON response format from API. Status: {response.status_code}", response=response)
        except json.JSONDecodeError:
            # Response body was not JSON at all
            logging.error(f"API response for {method} {url} was not valid JSON. Status: {response.status_code}. Body: {response.text[:200]}")
            raise requests.exceptions.RequestException(f"Invalid JSON response from API. Status: {response.status_code}", response=response)
    except requests.exceptions.RequestException as e:
        if error_msg is None:  # No specific message constructed above
            logging.error(f"API Request Failed: {method} {url}")
            error_msg = f"Request Exception: {e}"
            if e.response is not None:
                try:
                    # Try to extract details from the response body
                    error_data = e.response.json()
                    logging.error(f"Response Body: {error_data}")
                    cf_errors = error_data.get('errors', [])
                    if cf_errors and isinstance(cf_errors, list) and len(cf_errors) > 0 and isinstance(cf_errors[0], dict):
                        error_msg = f"API Error: {cf_errors[0].get('message', 'Unknown error')}"
                    else:
                        # Fallback if no structured errors
                        error_msg = f"HTTP {e.response.status_code} - {e.response.text[:100]}"
                except (ValueError, AttributeError, json.JSONDecodeError):
                    # Response body isn't JSON or lacks the expected structure
                    error_msg = f"HTTP {e.response.status_code} - {e.response.text[:100]}"
            else:
                # Error happened before a response arrived (DNS failure, connection refused, ...)
                logging.error(f"Error details (no response received): {e}")
        # Surface the error in the global tunnel state when relevant
        if "cfd_tunnel" in endpoint and tunnel_state.get("id") is None and "token" not in endpoint:
            tunnel_state["error"] = error_msg
        # Re-raise with the best error message we constructed
        raise requests.exceptions.RequestException(error_msg, response=e.response)
# --- find_tunnel_via_api ---
def find_tunnel_via_api(name):
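    """Look up an existing (non-deleted) tunnel by name.

    Returns (tunnel_id, token) when found, otherwise (None, None); API
    errors are logged and also yield (None, None).
    """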
logging.info(f"[DEBUG] Entering find_tunnel_via_api for '{name}'")
endpoint = f"/accounts/{CF_ACCOUNT_ID}/cfd_tunnel"
params = {"name": name, "is_deleted": "false"}
try:
response_data = cf_api_request("GET", endpoint, params=params)
tunnels = response_data.get("result", [])
if tunnels and isinstance(tunnels, list):
tunnel = tunnels[0]
tunnel_id = tunnel.get("id")
if tunnel_id:
logging.info(f"Found existing tunnel '{name}' with ID: {tunnel_id} via API.")
token = get_tunnel_token_via_api(tunnel_id)
logging.info(f"[DEBUG] Exiting find_tunnel_via_api for '{name}' - Found ID and got Token: {bool(token)}")
return tunnel_id, token
else:
logging.warning(f"Found tunnel entry for '{name}' but it has no ID in API response: {tunnel}")
logging.info(f"[DEBUG] Exiting find_tunnel_via_api for '{name}' - Found but no ID")
return None, None
else:
logging.info(f"Tunnel '{name}' not found via API.")
logging.info(f"[DEBUG] Exiting find_tunnel_via_api for '{name}' - Not found")
return None, None
except requests.exceptions.RequestException as e:
logging.error(f"API error finding tunnel '{name}': {e}")
logging.info(f"[DEBUG] Exiting find_tunnel_via_api for '{name}' - RequestException: {e}")
return None, None
except Exception as e:
logging.error(f"Unexpected error finding tunnel '{name}': {e}", exc_info=True)
tunnel_state["error"] = f"Unexpected error finding tunnel: {e}"
logging.info(f"[DEBUG] Exiting find_tunnel_via_api for '{name}' - Unexpected Exception: {e}")
return None, None
# --- get_tunnel_token_via_api ---
def get_tunnel_token_via_api(tunnel_id):
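    """Fetch the connector token for a tunnel from its /token endpoint.

    Returns the raw token string; raises RequestException on API failure or
    ValueError when the response does not look like a token.
    """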
logging.info(f"[DEBUG] Entering get_tunnel_token_via_api for ID '{tunnel_id}'")
endpoint = f"/accounts/{CF_ACCOUNT_ID}/cfd_tunnel/{tunnel_id}/token"
url = f"{CF_API_BASE_URL}{endpoint}"
try:
request_headers = {"Authorization": f"Bearer {CF_API_TOKEN}"} # Ensure correct header is used
logging.info(f"API Request: GET {url} (for token)")
# logging.debug(f"Auth Header starts with: {request_headers.get('Authorization', 'N/A')[:15]}")
response = requests.request("GET", url, headers=request_headers, timeout=30)
response.raise_for_status()
token = response.text.strip()
if not token or len(token) < 50:
logging.error(f"Retrieved token for tunnel {tunnel_id} appears invalid (too short or empty).")
logging.info(f"[DEBUG] Exiting get_tunnel_token_via_api for ID '{tunnel_id}' - Invalid Token Format")
raise ValueError("Invalid token format received from API")
logging.info(f"Successfully retrieved token via API for tunnel {tunnel_id}")
logging.info(f"[DEBUG] Exiting get_tunnel_token_via_api for ID '{tunnel_id}' - Success")
return token
except requests.exceptions.RequestException as e:
error_msg = f"API Error getting token for tunnel {tunnel_id}: {e}"
if e.response is not None:
error_msg += f" Status: {e.response.status_code} Body: {e.response.text[:100]}"
logging.error(error_msg)
tunnel_state["error"] = error_msg
logging.info(f"[DEBUG] Exiting get_tunnel_token_via_api for ID '{tunnel_id}' - RequestException: {e}")
raise
except Exception as e:
logging.error(f"Unexpected error getting tunnel token for {tunnel_id}: {e}", exc_info=True)
tunnel_state["error"] = f"Unexpected error getting token: {e}"
logging.info(f"[DEBUG] Exiting get_tunnel_token_via_api for ID '{tunnel_id}' - Unexpected Exception: {e}")
raise
# --- create_tunnel_via_api ---
def create_tunnel_via_api(name):
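    """Create a remotely managed tunnel (config_src='cloudflare').

    Returns (tunnel_id, token) on success, (None, None) on failure.
    """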
logging.info(f"[DEBUG] Entering create_tunnel_via_api for '{name}'")
endpoint = f"/accounts/{CF_ACCOUNT_ID}/cfd_tunnel"
payload = {"name": name, "config_src": "cloudflare"}
try:
response_data = cf_api_request("POST", endpoint, json_data=payload)
result = response_data.get("result", {})
tunnel_id = result.get("id")
token = result.get("token")
if not tunnel_id or not token:
logging.error(f"API response for tunnel creation missing ID or Token: {result}")
logging.info(f"[DEBUG] Exiting create_tunnel_via_api for '{name}' - Missing ID/Token in response")
raise ValueError("Missing ID or Token in API response for tunnel creation")
logging.info(f"Successfully created tunnel '{name}' with ID {tunnel_id} via API.")
logging.info(f"[DEBUG] Exiting create_tunnel_via_api for '{name}' - Success")
return tunnel_id, token
except requests.exceptions.RequestException as e:
logging.error(f"API error creating tunnel '{name}': {e}")
logging.info(f"[DEBUG] Exiting create_tunnel_via_api for '{name}' - RequestException: {e}")
return None, None
except Exception as e:
logging.error(f"Unexpected error creating tunnel '{name}': {e}", exc_info=True)
tunnel_state["error"] = f"Unexpected error creating tunnel: {e}"
logging.info(f"[DEBUG] Exiting create_tunnel_via_api for '{name}' - Unexpected Exception: {e}")
return None, None
# --- initialize_tunnel ---
def initialize_tunnel():
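    """Find or create the configured tunnel and cache its ID and token.

    Populates the global tunnel_state (id, token, status_message, error).
    Never raises; failures are recorded in tunnel_state instead.
    """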
logging.info("[DEBUG] Entering initialize_tunnel")
tunnel_state["status_message"] = f"Checking for tunnel '{TUNNEL_NAME}' via API..."
tunnel_state["error"] = None
tunnel_id = None
token = None
try:
logging.info("[DEBUG] Calling find_tunnel_via_api...")
tunnel_id, token = find_tunnel_via_api(TUNNEL_NAME)
logging.info(f"[DEBUG] find_tunnel_via_api returned: ID={tunnel_id}, Token Present={bool(token)}")
if not tunnel_id and not tunnel_state.get("error"):
tunnel_state["status_message"] = f"Tunnel '{TUNNEL_NAME}' not found. Creating via API..."
logging.info("[DEBUG] Calling create_tunnel_via_api...")
tunnel_id, token = create_tunnel_via_api(TUNNEL_NAME)
logging.info(f"[DEBUG] create_tunnel_via_api returned: ID={tunnel_id}, Token Present={bool(token)}")
# Final check
if tunnel_id and token:
tunnel_state["id"] = tunnel_id
tunnel_state["token"] = token
tunnel_state["status_message"] = "Tunnel setup complete (using API)."
tunnel_state["error"] = None
logging.info(f"Tunnel '{TUNNEL_NAME}' initialized successfully. ID: {tunnel_id}, Token retrieved.")
elif not tunnel_state.get("error"):
tunnel_state["status_message"] = "Tunnel initialization failed."
tunnel_state["error"] = "Failed to find/create tunnel or retrieve token. Check logs."
logging.error(f"Tunnel initialization failed for '{TUNNEL_NAME}'. Could not get ID and Token.")
else:
tunnel_state["status_message"] = "Tunnel initialization failed (see error details)."
logging.error(f"Tunnel initialization failed for '{TUNNEL_NAME}' due to API error: {tunnel_state['error']}")
logging.info(f"[DEBUG] Exiting initialize_tunnel - Final State: ID={tunnel_state.get('id')}, Token Present={bool(tunnel_state.get('token'))}, Error={tunnel_state.get('error')}")
except Exception as e:
logging.error(f"Unhandled exception during tunnel initialization: {e}", exc_info=True)
if not tunnel_state.get("error"):
tunnel_state["error"] = f"Initialization failed unexpectedly: {e}"
tunnel_state["status_message"] = "Tunnel initialization failed (unexpected error)."
logging.info(f"[DEBUG] Exiting initialize_tunnel - Unhandled Exception: {e}")
# --- get_current_cf_config ---
def get_current_cf_config():
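    """Fetch the tunnel's current remote configuration.

    Returns the config dict ({} when no configuration has been set yet) or
    None when the configuration could not be fetched at all.
    """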
if not tunnel_state.get("id"):
logging.warning("Cannot get CF config, tunnel ID not available.")
return None # Indicate failure to get config
endpoint = f"/accounts/{CF_ACCOUNT_ID}/cfd_tunnel/{tunnel_state['id']}/configurations"
try:
response_data = cf_api_request("GET", endpoint)
# Check for success and presence of 'result' which contains the config
if response_data and response_data.get("success"):
result_data = response_data.get("result")
# The result should be a dict containing 'config'
if isinstance(result_data, dict):
config_data = result_data.get("config")
# 'config' itself should be a dict (can be empty) or null
if isinstance(config_data, dict):
logging.debug(f"Successfully fetched and parsed config: {config_data}")
return config_data # Return the actual config dict
elif config_data is None:
logging.info("Fetched config is null (no configuration set yet). Returning empty config.")
return {} # Return an empty dict representing no config
else:
logging.warning(f"Unexpected type for 'config' field in API response. Expected dict or null, got {type(config_data)}. Response: {response_data}")
return {} # Treat unexpected format as empty
# Handle case where result is present but null (e.g., tunnel exists but never configured)
elif result_data is None and response_data.get("success"):
logging.info("Fetched config result is null (no configuration set yet). Returning empty config.")
return {}
else:
# If 'result' key exists but isn't a dict or null
logging.warning(f"API response success but 'result' has unexpected format or is missing. Response: {response_data}")
return {} # Treat unexpected format as empty
else:
# API request failed or didn't return success
logging.error(f"get_current_cf_config: cf_api_request did not return success or expected data. Response: {response_data}")
return None # Indicate failure
except requests.exceptions.RequestException as e:
logging.error(f"API error fetching config for tunnel {tunnel_state['id']}: {e}")
# Update global error state only if it's not already set to a more specific API error
if not tunnel_state.get("error") or "API Error" not in tunnel_state["error"]:
tunnel_state["error"] = f"Failed get tunnel config: {e}"
return None # Indicate failure
except Exception as e:
logging.error(f"Unexpected exception in get_current_cf_config: {e}", exc_info=True)
if not tunnel_state.get("error"): tunnel_state["error"] = f"Unexpected error getting tunnel config: {e}"
return None
# --- find_dns_record_id ---
def find_dns_record_id(zone_id, hostname, tunnel_id):
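    """Find the CNAME record for hostname that points at this tunnel.

    Returns the DNS record ID, or None when no matching record exists or
    the lookup fails.
    """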
    if not zone_id or not hostname or not tunnel_id:
        logging.error("find_dns_record_id: Missing required arguments.")
        return None
    # The CNAME content we expect the record to have
    expected_content = f"{tunnel_id}.cfargotunnel.com"
    endpoint = f"/zones/{zone_id}/dns_records"
    params = {
        "type": "CNAME",
        "name": hostname,             # The public hostname
        "content": expected_content,  # The target the CNAME should point to
        "match": "all"                # All parameters must match
    }
    try:
        logging.info(f"Searching for DNS record: Type=CNAME, Name={hostname}, Content={expected_content}")
        response_data = cf_api_request("GET", endpoint, params=params)
        results = response_data.get("result", [])
        if results and isinstance(results, list):
            # At least one matching record found
            record_id = results[0].get("id")
            if record_id:
                logging.info(f"Found DNS record for {hostname} with ID: {record_id}")
                return record_id
            else:
                # Record found but lacks an ID (unlikely but possible)
                logging.warning(f"Found matching DNS record entry for {hostname}, but it lacks an ID: {results[0]}")
                return None
        else:
            # No matching record
            logging.info(f"No matching DNS record found for hostname: {hostname}")
            return None
    except requests.exceptions.RequestException as e:
        logging.error(f"API error finding DNS record for {hostname}: {e}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error finding DNS record for {hostname}: {e}", exc_info=True)
        return None
# --- create_cloudflare_dns_record ---
def create_cloudflare_dns_record(zone_id, hostname, tunnel_id):
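    """Ensure a proxied CNAME for hostname points at the tunnel (idempotent).

    Returns the existing record's ID when one already matches, otherwise the
    newly created record's ID; None on failure.
    """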
    if not zone_id or not hostname or not tunnel_id:
        logging.error("create_cloudflare_dns_record: Missing required arguments.")
        return None  # Indicate failure
    record_name = hostname  # The public FQDN
    record_content = f"{tunnel_id}.cfargotunnel.com"  # The tunnel CNAME target
    endpoint = f"/zones/{zone_id}/dns_records"
    payload = {
        "type": "CNAME",
        "name": record_name,
        "content": record_content,
        "ttl": 1,        # 1 means 'Automatic' TTL
        "proxied": True  # Route traffic through the Cloudflare proxy
    }
    try:
        # Check whether the exact record already exists
        existing_id = find_dns_record_id(zone_id, hostname, tunnel_id)
        if existing_id:
            logging.info(f"DNS CNAME record for {hostname} pointing to {record_content} already exists (ID: {existing_id}). No action needed.")
            return existing_id  # Return the existing ID
        # Not found, so create it
        logging.info(f"Creating DNS CNAME record: Name={record_name}, Content={record_content}, Proxied=True")
        response_data = cf_api_request("POST", endpoint, json_data=payload)
        result = response_data.get("result", {})
        new_record_id = result.get("id")
        if new_record_id:
            logging.info(f"Successfully created DNS record for {hostname}. New ID: {new_record_id}")
            return new_record_id  # Return the newly created ID
        else:
            # API reported success but returned no ID
            logging.error(f"DNS record creation for {hostname} succeeded according to API status, but no ID was returned in result: {result}")
            return None  # Indicate failure
    except requests.exceptions.RequestException as e:
        logging.error(f"API error creating DNS record for {hostname}: {e}")
        return None  # Indicate failure
    except Exception as e:
        logging.error(f"Unexpected error creating DNS record for {hostname}: {e}", exc_info=True)
        return None  # Indicate failure
# --- delete_cloudflare_dns_record ---
def delete_cloudflare_dns_record(zone_id, hostname, tunnel_id):
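    """Delete the tunnel-pointing CNAME for hostname, if present.

    Returns True when the record is gone (deleted, never existed, or the API
    answered 404); False on any other failure.
    """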
    if not zone_id or not hostname or not tunnel_id:
        logging.error("delete_cloudflare_dns_record: Missing required arguments.")
        return False  # Failure
    # Find the specific record ID to delete; tunnel_id ensures we only
    # delete the CNAME pointing at *our* tunnel
    dns_record_id = find_dns_record_id(zone_id, hostname, tunnel_id)
    if not dns_record_id:
        # Record doesn't exist (or doesn't point at our tunnel): desired state reached
        logging.warning(f"Could not find DNS record for {hostname} pointing to tunnel {tunnel_id} to delete. Assuming already deleted or never created.")
        return True
    # Record found; delete it
    logging.info(f"Attempting to delete DNS record for {hostname} (ID: {dns_record_id})")
    endpoint = f"/zones/{zone_id}/dns_records/{dns_record_id}"
    try:
        cf_api_request("DELETE", endpoint)
        logging.info(f"Successfully deleted DNS record for {hostname} (ID: {dns_record_id}).")
        return True  # Success
    except requests.exceptions.RequestException as e:
        # A 404 means the record was already gone
        if e.response is not None and e.response.status_code == 404:
            logging.warning(f"Attempted to delete DNS record {dns_record_id} for {hostname}, but API returned 404 (already deleted?). Treating as success.")
            return True
        logging.error(f"API error deleting DNS record {dns_record_id} for {hostname}: {e}")
        return False  # Failure
    except Exception as e:
        logging.error(f"Unexpected error deleting DNS record {dns_record_id} for {hostname}: {e}", exc_info=True)
        return False  # Failure
# --- update_cloudflare_config ---
def update_cloudflare_config():
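    """Sync the tunnel's ingress rules with the active managed_rules.

    Builds the desired rule list (active rules plus the mandatory
    http_status:404 catch-all), compares it canonically against the remote
    config, and PUTs only when they differ, retrying transient failures with
    exponential backoff and jitter. Returns True when Cloudflare matches the
    desired state afterwards (including the no-op case), False otherwise.
    """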
    if not tunnel_state.get("id"):
        logging.warning("Cannot update Cloudflare config, tunnel ID not available.")
        return False
    final_ingress_rules = None
    needs_api_update = False  # Set True only when a PUT request is necessary
    # Lock state while building the desired config and comparing
    with state_lock:
        logging.info("Preparing potential Cloudflare tunnel configuration update...")
        # Build the list of desired ingress rules from the current active state
        desired_ingress_rules = []
        # The mandatory catch-all rule must terminate the ingress list
        catch_all_rule = {"service": "http_status:404"}
        for hostname, rule_details in managed_rules.items():
            # Only include rules marked as 'active'
            if rule_details.get("status") == "active":
                service = rule_details.get("service")
                if service:  # Ensure the service detail exists
                    desired_rule = {"hostname": hostname, "service": service}
                    # Optional: add path filtering here if needed in the future
                    desired_ingress_rules.append(desired_rule)
                else:
                    logging.warning(f"Managed rule for '{hostname}' is active but missing 'service' detail. Skipping.")
        # Sort desired rules by hostname for consistent comparison and ordering
        desired_ingress_rules.sort(key=lambda x: x.get("hostname", ""))
        # Fetch the current configuration from Cloudflare for comparison
        logging.debug("Fetching current Cloudflare config for comparison...")
        current_config = get_current_cf_config()
        if current_config is None:  # Fetching failed
            logging.error("Failed to fetch current Cloudflare config within lock, aborting update.")
            return False  # Cannot compare without the current config
        # Extract the current ingress rules, excluding the 404 catch-all
        current_cf_ingress = [rule for rule in current_config.get("ingress", [])
                              if rule.get("service") != catch_all_rule["service"]]
        # --- Comparison logic ---
        # Convert rule lists to a comparable canonical form (sets of tuples)
        # so ordering does not matter and only content is compared.
        def rule_to_canonical(rule):
            # Include hostname and service; add path if/when implemented
            items = sorted([(k, v) for k, v in rule.items() if k in ["hostname", "service"]])
            return tuple(items)
        try:
            current_cf_set = {rule_to_canonical(rule) for rule in current_cf_ingress if rule.get("hostname") and rule.get("service")}
            desired_set = {rule_to_canonical(rule) for rule in desired_ingress_rules if rule.get("hostname") and rule.get("service")}
        except Exception as e:
            # Catch errors during set creation (e.g., unexpected data types)
            logging.error(f"Error creating canonical rule sets for comparison: {e}", exc_info=True)
            return False
        if current_cf_set == desired_set:
            logging.info("No changes detected between managed state and Cloudflare config. Skipping API update.")
            needs_api_update = False
        else:
            logging.info("Change detected. Desired ingress rules differ from current Cloudflare config.")
            logging.debug(f"Current CF rules (non-404, canonical): {current_cf_set}")
            logging.debug(f"Desired rules (from state, canonical): {desired_set}")
            needs_api_update = True
            # The final list is the desired rules followed by the catch-all
            final_ingress_rules = desired_ingress_rules + [catch_all_rule]
            # Optional: add originRequest config here if needed
    # --- API update logic (outside the lock) ---
    if needs_api_update and final_ingress_rules is not None:
        endpoint = f"/accounts/{CF_ACCOUNT_ID}/cfd_tunnel/{tunnel_state['id']}/configurations"
        # The payload requires the full config structure
        payload = {"config": {"ingress": final_ingress_rules}}
        last_exception = None  # Last error, kept for reporting
        # Retry loop with exponential backoff
        for attempt in range(MAX_CF_UPDATE_RETRIES + 1):
            try:
                logging.info(f"Attempting to push config to Cloudflare (Attempt {attempt + 1}/{MAX_CF_UPDATE_RETRIES + 1})...")
                cf_api_request("PUT", endpoint, json_data=payload)
                logging.info("Successfully updated Cloudflare tunnel configuration via API.")
                cloudflared_agent_state["last_action_status"] = f"Cloudflare config updated successfully at {datetime.now(timezone.utc).isoformat()}"
                # Clear stale config-update errors from the global state
                if tunnel_state.get("error") and ("Failed update tunnel config" in tunnel_state["error"] or "API Error" in tunnel_state["error"]):
                    logging.info(f"Clearing previous API error after successful update: {tunnel_state['error']}")
                    tunnel_state["error"] = None
                return True  # Success
            except requests.exceptions.RequestException as e:
                last_exception = e
                status_code = e.response.status_code if e.response is not None else None
                logging.warning(f"Cloudflare API update attempt {attempt + 1} failed: {e} (Status Code: {status_code})")
                # Decide whether the error is likely transient
                is_retryable = False
                if isinstance(e, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)):
                    is_retryable = True  # Network issues are often transient
                elif status_code in [429, 500, 502, 503, 504]:  # Rate limits, server errors
                    is_retryable = True
                if is_retryable and attempt < MAX_CF_UPDATE_RETRIES:
                    # Exponential backoff with +/- 20% jitter to avoid a thundering herd
                    wait_time = CF_UPDATE_RETRY_DELAY * (CF_UPDATE_BACKOFF_FACTOR ** attempt)
                    wait_time *= (1 + random.uniform(-0.2, 0.2))
                    wait_time = max(1, wait_time)  # Wait at least 1 second
                    # Respect a Retry-After header on 429s
                    if status_code == 429 and e.response is not None:
                        retry_after = e.response.headers.get("Retry-After")
                        if retry_after:
                            try:
                                retry_after_seconds = int(retry_after)
                                logging.info(f"Cloudflare API rate limit hit. Respecting Retry-After header: {retry_after_seconds}s")
                                # Use the *longer* of the calculated backoff or Retry-After
                                wait_time = max(wait_time, retry_after_seconds)
                            except ValueError:
                                logging.warning(f"Could not parse Retry-After header value '{retry_after}'. Using calculated backoff ({wait_time:.1f}s).")
                    logging.info(f"Retrying Cloudflare update in {wait_time:.1f} seconds...")
                    # Wait, but allow interruption by the stop event
                    interrupted = stop_event.wait(wait_time)
                    if interrupted:
                        logging.warning("Shutdown requested during Cloudflare update retry wait. Aborting.")
                        cloudflared_agent_state["last_action_status"] = "Error: CF update aborted during retry (shutdown)."
                        if not tunnel_state.get("error") or "API Error" not in tunnel_state["error"]:
                            tunnel_state["error"] = "Failed update tunnel config: aborted during retry"
                        return False  # Stop retrying and signal failure
                    continue  # Next retry attempt
                else:
                    # Not retryable, or retries exhausted
                    logging.error(f"Cloudflare API update failed and will not be retried (Retryable: {is_retryable}, Attempt: {attempt + 1}).")
                    break  # Exit the retry loop
            except Exception as e:  # Unexpected errors during the PUT request
                last_exception = e
                logging.error(f"Unexpected error during Cloudflare API update attempt {attempt + 1}: {e}", exc_info=True)
                break  # Exit the retry loop
        # All attempts failed if the loop finished without returning True
        logging.error(f"Failed to update Cloudflare tunnel configuration after {MAX_CF_UPDATE_RETRIES + 1} attempts.")
        error_message = f"Failed update tunnel config after retries: {last_exception}"
        cloudflared_agent_state["last_action_status"] = f"Error: {error_message}"
        if not tunnel_state.get("error") or "API Error" not in tunnel_state["error"]:
            tunnel_state["error"] = error_message
        return False  # Signal failure
    elif needs_api_update and final_ingress_rules is None:
        # Should not happen if the logic above is correct
        logging.error("Internal error: Needs API update but final_ingress_rules not set.")
        return False
    else:
        # No update was needed
        return True
# --- process_container_start ---
def process_container_start(container):
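    """Handle a container 'start' event.

    Reads the container's management labels, for example (hostname and
    service values are illustrative): cloudflare.tunnel.enable=true,
    cloudflare.tunnel.hostname=app.example.com,
    cloudflare.tunnel.service=http://app:8080. Validates them, creates or
    reactivates the managed rule, and pushes the tunnel config and DNS
    record when something changed.
    """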
    if not container:
        return
    try:
        container_id = container.id
        # Reload container info so labels are fresh
        try:
            container.reload()
        except NotFound:
            # Container removed very quickly after starting
            logging.warning(f"Container {container_id[:12]} not found when processing start event (likely stopped quickly).")
            return
        labels = container.labels
        container_name = container.name
        # Labels this controller looks for
        enabled_label = f"{LABEL_PREFIX}.enable"
        hostname_label = f"{LABEL_PREFIX}.hostname"
        service_label = f"{LABEL_PREFIX}.service"
        # path_label = f"{LABEL_PREFIX}.path"  # reserved for future path support
        # Extract and validate labels
        is_enabled = labels.get(enabled_label, "false").lower() in ["true", "1", "t", "yes"]
        hostname = labels.get(hostname_label)
        service = labels.get(service_label)
        # Should this container be managed at all?
        if not is_enabled:
            logging.debug(f"Ignoring start event for container {container_name} ({container_id[:12]}): '{enabled_label}' is not 'true'.")
            return
        # Mandatory labels present?
        if not hostname or not service:
            logging.warning(f"Ignoring start event for container {container_name} ({container_id[:12]}): Missing required labels '{hostname_label}' or '{service_label}'.")
            return
        # Basic hostname validation (adjust regex if needed for specific TLDs/IDNs)
        if not re.match(r"^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+$", hostname):
            logging.warning(f"Ignoring start event for container {container_name} ({container_id[:12]}): Invalid hostname format '{hostname}'. Must be a valid FQDN.")
            return
        # Basic service validation: http/https/tcp/unix schemes OR simple host:port (e.g. myapp:8080)
        if not (re.match(r"^(https?|tcp|unix)://", service) or re.match(r"^[a-zA-Z0-9._-]+:\d+$", service)):
            logging.warning(f"Ignoring start event for {container_name} ({container_id[:12]}): Invalid service format '{service}'. Needs scheme (http/https/tcp/unix)://... or be host_or_container_name:port.")
            return
        logging.info(f"Detected start for managed container: {container_name} ({container_id[:12]}) - Hostname: {hostname}, Service: {service}")
        needs_cf_update = False        # Does the Cloudflare config need an API update?
        state_changed_locally = False  # Does the local state.json need saving?
        # --- State update logic (within lock) ---
        with state_lock:
            existing_rule = managed_rules.get(hostname)
            if existing_rule:
                # A rule for this hostname already exists
                if existing_rule.get("status") == "pending_deletion":
                    # Container restarted while the rule was pending deletion: reactivate it
                    logging.info(f"Rule for {hostname} was pending deletion. Reactivating.")
                    existing_rule["status"] = "active"
                    existing_rule["delete_at"] = None
                    existing_rule["service"] = service  # Service may have changed
                    existing_rule["container_id"] = container_id
                    state_changed_locally = True
                    needs_cf_update = True  # Push the reactivated rule to CF
                elif existing_rule.get("status") == "active":
                    # Rule already active; check whether details changed
                    service_changed = existing_rule.get("service") != service
                    container_changed = existing_rule.get("container_id") != container_id
                    if container_changed:
                        # A different container now serves this hostname
                        logging.info(f"Updating container ID for active rule {hostname}: '{existing_rule.get('container_id', 'N/A')[:12]}' -> '{container_id[:12]}'.")
                        existing_rule["container_id"] = container_id
                        state_changed_locally = True
                        # No CF update needed for a container ID change alone
                    if service_changed:
                        logging.info(f"Updating service for active rule {hostname}: '{existing_rule.get('service')}' -> '{service}'.")
                        existing_rule["service"] = service
                        state_changed_locally = True
                        needs_cf_update = True  # Service change requires a CF config update
                    elif not state_changed_locally:
                        # Container started, rule active, nothing changed
                        logging.info(f"Container start event for {hostname}, but rule is already active with same details.")
            else:
                # New hostname to manage
                logging.info(f"Adding new active rule for hostname: {hostname}")
                managed_rules[hostname] = {
                    "service": service,
                    "container_id": container_id,
                    "status": "active",
                    "delete_at": None
                }
                state_changed_locally = True
                needs_cf_update = True  # Adding a rule requires a CF update
            # Save state if any local changes were made
            if state_changed_locally:
                logging.debug(f"Local state changed for {hostname}, saving state file...")
                save_state()
        # --- Cloudflare update logic (outside lock) ---
        if needs_cf_update:
            logging.info(f"Triggering Cloudflare config update due to change for {hostname}.")
            if update_cloudflare_config():
                logging.info(f"Tunnel config update successful for {hostname}.")
                # After a successful config update, ensure the DNS record exists
                if tunnel_state.get("id") and CF_ZONE_ID:
                    dns_record_id = create_cloudflare_dns_record(CF_ZONE_ID, hostname, tunnel_state["id"])
                    if dns_record_id:
                        logging.info(f"DNS record management successful for {hostname}.")
                    else:
                        # Problem state: config updated but DNS failed
                        logging.error(f"CRITICAL: Tunnel config updated for {hostname} but failed to create/verify DNS record!")
                        cloudflared_agent_state["last_action_status"] = f"Error: Failed creating DNS record for {hostname} after tunnel update."
                else:
                    logging.error("Missing Tunnel ID or Zone ID - cannot manage DNS record.")
            else:
                # Config update failed (retries exhausted); CF is out of sync,
                # reconciliation should fix it later
                logging.error(f"Failed to update Cloudflare tunnel config after processing start for {hostname}. DNS record not managed.")
        elif state_changed_locally:
            # Only local state changed (e.g., container ID update); no CF push needed
            logging.debug(f"Local state updated for {hostname} (e.g., container ID), no Cloudflare config change needed.")
    except NotFound:
        # Container disappeared during processing
        logging.warning(f"Container {container_id[:12] if 'container_id' in locals() else 'Unknown'} not found during start processing.")
    except APIError as e:
        logging.error(f"Docker API error processing container start ({container_id[:12] if 'container_id' in locals() else 'Unknown'}): {e}", exc_info=True)
    except Exception as e:
        logging.error(f"Unexpected error processing container start ({container_id[:12] if 'container_id' in locals() else 'Unknown'}): {e}", exc_info=True)
# --- schedule_container_stop ---
def schedule_container_stop(container_id):
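    """Mark the rule served by a stopped container as pending deletion.

    Nothing is removed immediately: delete_at is set to now +
    GRACE_PERIOD_SECONDS and the cleanup task performs the actual removal,
    so brief container restarts do not churn the Cloudflare config.
    """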
    if not container_id:
        return
    logging.info(f"Processing stop event for container {container_id[:12]}. Checking for managed rules.")
    hostname_to_schedule = None
    state_changed = False
    # Lock state while modifying rule status
    with state_lock:
        # Does this container manage an *active* rule?
        for hn, details in managed_rules.items():
            if details.get("container_id") == container_id and details.get("status") == "active":
                hostname_to_schedule = hn
                break  # Assume one container manages one hostname for now
        if hostname_to_schedule:
            logging.info(f"Container {container_id[:12]} managed active rule for {hostname_to_schedule}. Marking for deletion.")
            rule = managed_rules[hostname_to_schedule]
            # Guard against duplicate stop events
            if rule.get("status") != "pending_deletion":
                rule["status"] = "pending_deletion"
                # Deletion time = now + grace period
                rule["delete_at"] = datetime.now(timezone.utc) + timedelta(seconds=GRACE_PERIOD_SECONDS)
                logging.info(f"Rule for {hostname_to_schedule} scheduled for deletion at {rule['delete_at'].isoformat()}")
                state_changed = True
            else:
                # Already pending; leave the existing delete_at in place
                logging.info(f"Rule for {hostname_to_schedule} was already pending deletion.")
        else:
            # The stopped container wasn't managing an active rule
            logging.info(f"Stop event for container {container_id[:12]}, but it didn't manage any active rule in the current state.")
    # Save state if a rule was marked for deletion
    if state_changed:
        save_state()
    # Cloudflare is not updated here; the cleanup task handles actual removal.
# --- docker_event_listener ---
def docker_event_listener():
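    """Thread target: react to Docker container lifecycle events.

    'start' events are routed to process_container_start(); 'stop', 'die',
    'destroy' and 'kill' events to schedule_container_stop(). The event
    stream is reconnected on errors, giving up after max_errors consecutive
    failures.
    """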
    if not docker_client:
        logging.error("Docker client unavailable, event listener cannot start.")
        return
    logging.info("Starting Docker event listener...")
    error_count = 0
    max_errors = 5  # Max consecutive errors before giving up
    while not stop_event.is_set() and error_count < max_errors:
        try:
            # Stream events from now onwards; 'since' avoids replaying
            # past events after a reconnect
            logging.info("Connecting to Docker event stream...")
            events = docker_client.events(decode=True, since=int(time.time()))
            logging.info("Successfully connected to Docker event stream.")
            error_count = 0  # Reset on successful connection
            for event in events:
                if stop_event.is_set():
                    logging.info("Stop event received, exiting Docker event listener loop.")
                    break  # Exit inner loop
                # Extract event details
                ev_type = event.get("Type")
                action = event.get("Action")
                actor = event.get("Actor", {})
                cont_id = actor.get("ID")
                logging.debug(f"Docker Event: Type={ev_type}, Action={action}, ActorID={cont_id[:12] if cont_id else 'N/A'}")
                # Only container events with an ID are interesting
                if ev_type == "container" and cont_id:
                    if action == "start":
                        try:
                            # Get the container object to access its labels
                            container = docker_client.containers.get(cont_id)
                            process_container_start(container)
                        except NotFound:
                            # Container stopped/removed very quickly after starting
                            logging.warning(f"Container {cont_id[:12]} not found shortly after 'start' event.")
                        except APIError as e:
                            logging.error(f"Docker API error getting container {cont_id[:12]} after start event: {e}")
                        except Exception as e:
                            logging.error(f"Error processing start event for {cont_id[:12]}: {e}", exc_info=True)
                    elif action in ["stop", "die", "destroy", "kill"]:
                        # All of these mean the container is no longer running
                        try:
                            schedule_container_stop(cont_id)
                        except Exception as e:
                            logging.error(f"Error processing stop/die/destroy/kill event for {cont_id[:12]}: {e}", exc_info=True)
        # Errors on the event stream connection itself
        except requests.exceptions.ConnectionError as e:
            error_count += 1
            logging.error(f"Connection error with Docker daemon in event listener: {e}. Attempting reconnect ({error_count}/{max_errors})...")
            stop_event.wait(min(30, 5 * error_count))  # Capped backoff before retry
        except APIError as e:
            # Errors from the Docker daemon itself (e.g., permissions)
            error_count += 1
            logging.error(f"Docker API error in event listener stream: {e}. Attempting reconnect ({error_count}/{max_errors})...")
            stop_event.wait(min(30, 5 * error_count))
        except Exception as e:
            error_count += 1
            logging.error(f"Unexpected error in Docker event listener: {e}. Attempting reconnect ({error_count}/{max_errors})...", exc_info=True)
            stop_event.wait(min(30, 5 * error_count))
        if stop_event.is_set():
            break  # Exit outer loop on shutdown
    if error_count >= max_errors:
        logging.error("Docker event listener stopping after multiple connection/API errors.")
    logging.info("Docker event listener stopped.")
# --- cleanup_expired_rules ---
def cleanup_expired_rules():
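    """Thread target: permanently remove rules whose grace period expired.

    Every CLEANUP_INTERVAL_SECONDS, collects 'pending_deletion' rules past
    their delete_at, deletes their DNS records, pushes a tunnel config
    without them, and only then drops them from local state.
    """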
logging.info("Starting cleanup task...")
while not stop_event.is_set():
next_check_time = time.time() + CLEANUP_INTERVAL_SECONDS
try:
logging.debug("Running cleanup check for expired rules...")
hostnames_to_process_for_deletion = []
now_utc = datetime.now(timezone.utc)
state_changed_in_cleanup = False # Track if state file needs saving
# --- Identify Expired Rules (within lock) ---
with state_lock:
for hostname, details in managed_rules.items():
# Only consider rules marked for deletion
if details.get("status") == "pending_deletion":
delete_at = details.get("delete_at")
is_expired = False
if isinstance(delete_at, datetime):
# Ensure comparison is timezone-aware (should be UTC)
delete_at_utc = delete_at.astimezone(timezone.utc)
if delete_at_utc <= now_utc:
logging.info(f"Rule for {hostname} deletion grace period expired ({delete_at_utc.isoformat()}). Scheduling for full deletion.")
is_expired = True
else:
# Handle invalid or missing delete_at time - delete immediately
logging.warning(f"Rule {hostname} is pending_deletion but delete_at is invalid or missing: {delete_at}. Scheduling for immediate full deletion.")
is_expired = True
if is_expired:
hostnames_to_process_for_deletion.append(hostname)
# --- Process Deletions (outside lock) ---
if hostnames_to_process_for_deletion:
logging.info(f"Processing cleanup for: {hostnames_to_process_for_deletion}")
processed_hostnames_for_cf_update = [] # Hostnames successfully processed for CF update
dns_delete_success_all = True # Track if all DNS deletions worked
# Step 1: Delete DNS records first
for hostname in hostnames_to_process_for_deletion:
if tunnel_state.get("id") and CF_ZONE_ID:
logging.info(f"Attempting DNS record deletion for expired rule: {hostname}")
if delete_cloudflare_dns_record(CF_ZONE_ID, hostname, tunnel_state["id"]):
# DNS delete successful (or record didn't exist)
processed_hostnames_for_cf_update.append(hostname)
else:
# DNS delete failed, log error but proceed with CF update attempt
logging.error(f"Failed to delete DNS record for {hostname}. Tunnel config update will proceed, but DNS record may remain stale.")
dns_delete_success_all = False
# Still add to processed list so we try to remove from CF config
processed_hostnames_for_cf_update.append(hostname)
else:
# Cannot delete DNS if tunnel/zone ID missing
logging.error(f"Cannot delete DNS for {hostname}: Missing Tunnel ID or Zone ID.")
dns_delete_success_all = False
# Don't add to processed_hostnames_for_cf_update? Or add and let CF update fail?
# Let's add it, update_cloudflare_config will try to remove it based on state.
# Step 2: Update Cloudflare config (implicitly removes rules not in active state)
if processed_hostnames_for_cf_update:
logging.info(f"Attempting Cloudflare tunnel config update to remove rules corresponding to: {processed_hostnames_for_cf_update}")
# update_cloudflare_config uses the current state (where these are 'pending_deletion')
# to build the desired config (which won't include them)
if update_cloudflare_config():
logging.info(f"Cloudflare tunnel config updated successfully. Removing rules from local state: {processed_hostnames_for_cf_update}")
# Step 3: Remove from local state only after successful CF update
with state_lock:
deleted_count = 0
for hostname in processed_hostnames_for_cf_update:
# Double-check rule exists and is still pending before deleting
if hostname in managed_rules and managed_rules[hostname].get("status") == "pending_deletion":
del managed_rules[hostname]
deleted_count += 1
state_changed_in_cleanup = True
else:
# Log if rule disappeared or status changed unexpectedly
logging.warning(f"Rule {hostname} was scheduled for removal but not found or no longer 'pending_deletion' when removing from state.")
logging.info(f"Removed {deleted_count} rules from local state.")
if state_changed_in_cleanup:
save_state()
else:
# CF update failed - log error, state remains unchanged. Will retry next cycle.
logging.error("Failed to update Cloudflare tunnel config during rule cleanup. Rules remain in local state (pending_deletion) and potentially in Cloudflare. Will retry on next cleanup/reconcile cycle.")
else:
logging.info("No hostnames ended up being processed for deletion (e.g., DNS prerequisites failed).")
else:
# No rules were found to be expired in this cycle
logging.debug("No expired rules found requiring cleanup.")
except Exception as e:
# Catch unexpected errors in the cleanup loop itself
logging.error(f"Error in cleanup task loop: {e}", exc_info=True)
# Wait until the next scheduled check time, respecting the stop event
wait_time = max(0, next_check_time - time.time())
stop_event.wait(wait_time)
logging.info("Cleanup task stopped.")
# --- reconcile_state ---
def reconcile_state():
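    """Bring local state, running containers, and Cloudflare into agreement.

    Scans running containers for management labels, adds/reactivates/updates
    rules to match them, schedules rules whose containers are gone, then
    compares active rules against the remote tunnel config and pushes an
    update (and ensures DNS records) when they have drifted apart.
    """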
if not docker_client:
logging.warning("Docker client unavailable, skipping reconciliation.")
return
if not tunnel_state.get("id"):
logging.warning("Tunnel not initialized (no ID), skipping reconciliation.")
return
logging.info("Starting state reconciliation...")
needs_cf_update = False
state_changed_locally = False
try:
# --- Get Current Docker State ---
running_labeled_containers = {} # Dict: hostname -> {service, container_id, container_name}
try:
# List all running containers
containers = docker_client.containers.list(sparse=False) # sparse=False gets more details like labels
logging.debug(f"[Reconcile] Found {len(containers)} running containers.")
for c in containers:
try:
# Extract labels and relevant info
labels = c.labels
container_id = c.id
container_name = c.name
enabled_label = f"{LABEL_PREFIX}.enable"
hostname_label = f"{LABEL_PREFIX}.hostname"
service_label = f"{LABEL_PREFIX}.service"
# path_label = f"{LABEL_PREFIX}.path" # If path support added
is_enabled = labels.get(enabled_label, "false").lower() in ["true", "1", "t", "yes"]
hostname = labels.get(hostname_label)
service = labels.get(service_label)
# path = labels.get(path_label) # If path support added
# Process only if enabled and has required labels + valid format
if is_enabled and hostname and service:
# Apply same validation as in process_container_start
if not re.match(r"^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+$", hostname): continue # Skip invalid hostname
if not (re.match(r"^(https?|tcp|unix)://", service) or re.match(r"^[a-zA-Z0-9._-]+:\d+$", service)): continue # Skip invalid service
# Handle potential duplicate hostnames (warn and take the last one found)
if hostname in running_labeled_containers:
logging.warning(f"[Reconcile] Duplicate hostname label '{hostname}' found on container {container_name} ({container_id[:12]}) and container {running_labeled_containers[hostname]['container_name']} ({running_labeled_containers[hostname]['container_id'][:12]}). Using the latest one found ({container_name}).")
running_labeled_containers[hostname] = {
"service": service,
"container_id": container_id,
"container_name": container_name
# "path": path # If path support added
}
except NotFound:
# Container disappeared between list and get details
logging.warning(f"[Reconcile] Container {c.id[:12]} listed but then not found during processing. Skipping.")
continue
except APIError as e:
# Error getting details for a specific container
logging.error(f"[Reconcile] Docker API error processing container {c.id[:12]}: {e}. Skipping.")
continue
logging.info(f"[Reconcile] Found {len(running_labeled_containers)} running containers with valid management labels.")
except APIError as e:
# Error listing containers
logging.error(f"[Reconcile] Docker API error listing containers: {e}. Aborting reconciliation.")
return
except requests.exceptions.ConnectionError as e:
# Error connecting to Docker daemon
logging.error(f"[Reconcile] Failed to connect to Docker daemon while listing containers: {e}. Aborting reconciliation.")
return
# --- Compare Docker State with Local State (within lock) ---
with state_lock:
logging.debug("[Reconcile] Acquired state lock.")
now_utc = datetime.now(timezone.utc)
managed_hostnames = set(managed_rules.keys())
running_hostnames = set(running_labeled_containers.keys())
hostnames_requiring_dns_check = [] # Track hostnames added/reactivated
# 1. Check running containers against managed rules
for hostname, running_details in running_labeled_containers.items():
if hostname in managed_rules:
# Existing rule, check status and details
rule = managed_rules[hostname]
if rule.get("status") == "pending_deletion":
# Container is running, but rule was pending delete -> Reactivate
logging.info(f"[Reconcile] Hostname {hostname} is running but rule was pending deletion. Reactivating.")
rule["status"] = "active"
rule["delete_at"] = None
rule["service"] = running_details["service"] # Update details
rule["container_id"] = running_details["container_id"]
# rule["path"] = running_details["path"] # If path added
state_changed_locally = True
needs_cf_update = True
hostnames_requiring_dns_check.append(hostname) # Ensure DNS exists
elif rule.get("status") == "active":
# Already active, check if details match running container
container_changed = rule.get("container_id") != running_details["container_id"]
service_changed = rule.get("service") != running_details["service"]
# path_changed = rule.get("path") != running_details["path"] # If path added
if container_changed:
logging.info(f"[Reconcile] Updating container ID for active rule {hostname}.")
rule["container_id"] = running_details["container_id"]
state_changed_locally = True
# No CF update needed just for container ID change
if service_changed: # or path_changed:
logging.info(f"[Reconcile] Updating service/path for active rule {hostname}.")
rule["service"] = running_details["service"]
# rule["path"] = running_details["path"] # If path added
state_changed_locally = True
needs_cf_update = True # Service/path change needs CF push
# elif path_changed: ... # if only path changed
else:
# Running container has labels, but no managed rule exists -> Add new rule
logging.info(f"[Reconcile] Found running container for {hostname} but no managed rule. Adding new active rule.")
managed_rules[hostname] = {
"service": running_details["service"],
"container_id": running_details["container_id"],
"status": "active",
"delete_at": None
# "path": running_details["path"] # If path added
}
state_changed_locally = True
needs_cf_update = True
hostnames_requiring_dns_check.append(hostname) # Ensure DNS exists
# 2. Check managed rules against running containers
for hostname in list(managed_hostnames): # Iterate copy as we might modify dict
if hostname not in running_hostnames:
# Rule exists locally, but no container running for it
if hostname in managed_rules: # Double check needed if deletion happens concurrently
rule = managed_rules[hostname]
if rule.get("status") == "active":
# Rule is active but container is gone -> Schedule deletion
logging.info(f"[Reconcile] Managed rule {hostname} is active but no container found running. Scheduling deletion.")
rule["status"] = "pending_deletion"
rule["delete_at"] = now_utc + timedelta(seconds=GRACE_PERIOD_SECONDS)
state_changed_locally = True
# No CF update needed here, cleanup task handles removal later
# 3. Compare Local State with Cloudflare State (Optional but recommended)
# This ensures CF config matches our active rules, catching drift
logging.debug("[Reconcile] Fetching current CF config for final comparison...")
current_cf_config = get_current_cf_config()
if current_cf_config is not None:
# Extract active hostnames from CF config (excluding 404 rule)
cf_ingress_hostnames = {r.get("hostname") for r in current_cf_config.get("ingress", [])
if r.get("hostname") and r.get("service") != "http_status:404"}
# Get active hostnames from our local state
active_managed_hostnames = {hn for hn, d in managed_rules.items() if d.get("status") == "active"}
# Compare the sets
if cf_ingress_hostnames != active_managed_hostnames:
logging.warning(f"[Reconcile] Mismatch detected between active managed rules ({len(active_managed_hostnames)}) and Cloudflare tunnel config ({len(cf_ingress_hostnames)})!")
logging.info(f"[Reconcile] Active Managed State Hostnames: {sorted(list(active_managed_hostnames))}")
logging.info(f"[Reconcile] Found in Cloudflare Tunnel Config: {sorted(list(cf_ingress_hostnames))}")
logging.info("[Reconcile] Marking for Cloudflare tunnel config update to enforce local state.")
needs_cf_update = True # Trigger update to align CF with local state
else:
# Failed to get CF config, cannot perform this comparison
logging.error("[Reconcile] Could not fetch Cloudflare config during reconciliation. Skipping final tunnel config comparison.")
# Save state if anything changed locally
if state_changed_locally:
logging.info("[Reconcile] Local state changed during reconciliation. Saving state file.")
save_state()
logging.debug("[Reconcile] Releasing state lock.")
# --- End Lock ---
# --- Trigger Updates (outside lock) ---
if needs_cf_update:
logging.info("[Reconcile] Triggering Cloudflare tunnel config update based on reconciliation results.")
if update_cloudflare_config():
# CF update successful, now check/create DNS for newly active rules
if hostnames_requiring_dns_check:
logging.info(f"[Reconcile] Checking/Creating DNS records for newly active/reactivated rules: {hostnames_requiring_dns_check}")
for hostname in hostnames_requiring_dns_check:
if tunnel_state.get("id") and CF_ZONE_ID:
if not create_cloudflare_dns_record(CF_ZONE_ID, hostname, tunnel_state["id"]):
# Log error if DNS fails after successful CF update
logging.error(f"[Reconcile] CRITICAL: Failed to ensure DNS record exists for {hostname} after successful tunnel config update.")
else:
logging.error(f"[Reconcile] Cannot check/create DNS for {hostname}: Missing Tunnel ID or Zone ID.")
else:
# CF update failed
logging.error("[Reconcile] Failed to update Cloudflare tunnel config during reconciliation. DNS checks for newly active rules skipped. CF may be out of sync.")
elif state_changed_locally:
# Only local state changed (e.g., container ID, rule marked pending)
logging.info("[Reconcile] Reconciliation resulted in local state changes only (no CF tunnel config update needed).")
else:
# No changes needed at all
logging.info("[Reconcile] No changes required by reconciliation.")
except Exception as e:
# Catch any unexpected errors during the entire reconciliation process
logging.error(f"Unexpected error during state reconciliation: {e}", exc_info=True)
finally:
logging.info("Reconciliation complete.")
# --- get_cloudflared_container ---
def get_cloudflared_container():
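"""Return the cloudflared agent container object, or None.

None covers both "container does not exist" and "Docker unreachable";
the error cases also record details in cloudflared_agent_state.
"""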
if not docker_client:
logging.warning("Docker client not available when trying to get cloudflared container.")
return None
try:
# Try to get the container by its defined name
container = docker_client.containers.get(CLOUDFLARED_CONTAINER_NAME)
return container
except NotFound:
# Container simply doesn't exist
logging.debug(f"Cloudflared container '{CLOUDFLARED_CONTAINER_NAME}' not found.")
return None
except APIError as e:
# Error communicating with Docker API (permissions, daemon issue)
logging.error(f"Docker API error getting container '{CLOUDFLARED_CONTAINER_NAME}': {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker API error getting agent: {e}"
return None
except requests.exceptions.ConnectionError as e:
# Error connecting to the Docker daemon socket
logging.error(f"Failed to connect to Docker daemon while getting container: {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker connection failed getting agent: {e}"
return None
except Exception as e:
# Catch any other unexpected errors
logging.error(f"Unexpected error getting container '{CLOUDFLARED_CONTAINER_NAME}': {e}", exc_info=True)
cloudflared_agent_state["last_action_status"] = f"Error: Unexpected error getting agent: {e}"
return None
# --- update_cloudflared_container_status ---
def update_cloudflared_container_status():
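"""Refresh cloudflared_agent_state["container_status"] from Docker,
reconnecting the Docker client first if it was previously unavailable."""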
global docker_client # Allow modification if reconnection occurs
if not docker_client:
logging.warning("Docker client unavailable, attempting to reconnect...")
try:
# Try to re-initialize the client
docker_client = docker.from_env(timeout=5)
docker_client.ping()
logging.info("Successfully reconnected to Docker daemon.")
# Reset status if it was previously unavailable
if cloudflared_agent_state["container_status"] == "docker_unavailable":
cloudflared_agent_state["container_status"] = "unknown" # Re-assess status
except Exception as e:
# Reconnection failed
logging.error(f"Failed to reconnect to Docker daemon: {e}")
if cloudflared_agent_state["container_status"] != "docker_unavailable":
logging.warning("Setting agent status to docker_unavailable.")
cloudflared_agent_state["container_status"] = "docker_unavailable"
docker_client = None # Ensure client is None if connection failed
return # Cannot proceed without client
# Try to get the container object
container = get_cloudflared_container()
if container:
try:
# Refresh container data from Docker daemon
container.reload()
new_status = container.status # e.g., 'running', 'exited', 'created'
# Update global state only if status actually changed
if cloudflared_agent_state["container_status"] != new_status:
logging.info(f"Cloudflared agent container status changed to: {new_status}")
cloudflared_agent_state["container_status"] = new_status
# Clear last action status if it becomes running
if new_status == 'running':
cloudflared_agent_state["last_action_status"] = None
except (NotFound, APIError) as e:
# Handle cases where container disappears or API error occurs during reload
if cloudflared_agent_state["container_status"] != "not_found":
logging.warning(f"Error reloading cloudflared container status (container likely removed or API issue): {e}")
cloudflared_agent_state["container_status"] = "not_found" # Or maybe 'error'?
cloudflared_agent_state["last_action_status"] = "Agent container disappeared or API error during status check."
except requests.exceptions.ConnectionError as e:
# Handle connection error specifically during reload
logging.error(f"Failed to connect to Docker daemon during status update: {e}")
cloudflared_agent_state["container_status"] = "docker_unavailable"
docker_client = None # Mark client as unusable
return
else:
# Container object couldn't be retrieved (might be 'not_found' or due to API/connection error)
current_status = cloudflared_agent_state.get("container_status", "unknown")
# Update status to 'not_found' only if it wasn't already known to be unavailable/not found
if current_status not in ["not_found", "docker_unavailable"]:
logging.info("Cloudflared agent container not found.")
cloudflared_agent_state["container_status"] = "not_found"
# --- ensure_docker_network_exists ---
def ensure_docker_network_exists(network_name):
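"""Return True if the named bridge network exists or was created here,
False on any Docker API, connection, or unexpected error."""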
if not docker_client:
logging.error("Docker client unavailable, cannot check/create network.")
return False
try:
# Check if network already exists
docker_client.networks.get(network_name)
logging.info(f"Docker network '{network_name}' already exists.")
return True
except NotFound:
# Network doesn't exist, try creating it
logging.info(f"Docker network '{network_name}' not found. Creating...")
try:
# Create a bridge network, check_duplicate handles race condition
docker_client.networks.create(network_name, driver="bridge", check_duplicate=True)
logging.info(f"Successfully created Docker network '{network_name}'.")
return True
except APIError as e:
# Handle specific API errors during creation
if "already exists" in str(e): # Check if created concurrently
logging.warning(f"Docker network '{network_name}' already exists (created concurrently?).")
return True
# Log other creation errors
logging.error(f"Failed to create Docker network '{network_name}': {e}", exc_info=True)
cloudflared_agent_state["last_action_status"] = f"Error creating network: {e}"
return False
except APIError as e:
# Handle errors during the initial 'get' check
logging.error(f"Error checking for Docker network '{network_name}': {e}", exc_info=True)
cloudflared_agent_state["last_action_status"] = f"Error checking network: {e}"
return False
except requests.exceptions.ConnectionError as e:
# Handle connection errors
logging.error(f"Failed to connect to Docker daemon checking network '{network_name}': {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker connection failed checking network."
return False
except Exception as e:
# Handle any other unexpected errors
logging.error(f"Unexpected error checking/creating Docker network '{network_name}': {e}", exc_info=True)
cloudflared_agent_state["last_action_status"] = f"Error: Unexpected error checking network: {e}"
return False
# --- start_cloudflared_container ---
def start_cloudflared_container():
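"""Ensure the cloudflared agent container exists and is running.

Reuses a correctly configured container, recreates one attached to the
wrong network, and otherwise pulls the image and creates a fresh container.
Returns True on success (including "already running"), False otherwise.
"""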
logging.info(f"Attempting to start cloudflared agent container '{CLOUDFLARED_CONTAINER_NAME}'...")
cloudflared_agent_state["last_action_status"] = "Starting..." # Update UI status
success_flag = False # Track overall success
try:
# --- Prerequisites ---
if not docker_client:
msg = "Docker client not available."; logging.error(msg); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; return False
if not tunnel_state.get("token"):
msg = "Tunnel token not available."; logging.error(msg); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; return False
# Ensure the target Docker network exists
# Moved network check here, as it's essential before starting
if not ensure_docker_network_exists(CLOUDFLARED_NETWORK_NAME):
# Error message already set by ensure_docker_network_exists
logging.error(f"Failed to ensure Docker network '{CLOUDFLARED_NETWORK_NAME}' exists. Cannot start agent.")
return False
token = tunnel_state["token"] # Get the tunnel token
# --- Check Existing Container ---
container = get_cloudflared_container()
needs_recreate = False # Flag if existing container is misconfigured
if container:
try:
container.reload() # Refresh container state
logging.info(f"Found existing container '{CLOUDFLARED_CONTAINER_NAME}' with status: {container.status}")
# Check if already running
if container.status == 'running':
msg = f"Container '{CLOUDFLARED_CONTAINER_NAME}' is already running."; logging.info(msg); cloudflared_agent_state["last_action_status"] = msg; success_flag = True; return True # Already running, success!
# --- Configuration Check ---
# Check if container is on the correct network
container_networks = container.attrs.get('NetworkSettings', {}).get('Networks', {})
is_on_correct_network = CLOUDFLARED_NETWORK_NAME in container_networks
# Check if using host network mode (which we don't want).
# Note: when a container is created with the `network` parameter, docker-py
# records that network's name in HostConfig.NetworkMode, so comparing against
# CLOUDFLARED_NETWORK_NAME here would wrongly flag a correctly configured
# container for recreation; only the literal 'host' mode needs recreation.
network_mode = container.attrs.get('HostConfig', {}).get('NetworkMode', 'default')
is_host_network = network_mode == 'host'
if is_host_network:
logging.warning(f"Existing container '{CLOUDFLARED_CONTAINER_NAME}' is in 'host' network mode. Needs recreation on bridge network '{CLOUDFLARED_NETWORK_NAME}'.")
needs_recreate = True
elif not is_on_correct_network:
logging.warning(f"Existing container '{CLOUDFLARED_CONTAINER_NAME}' is not connected to the desired network '{CLOUDFLARED_NETWORK_NAME}'. Current networks: {list(container_networks.keys())}. Needs recreation.")
needs_recreate = True
# Optional: Add checks for command, image, restart policy if desired
if needs_recreate:
# Remove the misconfigured container
logging.info(f"Removing misconfigured container '{CLOUDFLARED_CONTAINER_NAME}' before creating a new one.")
try:
container.remove(force=True) # Force remove even if stopped uncleanly
container = None # Mark as removed
except (APIError, requests.exceptions.ConnectionError) as rm_err:
logging.error(f"Failed to remove misconfigured container: {rm_err}. Creation is skipped to avoid a name conflict; manual cleanup may be required.")
cloudflared_agent_state["last_action_status"] = f"Error: Failed to remove misconfigured agent container: {rm_err}"
# Keep the container reference non-None so the creation block below is
# skipped and the function returns failure instead of hitting a name conflict.
else:
# Existing container is correctly configured but stopped, just start it
logging.info(f"Starting existing correctly configured container '{CLOUDFLARED_CONTAINER_NAME}'...");
container.start()
msg = f"Started existing container '{CLOUDFLARED_CONTAINER_NAME}'."; cloudflared_agent_state["last_action_status"] = msg; logging.info(msg); success_flag = True
# Skip creation logic below
except (NotFound, APIError) as e:
# Error checking existing container (e.g., disappeared between get and reload)
logging.warning(f"Error checking existing container '{CLOUDFLARED_CONTAINER_NAME}': {e}. Assuming it needs creation.")
container = None # Treat as if not found
except requests.exceptions.ConnectionError as e:
logging.error(f"Failed to connect to Docker daemon checking existing container: {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker connection failed checking agent."
return False
# --- Create Container (if needed) ---
if not container and not success_flag: # Only create if not found/removed and not already started
logging.info(f"Container '{CLOUDFLARED_CONTAINER_NAME}' not found or needs recreation. Creating...")
try:
# Pull the latest image (optional, but good practice)
try:
logging.info(f"Pulling image {CLOUDFLARED_IMAGE}...");
docker_client.images.pull(CLOUDFLARED_IMAGE)
logging.info(f"Successfully pulled {CLOUDFLARED_IMAGE} (or it was up-to-date).")
except APIError as img_err:
# Log warning but proceed, Docker run will attempt pull anyway
logging.warning(f"Could not pull image {CLOUDFLARED_IMAGE}: {img_err}. Docker run will attempt to pull.")
except requests.exceptions.ConnectionError as e:
# Fail fast if Docker connection lost during pull
logging.error(f"Failed to connect to Docker daemon during image pull: {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker connection failed pulling image."
return False
# Define container parameters
container_params = {
"image": CLOUDFLARED_IMAGE,
"command": f"tunnel --no-autoupdate run --token {token}",
"name": CLOUDFLARED_CONTAINER_NAME,
"network": CLOUDFLARED_NETWORK_NAME, # Connect to our specific network
"restart_policy": {"Name": "unless-stopped"}, # Restart if crashes/host reboots
"detach": True, # Run in background
"remove": False, # Keep container filesystem after stop (for logs etc.)
"labels": {"managed-by": "cloudflare-tunnel-ingress-controller"} # Identify container
# Optional: Add volume mounts if needed for config files (though token is preferred)
# "volumes": { ... }
}
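# Note: the token appears in the container's command line above, so it is
# visible via `docker inspect`. A possible alternative (not what this code
# currently does) is to pass it through the environment instead, which
# cloudflared also accepts:
# "environment": {"TUNNEL_TOKEN": token},
# "command": "tunnel --no-autoupdate run",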
# Run the container
new_container = docker_client.containers.run(**container_params)
msg = f"Created and started container '{new_container.name}' ({new_container.id[:12]}) on network '{CLOUDFLARED_NETWORK_NAME}'."; cloudflared_agent_state["last_action_status"] = msg; logging.info(msg); success_flag = True
except APIError as create_err:
# Handle specific creation errors, especially name conflicts
if "is already in use" in str(create_err):
logging.error(f"Container name '{CLOUDFLARED_CONTAINER_NAME}' is already in use by another container. This might happen if removal of a misconfigured container failed.")
# Attempt to find and log the conflicting container ID
try:
stale_container = docker_client.containers.get(CLOUDFLARED_CONTAINER_NAME)
logging.error(f"Conflicting container ID: {stale_container.id[:12]}")
msg = f"Error: Container name conflict with existing container {stale_container.id[:12]}. Please remove it manually and retry."
except (NotFound, APIError, requests.exceptions.ConnectionError):
msg = f"Error: Container name conflict, but failed to get conflicting container details. {create_err}"
else:
# Other API errors during creation
msg = f"Docker API error creating container: {create_err}"; logging.error(msg, exc_info=True)
cloudflared_agent_state["last_action_status"] = msg; success_flag = False
except requests.exceptions.ConnectionError as e:
# Handle connection error during run
logging.error(f"Failed to connect to Docker daemon during container run: {e}")
cloudflared_agent_state["last_action_status"] = f"Error: Docker connection failed running agent."
success_flag = False
# --- Catch General Errors ---
except APIError as e:
# Catch API errors not caught in specific blocks above
msg = f"Docker API error during start sequence: {e}"; logging.error(msg, exc_info=True); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
except requests.exceptions.ConnectionError as e:
# Catch connection errors not caught above
msg = f"Failed to connect to Docker daemon during start sequence: {e}"; logging.error(msg); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
except Exception as e:
# Catch any other unexpected errors
msg = f"Unexpected error starting container: {e}"; logging.error(msg, exc_info=True); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
# --- Final Status Update ---
finally:
if docker_client:
logging.debug("Updating container status after start attempt...")
# Give agent a moment to potentially start/stabilize before checking status
time.sleep(2)
update_cloudflared_container_status()
logging.info(f"Exiting start_cloudflared_container function (Success: {success_flag}).")
return success_flag
# --- stop_cloudflared_container ---
def stop_cloudflared_container():
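"""Stop the cloudflared agent container gracefully (30s timeout).

Returns True if it was stopped, already stopped, or absent; False on error.
"""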
logging.info(f"Attempting to stop cloudflared agent container '{CLOUDFLARED_CONTAINER_NAME}'...")
cloudflared_agent_state["last_action_status"] = "Stopping..."
success_flag = False
try:
# Check Docker client
if not docker_client:
msg = "Docker client not available."; logging.error(msg); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; return False
# Get the container
container = get_cloudflared_container()
if not container:
# Already stopped or never existed
msg = f"Container '{CLOUDFLARED_CONTAINER_NAME}' not found, cannot stop (already stopped?)."; logging.warning(msg); cloudflared_agent_state["last_action_status"] = msg; success_flag = True; return True
# Reload to get current status
container.reload()
if container.status != 'running':
# Not running, nothing to do
msg = f"Container '{CLOUDFLARED_CONTAINER_NAME}' is not running (status: {container.status}). No action needed."; logging.info(msg); cloudflared_agent_state["last_action_status"] = msg; success_flag = True; return True
# Stop the running container
logging.info(f"Stopping running container '{CLOUDFLARED_CONTAINER_NAME}'...");
container.stop(timeout=30) # Give it 30 seconds to stop gracefully
msg = f"Successfully stopped container '{CLOUDFLARED_CONTAINER_NAME}'."; cloudflared_agent_state["last_action_status"] = msg; logging.info(msg); success_flag = True
except (APIError, NotFound) as e: # Catch API errors or if container disappears during stop
msg = f"Docker API error stopping container: {e}"; logging.error(msg, exc_info=True); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
except requests.exceptions.ConnectionError as e:
msg = f"Failed to connect to Docker daemon stopping container: {e}"; logging.error(msg); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
except Exception as e:
msg = f"Unexpected error stopping container: {e}"; logging.error(msg, exc_info=True); cloudflared_agent_state["last_action_status"] = f"Error: {msg}"; success_flag = False
finally:
# Update status after attempting stop
if docker_client:
logging.debug("Updating container status after stop attempt..."); time.sleep(2); update_cloudflared_container_status()
logging.info(f"Exiting stop_cloudflared_container function (Success: {success_flag}).")
return success_flag
# --- Flask App Setup ---
app = Flask(__name__) # Flask will automatically look for templates in a 'templates' folder
app.secret_key = os.urandom(24) # Needed for flash messages or sessions if used later
# --- get_display_token ---
def get_display_token(token):
"""Masks the token for display purposes."""
if not token: return "Not available"
# Show first 5 and last 5 characters
return f"{token[:5]}...{token[-5:]}" if len(token) > 10 else "Token retrieved (short)"
# --- status_page ---
@app.route('/')
def status_page():
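"""Render the status page from a lock-protected snapshot of current state."""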
# Always update status before rendering
update_cloudflared_container_status()
# Prepare data for the template, ensuring thread safety
with state_lock:
# Copy the data handed to the template so rendering never mutates live state.
# A shallow copy per rule is sufficient here (values are scalars/datetimes),
# and datetime objects are passed through directly for Jinja to format.
rules_for_template = {}
for hn, rule in managed_rules.items():
rules_for_template[hn] = rule.copy()
template_tunnel_state = tunnel_state.copy()
template_agent_state = cloudflared_agent_state.copy()
display_token = get_display_token(template_tunnel_state.get("token"))
docker_available = docker_client is not None
# Use render_template to load and render the HTML file
return render_template('status_page.html', # Points to templates/status_page.html
tunnel_state=template_tunnel_state,
agent_state=template_agent_state,
display_token=display_token,
cloudflared_container_name=CLOUDFLARED_CONTAINER_NAME,
docker_available=docker_available,
rules=rules_for_template) # Pass the prepared rules
# --- start_tunnel ---
@app.route('/start', methods=['POST'])
def start_tunnel():
logging.info("Received request to start tunnel agent via UI.")
start_cloudflared_container()
# Optional: Add flash message here for feedback
time.sleep(1) # Give status update a moment
return redirect(url_for('status_page'))
# --- stop_tunnel ---
@app.route('/stop', methods=['POST'])
def stop_tunnel():
logging.info("Received request to stop tunnel agent via UI.")
stop_cloudflared_container()
# Optional: Add flash message here for feedback
time.sleep(1) # Give status update a moment
return redirect(url_for('status_page'))
# --- force_delete_rule ---
@app.route('/force_delete/<hostname>', methods=['POST'])
def force_delete_rule(hostname):
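"""Immediately remove a managed rule: delete its DNS record, drop it from
local state, then push the updated tunnel config to Cloudflare."""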
logging.info(f"Received request to force delete rule for hostname: {hostname}")
rule_removed_from_state = False
dns_delete_success = False
# Step 1: Delete DNS record immediately
if tunnel_state.get("id") and CF_ZONE_ID:
logging.info(f"Attempting DNS record deletion for force-deleted rule: {hostname}")
dns_delete_success = delete_cloudflare_dns_record(CF_ZONE_ID, hostname, tunnel_state["id"])
if not dns_delete_success:
# Log error but continue, as user requested force delete
logging.error(f"Failed to delete DNS record for {hostname} during force delete. Tunnel config update will proceed, but DNS record may remain stale.")
# Update UI status (can be improved with flash messages)
cloudflared_agent_state["last_action_status"] = f"Warning: Failed deleting DNS record for {hostname}. Tunnel update proceeding."
else:
logging.error(f"Cannot delete DNS for {hostname}: Missing Tunnel ID or Zone ID.")
cloudflared_agent_state["last_action_status"] = f"Error: Cannot delete DNS for {hostname} (missing config)."
# Proceed with state removal, but DNS couldn't be touched
# Step 2: Remove rule from local state
with state_lock:
if hostname in managed_rules:
logging.info(f"Force deleting rule for {hostname} from local state.")
del managed_rules[hostname]
rule_removed_from_state = True
save_state() # Save state immediately after removal
else:
# Rule might have been deleted by cleanup task already
logging.warning(f"Attempted force delete for hostname '{hostname}', but it was not found in managed rules (perhaps already deleted or cleaned up).")
# Treat as success in terms of state removal
rule_removed_from_state = True
# Step 3: Trigger Cloudflare config update to remove the rule
if rule_removed_from_state: # Only update CF if state was actually changed or rule confirmed gone
logging.info(f"Triggering Cloudflare tunnel config update after force deleting {hostname} (or confirming removal).")
if update_cloudflare_config():
logging.info(f"Cloudflare tunnel config update successful after force deleting {hostname}.")
# Set final status message based on DNS success
if dns_delete_success:
cloudflared_agent_state["last_action_status"] = f"Successfully force deleted rule and DNS record for {hostname} and updated Cloudflare."
else:
# DNS failed earlier, but state and CF updated
cloudflared_agent_state["last_action_status"] = f"Force deleted rule for {hostname} (DNS delete failed/skipped earlier, but tunnel config updated)."
else:
# This is a bad state: State removed, DNS maybe removed, CF update FAILED
logging.error(f"CRITICAL: State saved after force delete of {hostname}, DNS delete status: {dns_delete_success}, but subsequent Cloudflare tunnel config update FAILED!")
cloudflared_agent_state["last_action_status"] = f"Error: Removed {hostname} locally, DNS delete status: {dns_delete_success}, but FAILED pushing tunnel config update! Reconciliation needed."
time.sleep(1) # Allow UI status to potentially update
return redirect(url_for('status_page'))
# --- run_background_tasks ---
def run_background_tasks():
"""Starts the Docker event listener and cleanup threads."""
if not docker_client:
logging.warning("Docker client not available. Background tasks (event listener, cleanup) will not start.")
return None, None
if not tunnel_state.get("id"):
logging.warning("Tunnel not initialized. Background tasks (event listener, cleanup) will not start.")
return None, None
logging.info("Starting background threads for Docker events and rule cleanup.")
event_thread = threading.Thread(target=docker_event_listener, name="DockerEventListener", daemon=True)
cleanup_thread = threading.Thread(target=cleanup_expired_rules, name="CleanupTask", daemon=True)
event_thread.start()
cleanup_thread.start()
logging.info("Background threads started.")
return event_thread, cleanup_thread
# --- Main Execution ---
if __name__ == '__main__':
logging.info("----------------------------------------------------")
logging.info("--- Cloudflare Tunnel Ingress Manager Starting ---")
logging.info("----------------------------------------------------")
# Load initial state from file
load_state()
logging.info("Initial state loading complete.")
event_thread = None
cleanup_thread = None
# --- Critical Pre-checks ---
if not CF_ZONE_ID:
logging.error("FATAL: CF_ZONE_ID environment variable is missing. DNS management will fail.")
# Update state for UI feedback before exiting
tunnel_state["status_message"] = "Error: CF_ZONE_ID missing."
tunnel_state["error"] = "CF_ZONE_ID environment variable must be set."
# Render a minimal error page or just exit? Exiting seems appropriate.
sys.exit(1) # Exit if critical config missing
if not docker_client:
# Docker client failed to connect at startup
logging.error("Docker client is unavailable at startup. Limited functionality.")
tunnel_state["status_message"] = "Error: Docker client unavailable."
tunnel_state["error"] = "Failed to connect to Docker daemon. Check socket mount and permissions."
cloudflared_agent_state["container_status"] = "docker_unavailable"
logging.warning("Skipping tunnel initialization, reconciliation, agent management, and background tasks due to Docker connection failure.")
# Continue to run Flask to show the error state? Yes.
else:
# --- Normal Startup Flow ---
logging.info("Docker client available.")
# Ensure network exists early - Temporarily Commented Out
logging.info(f"Ensuring Docker network '{CLOUDFLARED_NETWORK_NAME}' exists... (Check deferred)")
# ensure_docker_network_exists(CLOUDFLARED_NETWORK_NAME) # <-- Temporarily commented out
# Initialize Cloudflare Tunnel (find or create, get token)
# ADDED DEBUG LOGGING HERE (Keep this line for now)
logging.info("[DEBUG] >>> About to call initialize_tunnel()...")
initialize_tunnel()
logging.info(f"Tunnel initialization process complete. Status: {tunnel_state.get('status_message')}")
logging.debug(f"Tunnel State after init: ID={tunnel_state.get('id')}, Token Present={bool(tunnel_state.get('token'))}, Error={tunnel_state.get('error')}")
# Proceed only if tunnel setup was successful
if tunnel_state.get("id") and tunnel_state.get("token"):
logging.info("Tunnel initialized successfully. Proceeding with reconciliation and agent checks.")
# Run initial reconciliation to sync state
reconcile_state()
logging.info("Initial state reconciliation complete.")
# Check agent status and start if needed
logging.info("Checking and attempting to automatically start tunnel agent container if needed...")
update_cloudflared_container_status() # Get current status
if cloudflared_agent_state.get("container_status") != 'running':
logging.info("Agent container not running, attempting auto-start...")
start_cloudflared_container() # Try to start it
else:
logging.info("Agent container already running.")
# Start background tasks (event listener, cleanup)
event_thread, cleanup_thread = run_background_tasks()
else:
# Tunnel setup failed, log warning and skip dependent steps
logging.warning("Tunnel not fully initialized (missing ID or Token). Skipping reconciliation, agent start, and background tasks.")
# Ensure status message reflects this if no specific error was set
if not tunnel_state.get("error"):
tunnel_state["status_message"] = "Tunnel setup incomplete (ID/Token missing)."
# --- Start Web Server ---
logging.info("Starting Flask application web server...")
flask_thread = None
try:
# Use Waitress for a more production-ready server than Flask's default
from waitress import serve
# Run Waitress in a separate thread so main thread can monitor
flask_thread = threading.Thread(
target=serve,
args=(app,),
kwargs={'host':'0.0.0.0','port':5000},
daemon=True, # Allow main thread to exit even if this thread is running
name="FlaskWaitressServer"
)
flask_thread.start()
logging.info("Flask server started using waitress on 0.0.0.0:5000 in a background thread.")
# Keep main thread alive to monitor background tasks and handle shutdown
while True:
all_threads_alive = True
if flask_thread and not flask_thread.is_alive():
logging.error("Flask server thread terminated unexpectedly.")
all_threads_alive = False
# Check background tasks only if they were expected to start
if event_thread and not event_thread.is_alive():
logging.warning("Docker event listener thread terminated unexpectedly.")
# Optionally restart? For now, just log.
if cleanup_thread and not cleanup_thread.is_alive():
logging.warning("Cleanup thread terminated unexpectedly.")
# Optionally restart? For now, just log.
if not all_threads_alive:
logging.error("A critical thread terminated. Initiating shutdown.")
stop_event.set() # Signal other threads to stop
break # Exit main loop
if stop_event.is_set(): # Check if shutdown was initiated elsewhere
logging.info("Stop event detected in main loop.")
break
time.sleep(10) # Check thread status periodically
except ImportError:
logging.warning("Waitress not found. Falling back to Flask development server (use 'pip install waitress' for production).")
# Run Flask's built-in server directly (blocks main thread)
app.run(host='0.0.0.0', port=5000) # Note: Not suitable for production
except KeyboardInterrupt:
logging.info("KeyboardInterrupt received.")
except Exception as server_err:
logging.error(f"Web server encountered a fatal error: {server_err}", exc_info=True)
finally:
# --- Shutdown Sequence ---
logging.info("Shutdown sequence initiated...")
stop_event.set() # Signal background threads to stop gracefully
logging.info("Stop event set for background threads.")
# Optional: Wait briefly for threads to exit?
# if event_thread: event_thread.join(timeout=5)
# if cleanup_thread: cleanup_thread.join(timeout=5)
logging.info("Exiting Cloudflare Tunnel Ingress Manager application.")
# Determine exit code based on final state
exit_code = 0
if tunnel_state.get("error") or cloudflared_agent_state.get("container_status") == "docker_unavailable":
exit_code = 1 # Exit with error if critical issues exist
sys.exit(exit_code)