mirror of
https://github.com/ruvnet/RuView.git
synced 2026-04-26 13:10:40 +00:00
fix(firmware): defensive node_id capture prevents runtime clobber (#390)
Users on multi-node ESP32 deployments have been reporting for months
that their provisioned `node_id` reverts to the Kconfig default of `1`
in UDP frames and the `csi_collector` init log, despite boot showing:
nvs_config: NVS override: node_id=4
main: ESP32-S3 CSI Node (ADR-018) - Node ID: 4
csi_collector: CSI collection initialized (node_id=1, channel=11)
See #232, #375, #385, #386, #390. The root memory-corruption path for
the `g_nvs_config.node_id` byte has not been definitively isolated
(does not reproduce on my attached ESP32-S3 running current source
and the v0.6.0 release binary), but the UDP frame header can be made
tamper-proof regardless:
1. `csi_collector_init()` now captures `g_nvs_config.node_id` into a
module-local `static uint8_t s_node_id` at init time.
2. `csi_serialize_frame()` reads `buf[4]` from `s_node_id`, not from
the global - so any later corruption of `g_nvs_config` cannot
affect outgoing CSI frames.
3. All other consumers (`edge_processing.c` x3, `wasm_runtime.c`,
`display_ui.c`, `main.c swarm_bridge_init`) now go through a new
`csi_collector_get_node_id()` accessor instead of reading the
global directly.
4. A canary at end-of-init logs `WARN` if `g_nvs_config.node_id`
already diverges from the captured value - this will pinpoint
the corruption path if it happens on a user's device.
Hardware validation on attached ESP32-S3 (COM8):
- NVS loads node_id=2
- Boot log: `main: ... Node ID: 2`
- NEW log: `csi_collector: Captured node_id=2 at init (defensive
copy for #232/#375/#385/#390)`
- Init log: `csi_collector: CSI collection initialized (node_id=2)`
- UDP frame byte[4] = 2 (verified via socket sniffer, 15/15 packets)
This is defense in depth - it shields the UDP frame from whatever
upstream bug is clobbering the struct. When a user hits the original
bug, the canary WARN will help isolate the root cause.
Refs #232 #375 #385 #386 #390
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
6e015c4626
commit
425f0e6aac
7 changed files with 58 additions and 10 deletions
|
|
@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
### Fixed
|
||||
- **`provision.py` esptool v5 compat** (#391) — Stale `write_flash` (underscore) syntax in the dry-run manual-flash hint now uses `write-flash` (hyphenated) for esptool >= 5.x. The primary flash command was already correct.
|
||||
- **`provision.py` silent NVS wipe** (#391) — The script replaces the entire `csi_cfg` NVS namespace on every run, so partial invocations were silently erasing WiFi credentials and causing `Retrying WiFi connection (10/10)` in the field. Now refuses to run without `--ssid`, `--password`, and `--target-ip` unless `--force-partial` is passed. `--force-partial` prints a warning listing which keys will be wiped.
|
||||
- **Firmware: defensive `node_id` capture** (#232, #375, #385, #386, #390) — Users on multi-node deployments reported `node_id` reverting to the Kconfig default (`1`) in UDP frames and in the `csi_collector` init log, despite NVS loading the correct value. The root cause (memory corruption of `g_nvs_config`) has not been definitively isolated, but the UDP frame header is now tamper-proof: `csi_collector_init()` captures `g_nvs_config.node_id` into a module-local `s_node_id` once, and `csi_serialize_frame()` plus all other consumers (`edge_processing.c`, `wasm_runtime.c`, `display_ui.c`, `swarm_bridge_init`) read it via the new `csi_collector_get_node_id()` accessor. A canary logs `WARN` if `g_nvs_config.node_id` diverges from `s_node_id` at end-of-init, helping isolate the upstream corruption path. Validated on attached ESP32-S3 (COM8): NVS `node_id=2` propagates through boot log, capture log, init log, and byte[4] of every UDP frame.
|
||||
|
||||
### Docs
|
||||
- **CHANGELOG catch-up** (#367) — Added missing entries for v0.5.5, v0.6.0, and v0.7.0 releases.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,14 @@
|
|||
/* ADR-060: Access the global NVS config for MAC filter and channel override. */
|
||||
extern nvs_config_t g_nvs_config;
|
||||
|
||||
/* Defensive fix (#232, #375, #385, #386, #390): capture node_id at init-time
|
||||
* into a module-local static. Using the global g_nvs_config.node_id directly
|
||||
* at every callback is vulnerable to any memory corruption that clobbers the
|
||||
* struct (which users have reported reverting node_id to the Kconfig default
|
||||
* of 1). The local copy is set once at csi_collector_init() and then used
|
||||
* exclusively by csi_serialize_frame(). */
|
||||
static uint8_t s_node_id = 1;
|
||||
|
||||
/* ADR-057: Build-time guard — fail early if CSI is not enabled in sdkconfig.
|
||||
* Without this, the firmware compiles but crashes at runtime with:
|
||||
* "E (xxxx) wifi:CSI not enabled in menuconfig!"
|
||||
|
|
@ -117,8 +125,9 @@ size_t csi_serialize_frame(const wifi_csi_info_t *info, uint8_t *buf, size_t buf
|
|||
uint32_t magic = CSI_MAGIC;
|
||||
memcpy(&buf[0], &magic, 4);
|
||||
|
||||
/* Node ID (from NVS runtime config, not compile-time Kconfig) */
|
||||
buf[4] = g_nvs_config.node_id;
|
||||
/* Node ID (captured at init into s_node_id to survive memory corruption
|
||||
* that could clobber g_nvs_config.node_id - see #232/#375/#385/#390). */
|
||||
buf[4] = s_node_id;
|
||||
|
||||
/* Number of antennas */
|
||||
buf[5] = n_antennas;
|
||||
|
|
@ -215,6 +224,13 @@ static void wifi_promiscuous_cb(void *buf, wifi_promiscuous_pkt_type_t type)
|
|||
|
||||
void csi_collector_init(void)
|
||||
{
|
||||
/* Capture node_id into module-local static at init time. After this point
|
||||
* csi_serialize_frame() uses s_node_id exclusively, isolating the UDP
|
||||
* frame node_id field from any memory corruption of g_nvs_config. */
|
||||
s_node_id = g_nvs_config.node_id;
|
||||
ESP_LOGI(TAG, "Captured node_id=%u at init (defensive copy for #232/#375/#385/#390)",
|
||||
(unsigned)s_node_id);
|
||||
|
||||
/* ADR-060: Determine the CSI channel.
|
||||
* Priority: 1) NVS override (--channel), 2) connected AP channel, 3) Kconfig default. */
|
||||
uint8_t csi_channel = (uint8_t)CONFIG_CSI_WIFI_CHANNEL;
|
||||
|
|
@ -272,8 +288,24 @@ void csi_collector_init(void)
|
|||
g_nvs_config.filter_mac[4], g_nvs_config.filter_mac[5]);
|
||||
}
|
||||
|
||||
ESP_LOGI(TAG, "CSI collection initialized (node_id=%d, channel=%u)",
|
||||
g_nvs_config.node_id, (unsigned)csi_channel);
|
||||
ESP_LOGI(TAG, "CSI collection initialized (node_id=%u, channel=%u)",
|
||||
(unsigned)s_node_id, (unsigned)csi_channel);
|
||||
|
||||
/* Clobber-detection canary: if g_nvs_config.node_id no longer matches the
|
||||
* value we captured, something corrupted the struct between nvs_config_load
|
||||
* and here. This is the historic #232/#375 symptom. */
|
||||
if (g_nvs_config.node_id != s_node_id) {
|
||||
ESP_LOGW(TAG, "node_id clobber detected: captured=%u but g_nvs_config=%u "
|
||||
"(frames will use captured value %u). Please report to #390.",
|
||||
(unsigned)s_node_id, (unsigned)g_nvs_config.node_id,
|
||||
(unsigned)s_node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/* Accessor for other modules that need the authoritative runtime node_id. */
|
||||
uint8_t csi_collector_get_node_id(void)
|
||||
{
|
||||
return s_node_id;
|
||||
}
|
||||
|
||||
/* ---- ADR-029: Channel hopping ---- */
|
||||
|
|
|
|||
|
|
@ -29,6 +29,18 @@
|
|||
*/
|
||||
void csi_collector_init(void);
|
||||
|
||||
/**
|
||||
* Get the runtime node_id captured at csi_collector_init().
|
||||
*
|
||||
* This is a defensive copy of g_nvs_config.node_id taken at init time. Other
|
||||
* modules (edge_processing, wasm_runtime, display_ui) should prefer this
|
||||
* accessor over reading g_nvs_config.node_id directly, because the global
|
||||
* struct can be clobbered by memory corruption (see #232, #375, #385, #390).
|
||||
*
|
||||
* @return Node ID (0-255) as loaded from NVS or Kconfig default at boot.
|
||||
*/
|
||||
uint8_t csi_collector_get_node_id(void);
|
||||
|
||||
/**
|
||||
* Serialize CSI data into ADR-018 binary frame format.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
#include "display_ui.h"
|
||||
#include "nvs_config.h"
|
||||
#include "csi_collector.h" /* csi_collector_get_node_id() - defensive #390 */
|
||||
#include "sdkconfig.h"
|
||||
|
||||
extern nvs_config_t g_nvs_config;
|
||||
|
|
@ -350,7 +351,7 @@ void display_ui_update(void)
|
|||
{
|
||||
char buf[48];
|
||||
|
||||
snprintf(buf, sizeof(buf), "Node: %d", g_nvs_config.node_id);
|
||||
snprintf(buf, sizeof(buf), "Node: %u", (unsigned)csi_collector_get_node_id());
|
||||
lv_label_set_text(s_sys_node, buf);
|
||||
|
||||
snprintf(buf, sizeof(buf), "Heap: %lu KB free",
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include "edge_processing.h"
|
||||
#include "nvs_config.h"
|
||||
#include "csi_collector.h" /* csi_collector_get_node_id() - defensive #390 */
|
||||
#include "mmwave_sensor.h"
|
||||
|
||||
/* Runtime config — declared in main.c, loaded from NVS at boot. */
|
||||
|
|
@ -441,7 +442,7 @@ static void send_compressed_frame(const uint8_t *iq_data, uint16_t iq_len,
|
|||
uint32_t magic = EDGE_COMPRESSED_MAGIC;
|
||||
memcpy(&pkt[0], &magic, 4);
|
||||
|
||||
pkt[4] = g_nvs_config.node_id;
|
||||
pkt[4] = csi_collector_get_node_id(); /* #390: defensive copy */
|
||||
pkt[5] = channel;
|
||||
memcpy(&pkt[6], &iq_len, 2);
|
||||
memcpy(&pkt[8], &comp_len, 2);
|
||||
|
|
@ -557,7 +558,7 @@ static void send_vitals_packet(void)
|
|||
memset(&pkt, 0, sizeof(pkt));
|
||||
|
||||
pkt.magic = EDGE_VITALS_MAGIC;
|
||||
pkt.node_id = g_nvs_config.node_id;
|
||||
pkt.node_id = csi_collector_get_node_id(); /* #390: defensive copy */
|
||||
|
||||
pkt.flags = 0;
|
||||
if (s_presence_detected) pkt.flags |= 0x01;
|
||||
|
|
@ -647,7 +648,7 @@ static void send_feature_vector(void)
|
|||
memset(&pkt, 0, sizeof(pkt));
|
||||
|
||||
pkt.magic = EDGE_FEATURE_MAGIC;
|
||||
pkt.node_id = g_nvs_config.node_id;
|
||||
pkt.node_id = csi_collector_get_node_id(); /* #390: defensive copy */
|
||||
pkt.reserved = 0;
|
||||
pkt.seq = s_feature_seq++;
|
||||
pkt.timestamp_us = esp_timer_get_time();
|
||||
|
|
|
|||
|
|
@ -267,7 +267,7 @@ void app_main(void)
|
|||
strncpy(swarm_cfg.seed_url, g_nvs_config.seed_url, sizeof(swarm_cfg.seed_url) - 1);
|
||||
strncpy(swarm_cfg.seed_token, g_nvs_config.seed_token, sizeof(swarm_cfg.seed_token) - 1);
|
||||
strncpy(swarm_cfg.zone_name, g_nvs_config.zone_name, sizeof(swarm_cfg.zone_name) - 1);
|
||||
swarm_ret = swarm_bridge_init(&swarm_cfg, g_nvs_config.node_id);
|
||||
swarm_ret = swarm_bridge_init(&swarm_cfg, csi_collector_get_node_id());
|
||||
if (swarm_ret != ESP_OK) {
|
||||
ESP_LOGW(TAG, "Swarm bridge init failed: %s", esp_err_to_name(swarm_ret));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
#include "sdkconfig.h"
|
||||
#include "wasm_runtime.h"
|
||||
#include "nvs_config.h"
|
||||
#include "csi_collector.h" /* csi_collector_get_node_id() - defensive #390 */
|
||||
|
||||
extern nvs_config_t g_nvs_config;
|
||||
|
||||
|
|
@ -383,7 +384,7 @@ static void send_wasm_output(uint8_t slot_id)
|
|||
memset(&pkt, 0, sizeof(pkt));
|
||||
|
||||
pkt.magic = WASM_OUTPUT_MAGIC;
|
||||
pkt.node_id = g_nvs_config.node_id;
|
||||
pkt.node_id = csi_collector_get_node_id(); /* #390: defensive copy */
|
||||
pkt.module_id = slot_id;
|
||||
pkt.event_count = n_filtered;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue