mtmd-debug: add color and rainbow mode (#23829)

* mtmd-debug: add color and rainbow mode
* fix M_PI
* max_dist
2026-05-30 20:33:39 +00:00 · 2026-05-28 20:59:14 +02:00 · 2026-05-28 20:59:14 +02:00 · 751ebd17a5
commit 751ebd17a5
parent c8914ad4f4
2 changed files with 98 additions and 0 deletions
--- a/tools/mtmd/debug/mtmd-debug.cpp
+++ b/tools/mtmd/debug/mtmd-debug.cpp
@ -30,7 +30,9 @@ static void show_additional_info(int /*argc*/, char ** argv) {
        "    -p \"encode\" (debugging encode pass, default case):\n"
        "        --image can be:\n"
        "          \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"red\", \"green\", \"blue\": filled with respective colors\n"
        "          \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n"
+        "          \"rainbow\": raspberry-pi-like rainbow pattern\n"
        "        --audio can be:\n"
        "          \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
        "          \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n"
@ -144,6 +146,65 @@ int main(int argc, char ** argv) {
                    image[y][x * 3 + 2] = v;
                }
            }
+        } else if (input == "red") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 0] = 1.0f;  // R channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "green") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 1] = 1.0f;  // G channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "blue") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 2] = 1.0f;  // B channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "rainbow") {
+            for (int i = 0; i < inp_size; ++i) {
+                image.push_back(std::vector<float>(inp_size * 3, 0.0f));
+            }
+            float cx = inp_size / 2.0f;
+            float cy = inp_size / 2.0f;
+            float max_dist = std::sqrt(cx * cx + cy * cy);
+            for (int y = 0; y < inp_size; ++y) {
+                for (int x = 0; x < inp_size; ++x) {
+                    float dx = x - cx;
+                    float dy = y - cy;
+                    float hue = std::atan2(dy, dx) / (2.0f * 3.14159265f);
+                    if (hue < 0) hue += 1.0f;
+                    float sat = std::sqrt(dx * dx + dy * dy) / max_dist;
+                    if (sat > 1.0f) sat = 1.0f;
+                    float h6 = hue * 6.0f;
+                    int i6 = (int)h6;
+                    float f = h6 - i6;
+                    float p = 1.0f - sat;
+                    float q = 1.0f - sat * f;
+                    float t = 1.0f - sat * (1.0f - f);
+                    float r, g, b;
+                    switch (i6 % 6) {
+                        case 0: r=1; g=t; b=p; break;
+                        case 1: r=q; g=1; b=p; break;
+                        case 2: r=p; g=1; b=t; break;
+                        case 3: r=p; g=q; b=1; break;
+                        case 4: r=t; g=p; b=1; break;
+                        default: r=1; g=p; b=q; break;
+                    }
+                    image[y][x * 3 + 0] = r;
+                    image[y][x * 3 + 1] = g;
+                    image[y][x * 3 + 2] = b;
+                }
+            }
        } else if (input == "one") {
            samples = std::vector<float>(inp_size, 1.0f);
        } else if (input == "zero") {
--- a/tools/mtmd/debug/mtmd-debug.md
+++ b/tools/mtmd/debug/mtmd-debug.md
@ -20,6 +20,43 @@ def test_vision():
 test_vision()
 ```

+Example of debugging a rainbow image:
+
+```py
+import torch
+import math
+
+def make_rainbow(img_size):
+    cx, cy = img_size / 2.0, img_size / 2.0
+    max_dist = math.sqrt(cx * cx + cy * cy)
+    img = torch.zeros(1, 3, img_size, img_size)
+    for y in range(img_size):
+        for x in range(img_size):
+            dx, dy = x - cx, y - cy
+            hue = math.atan2(dy, dx) / (2 * math.pi)
+            if hue < 0:
+                hue += 1
+            sat = math.sqrt(dx * dx + dy * dy) / max_dist
+            sat = min(sat, 1.0)
+            h6 = hue * 6
+            i6 = int(h6)
+            f = h6 - i6
+            p = 1 - sat
+            q = 1 - sat * f
+            t = 1 - sat * (1 - f)
+            rgb = [(1,t,p),(q,1,p),(p,1,t),(p,q,1),(t,p,1),(1,p,q)][i6 % 6]
+            img[0, 0, y, x] = rgb[0]
+            img[0, 1, y, x] = rgb[1]
+            img[0, 2, y, x] = rgb[2]
+    return img
+
+img_size = 896
+pixel_values = make_rainbow(img_size)
+with torch.no_grad():
+    outputs = model.model.get_image_features(pixel_values=pixel_values)
+print("last_hidden_state:", outputs.last_hidden_state)
+```
+
 ## Debugging preprocess pass

 (TODO)