From 751ebd17a58a8a513994509214373bb9e6a3d66c Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 28 May 2026 20:59:14 +0200 Subject: [PATCH] mtmd-debug: add color and rainbow mode (#23829) * mtmd-debug: add color and rainbow mode * fix M_PI * max_dist --- tools/mtmd/debug/mtmd-debug.cpp | 61 +++++++++++++++++++++++++++++++++ tools/mtmd/debug/mtmd-debug.md | 37 ++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/tools/mtmd/debug/mtmd-debug.cpp b/tools/mtmd/debug/mtmd-debug.cpp index f19ca4cfe..b88a16f0f 100644 --- a/tools/mtmd/debug/mtmd-debug.cpp +++ b/tools/mtmd/debug/mtmd-debug.cpp @@ -30,7 +30,9 @@ static void show_additional_info(int /*argc*/, char ** argv) { " -p \"encode\" (debugging encode pass, default case):\n" " --image can be:\n" " \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n" + " \"red\", \"green\", \"blue\": filled with respective colors\n" " \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n" + " \"rainbow\": raspberry-pi-like rainbow pattern\n" " --audio can be:\n" " \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n" " \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n" @@ -144,6 +146,65 @@ int main(int argc, char ** argv) { image[y][x * 3 + 2] = v; } } + } else if (input == "red") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 0] = 1.0f; // R channel + } + image.push_back(row); + } + } else if (input == "green") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 1] = 1.0f; // G channel + } + image.push_back(row); + } + } else if (input == "blue") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 2] = 1.0f; // B channel + } + image.push_back(row); + } + } else if (input == "rainbow") { + for (int i = 0; i < inp_size; ++i) { + image.push_back(std::vector(inp_size * 3, 0.0f)); + } + float cx = inp_size / 2.0f; + float cy = inp_size / 2.0f; + float max_dist = std::sqrt(cx * cx + cy * cy); + for (int y = 0; y < inp_size; ++y) { + for (int x = 0; x < inp_size; ++x) { + float dx = x - cx; + float dy = y - cy; + float hue = std::atan2(dy, dx) / (2.0f * 3.14159265f); + if (hue < 0) hue += 1.0f; + float sat = std::sqrt(dx * dx + dy * dy) / max_dist; + if (sat > 1.0f) sat = 1.0f; + float h6 = hue * 6.0f; + int i6 = (int)h6; + float f = h6 - i6; + float p = 1.0f - sat; + float q = 1.0f - sat * f; + float t = 1.0f - sat * (1.0f - f); + float r, g, b; + switch (i6 % 6) { + case 0: r=1; g=t; b=p; break; + case 1: r=q; g=1; b=p; break; + case 2: r=p; g=1; b=t; break; + case 3: r=p; g=q; b=1; break; + case 4: r=t; g=p; b=1; break; + default: r=1; g=p; b=q; break; + } + image[y][x * 3 + 0] = r; + image[y][x * 3 + 1] = g; + image[y][x * 3 + 2] = b; + } + } } else if (input == "one") { samples = std::vector(inp_size, 1.0f); } else if (input == "zero") { diff --git a/tools/mtmd/debug/mtmd-debug.md b/tools/mtmd/debug/mtmd-debug.md index 76ffe5c84..71bd52dd4 100644 --- a/tools/mtmd/debug/mtmd-debug.md +++ b/tools/mtmd/debug/mtmd-debug.md @@ -20,6 +20,43 @@ def test_vision(): test_vision() ``` +Example of debugging a rainbow image: + +```py +import torch +import math + +def make_rainbow(img_size): + cx, cy = img_size / 2.0, img_size / 2.0 + max_dist = math.sqrt(cx * cx + cy * cy) + img = torch.zeros(1, 3, img_size, img_size) + for y in range(img_size): + for x in range(img_size): + dx, dy = x - cx, y - cy + hue = math.atan2(dy, dx) / (2 * math.pi) + if hue < 0: + hue += 1 + sat = math.sqrt(dx * dx + dy * dy) / max_dist + sat = min(sat, 1.0) + h6 = hue * 6 + i6 = int(h6) + f = h6 - i6 + p = 1 - sat + q = 1 - sat * f + t = 1 - sat * (1 - f) + rgb = [(1,t,p),(q,1,p),(p,1,t),(p,q,1),(t,p,1),(1,p,q)][i6 % 6] + img[0, 0, y, x] = rgb[0] + img[0, 1, y, x] = rgb[1] + img[0, 2, y, x] = rgb[2] + return img + +img_size = 896 +pixel_values = make_rainbow(img_size) +with torch.no_grad(): + outputs = model.model.get_image_features(pixel_values=pixel_values) +print("last_hidden_state:", outputs.last_hidden_state) +``` + ## Debugging preprocess pass (TODO)