diff --git a/kt-kernel/README.md b/kt-kernel/README.md index 3eb3c7dd..b1bf0456 100644 --- a/kt-kernel/README.md +++ b/kt-kernel/README.md @@ -31,16 +31,38 @@ git submodule update --init --recursive ### Quick Installation (Recommended) -The installation script automatically detects your CPU and configures optimal build settings: +Step 0: Create and activate a conda environment (recommended): ```bash -# Simple one-command installation (auto-detects CPU) -./install.sh +conda create -n kt-kernel python=3.11 -y +conda activate kt-kernel ``` -The installation script will: +The same script can install in two explicit steps (Option A) or in a single step (Option B). + +Option A: Two-step (explicit) + +```bash +# 1) Install system prerequisites (cmake, hwloc, pkg-config) +./install.sh deps + +# 2) Build and install kt-kernel (auto-detects CPU) +# By default, the script cleans the local ./build directory before compiling; pass --no-clean to keep it. +./install.sh build +``` + +Option B: One-step (deps + build) + +```bash +# Simple one-command installation +./install.sh # same as: ./install.sh all +# Skip the deps step if the prerequisites are already installed +./install.sh all --skip-deps +``` + +The install script will: - Auto-detect CPU capabilities (AMX support) -- Install `cmake` via conda (for the latest version) +- Install `cmake` via conda (if available) - Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS **What gets configured automatically:** @@ -58,8 +80,8 @@ If you need specific build options (e.g., for LLAMAFILE backend, compatibility, export CPUINFER_CPU_INSTRUCT=AVX512 # Options: NATIVE, AVX512, AVX2 export CPUINFER_ENABLE_AMX=OFF # Options: ON, OFF -# Run with manual mode -./install.sh --manual +# Run with manual mode (build only) +./install.sh build --manual ``` For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting). 
diff --git a/kt-kernel/cpu_backend/shared_mem_buffer.cpp b/kt-kernel/cpu_backend/shared_mem_buffer.cpp index 173243be..c6b04d00 100644 --- a/kt-kernel/cpu_backend/shared_mem_buffer.cpp +++ b/kt-kernel/cpu_backend/shared_mem_buffer.cpp @@ -12,6 +12,7 @@ #include #include +#include size_t MemoryRequest::total_size() { size_t total = 0; @@ -53,12 +54,15 @@ void SharedMemBuffer::alloc(void* object, MemoryRequest requests) { if (buffer) { free(buffer); } - buffer = std::aligned_alloc(64, total_size); - if (!buffer) { - printf("cannot aligned alloc %ld bytes\n", total_size); - perror("aligned_alloc"); // errno == ENOMEM/EINVAL + void* newbuf = nullptr; + int rc = posix_memalign(&newbuf, 64, total_size); + if (rc != 0 || !newbuf) { + errno = rc; // posix_memalign returns error code instead of setting errno + printf("cannot aligned alloc %zu bytes (align=%d)\n", (size_t)total_size, 64); + perror("posix_memalign"); // prints the text for the rc stored in errno above (ENOMEM/EINVAL) exit(1); } + buffer = newbuf; // reached only after a successful allocation; the failure path exits above. NOTE(review): posix_memalign, unlike std::aligned_alloc, does not require the size to be a multiple of the alignment - presumably the reason for the switch, confirm size = total_size; for (auto& req : object_requests) { req.update_base_ptr(buffer); diff --git a/kt-kernel/examples/repro_llamafile_re.py b/kt-kernel/examples/repro_llamafile_re.py new file mode 100644 index 00000000..bbaa9c72 --- /dev/null +++ b/kt-kernel/examples/repro_llamafile_re.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +""" +Minimal LLAMAFILE repro harness to catch intermittent RuntimeError/RE. 
+ +Requirements: +- kt_kernel_ext built with LLAMAFILE (and CUDA stream integration) +- Valid GGUF weights directory (WEIGHT_PATH) + +Usage: + WEIGHT_PATH=/path/to/gguf python examples/repro_llamafile_re.py + +Optional env: + DEVICE=cuda|cpu # default: auto (cuda if available) + N_ITERS=1000 # iterations + BATCH=4 # batch size + H=2048 # hidden size + EXPERTS=128 # total experts + TOPK=8 # experts per token + INTER=768 # intermediate size (must be divisible by 256) + GPU_EXPERTS=100 # num experts on GPU side + TP=2 # threadpool_count + CPU_THREADS=32 # cpuinfer_threads + MAX_DEFER=2 # max_deferred_experts_per_token + MODE=split|forward # split=submit+sync, forward=wrapper.forward + SEED=1 # random seed + +Debug tips: + - Set CUDA_LAUNCH_BLOCKING=1 to catch async errors deterministically. + - Try varying N_ITERS, BATCH, TOPK, MAX_DEFER. + - Capture stdout/stderr for failure iteration index (1-based, same numbering as the "ok: iter" lines). +""" + +from __future__ import annotations + +import os +import sys +import faulthandler +import torch + +from kt_kernel import KTMoEWrapper + + +def getenv_int(name: str, default: int) -> int: # read env var `name` as an int; fall back to `default` when unset or not parsable + try: + return int(os.environ.get(name, default)) + except (TypeError, ValueError): + return default + + +def get_stream_for(device: torch.device | str): # raw handle of the current CUDA stream on `device`, or 0 when not running on CUDA + device = torch.device(device) + if device.type == "cuda" and torch.cuda.is_available(): + return torch.cuda.current_stream(device).cuda_stream + return 0 + + +def main() -> int: # exit status: 0 = all iterations passed, 1 = an iteration raised, 2 = bad configuration + faulthandler.enable() + + weight_path = (os.environ.get("WEIGHT_PATH") or "").strip() + if not weight_path: + print("ERROR: WEIGHT_PATH env is required.") + return 2 + if not os.path.exists(weight_path): + print(f"ERROR: WEIGHT_PATH does not exist: {weight_path}") + return 2 + + device_str = os.environ.get("DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device(device_str) + + n_iters = getenv_int("N_ITERS", 1000) + batch = getenv_int("BATCH", 4) + hidden = getenv_int("H", 2048) + experts = getenv_int("EXPERTS", 128) + topk = 
getenv_int("TOPK", 8) + inter = getenv_int("INTER", 768) + gpu_experts = getenv_int("GPU_EXPERTS", 100) + tp = getenv_int("TP", 2) + cpu_threads = getenv_int("CPU_THREADS", 32) + max_defer = getenv_int("MAX_DEFER", 2) + seed = getenv_int("SEED", 1) + mode = (os.environ.get("MODE") or "split").lower() + + if inter % 256 != 0: + print(f"ERROR: INTER must be divisible by 256 for LLAMAFILE (got {inter}).") + return 2 + + print( + f"LLAMAFILE Repro: device={device}, iters={n_iters}, batch={batch}, H={hidden}, topk={topk}, E={experts}, inter={inter}, GPU_EXPERTS={gpu_experts}, TP={tp}, CPU_THREADS={cpu_threads}, MAX_DEFER={max_defer}, SEED={seed}, mode={mode}" + ) + print(f"Weights: {weight_path}") + + torch.manual_seed(seed) + + # Create wrapper and load weights once + wrapper = KTMoEWrapper( + layer_idx=0, + num_experts=experts, + num_experts_per_tok=topk, + hidden_size=hidden, + moe_intermediate_size=inter, + num_gpu_experts=gpu_experts, + cpuinfer_threads=cpu_threads, + threadpool_count=tp, + weight_path=weight_path, + chunked_prefill_size=512, + method="LLAMAFILE", + max_deferred_experts_per_token=max_defer, + ) + wrapper.load_weights() + + # Optional capture of small batch sizes + KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16]) + + stream = get_stream_for(device) + + # Allocate once and reuse to reduce allocator noise + hidden_states = torch.empty(batch, hidden, dtype=torch.bfloat16, device=device) + topk_ids = torch.empty(batch, topk, dtype=torch.long, device=device) + topk_weights = torch.empty(batch, topk, dtype=torch.float32, device=device) + + def fill_random(): + hidden_states.normal_(mean=0.0, std=1.0) + topk_ids.random_(0, experts) # NOTE(review): ids are drawn independently, so a row may repeat an expert - confirm the kernel tolerates duplicates + topk_weights.uniform_() + topk_weights.div_(topk_weights.sum(dim=-1, keepdim=True) + 1e-6) + + # Warmup + fill_random() + _ = wrapper.forward(hidden_states, topk_ids, topk_weights, stream) + if device.type == "cuda": + torch.cuda.synchronize(device) + + # Main loop + for i in range(n_iters): + try: + fill_random() + if mode == "forward": + _ = wrapper.forward(hidden_states, 
topk_ids, topk_weights, stream) + else: # any MODE value other than "forward" takes the split path + wrapper.submit_forward(hidden_states, topk_ids, topk_weights, stream) + # Optional small GPU op to put work on the same stream + if device.type == "cuda": + hidden_states.add_(0) # no-op but enqueued on current stream + _ = wrapper.sync_forward(hidden_states, stream) + + if (i + 1) % 50 == 0: + print(f"ok: iter {i + 1}/{n_iters}") + if device.type == "cuda": + torch.cuda.synchronize(device) + + except Exception as e: + print(f"FAIL at iter {i + 1}/{n_iters}: {e!r}") + # Flush GPU work for better diagnostics + if device.type == "cuda": + try: + torch.cuda.synchronize(device) + except Exception: + pass # best effort: the original failure was already reported above + return 1 + + print("All iterations completed without error.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/kt-kernel/install.sh b/kt-kernel/install.sh index 297aa55d..2ee30bff 100755 --- a/kt-kernel/install.sh +++ b/kt-kernel/install.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -e +set -euo pipefail install_dependencies() { echo "Checking and installing system dependencies..." @@ -65,14 +65,21 @@ install_dependencies usage() { cat < /dev/null; then + SUDO="sudo" + else + echo "Warning: Not running as root and sudo not found. Package installation may fail." + echo "Please run as root or install sudo." + fi + fi + + if command -v conda &> /dev/null; then + echo "Installing cmake via conda..." + conda install -y cmake + else + echo "Warning: conda not found. Skipping cmake installation via conda." + echo "Please install conda or manually install cmake." + fi + + # Detect OS type + if [ -f /etc/os-release ]; then + . /etc/os-release + OS=$ID + elif [ -f /etc/debian_version ]; then + OS="debian" + elif [ -f /etc/redhat-release ]; then + OS="rhel" + else + echo "Warning: Unable to detect OS type. Skipping dependency installation." + return 0 + fi + + # Install dependencies based on OS + case "$OS" in + debian|ubuntu|linuxmint|pop) + echo "Detected Debian-based system. 
Installing libhwloc-dev and pkg-config..." + $SUDO apt-get update + $SUDO apt-get install -y libhwloc-dev pkg-config + ;; + fedora|rhel|centos|rocky|almalinux) + echo "Detected Red Hat-based system. Installing hwloc-devel and pkgconfig..." + $SUDO dnf install -y hwloc-devel pkgconfig || $SUDO yum install -y hwloc-devel pkgconfig + ;; + arch|manjaro) + echo "Detected Arch-based system. Installing hwloc and pkgconf..." + $SUDO pacman -S --noconfirm hwloc pkgconf + ;; + opensuse*|sles) + echo "Detected openSUSE-based system. Installing hwloc-devel and pkg-config..." + $SUDO zypper install -y hwloc-devel pkg-config + ;; + *) + echo "Warning: Unsupported OS '$OS'. Please manually install libhwloc-dev and pkg-config." + ;; + esac +} + # Function to detect CPU features detect_cpu_features() { local has_amx=0 @@ -132,18 +199,33 @@ detect_cpu_features() { echo "$has_amx" } -# Check if user requested help -if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then - usage -fi +build_step() { # Build and pip-install kt-kernel; flags: --manual (use caller-provided CPUINFER_* settings), --no-clean (keep ./build) + # Parse build-only flags from arguments to this function + local MANUAL_MODE=0 + local CLEAN_BUILD=1 + while [[ $# -gt 0 ]]; do + case "$1" in + --manual) MANUAL_MODE=1; shift ;; + --skip-deps) shift ;; # only meaningful to the "all" dispatcher branch; nothing to do here + --no-clean) CLEAN_BUILD=0; shift ;; + -h|--help) usage ;; + *) break ;; # stop parsing at the first unrecognized argument + esac + done -# Check if manual mode -MANUAL_MODE=0 -if [ "$1" = "--manual" ]; then - MANUAL_MODE=1 -fi + # Clean local build directory to ensure a fresh CMake/configure + local REPO_ROOT + REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + if [[ "$CLEAN_BUILD" -eq 1 ]]; then + if [[ -d "$REPO_ROOT/build" ]]; then + echo "Cleaning previous build directory: $REPO_ROOT/build" + rm -rf "$REPO_ROOT/build" + fi + else + echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)" + fi -if [ "$MANUAL_MODE" = "0" ]; then + if [ "$MANUAL_MODE" = "0" ]; then # Auto-detection mode echo "==========================================" echo "Auto-detecting CPU capabilities..." 
@@ -172,7 +254,7 @@ if [ "$MANUAL_MODE" = "0" ]; then echo "" echo "To use manual configuration instead, run: $0 --manual" echo "" -else + else # Manual mode - validate user configuration (no exports) - if [ -z "$CPUINFER_CPU_INSTRUCT" ] || [ -z "$CPUINFER_ENABLE_AMX" ]; then + if [ -z "${CPUINFER_CPU_INSTRUCT:-}" ] || [ -z "${CPUINFER_ENABLE_AMX:-}" ]; then # ":-" keeps "set -u" from aborting before the error below is printed echo "Error: Manual mode requires CPUINFER_CPU_INSTRUCT and CPUINFER_ENABLE_AMX to be set." @@ -216,7 +298,9 @@ else fi fi fi -fi + +# Close MANUAL_MODE conditional + fi # Set defaults for optional variables export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release} @@ -232,9 +316,31 @@ echo " CPUINFER_VERBOSE=$CPUINFER_VERBOSE" echo "" pip install . -v +} +# Subcommand dispatcher: default to "all" +SUBCMD="all" +if [[ $# -gt 0 ]]; then + case "$1" in + deps|build|all) SUBCMD="$1"; shift ;; + -h|--help) usage ;; + *) SUBCMD="build" ;; # backward compatibility: flags-only => build + esac +fi -echo "Successfully built and installed kt-kernel! with configuration:" -echo " CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT" -echo " CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX" -echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE" \ No newline at end of file +case "$SUBCMD" in + deps) + install_dependencies + ;; + build) + build_step ${@+"$@"} # expands to nothing when no args remain; a bare "$@" trips "set -u" on bash < 4.4 + ;; + all) + if [[ " ${*:-} " == *" --skip-deps "* ]]; then + build_step ${@+"$@"} + else + install_dependencies + build_step ${@+"$@"} + fi + ;; +esac