diff --git a/kt-kernel/README.md b/kt-kernel/README.md
index 3eb3c7dd..b1bf0456 100644
--- a/kt-kernel/README.md
+++ b/kt-kernel/README.md
@@ -31,16 +31,38 @@ git submodule update --init --recursive
 
 ### Quick Installation (Recommended)
 
-The installation script automatically detects your CPU and configures optimal build settings:
+Step 0: Create and activate a conda environment (recommended):
 
 ```bash
-# Simple one-command installation (auto-detects CPU)
-./install.sh
+conda create -n kt-kernel python=3.11 -y
+conda activate kt-kernel
 ```
 
-The installation script will:
+Then install with the same script, either in two explicit steps (Option A) or with a single command (Option B).
+
+Option A: Two-step (explicit)
+
+```bash
+# 1) Install system prerequisites (cmake, hwloc, pkg-config)
+./install.sh deps
+
+# 2) Build and install kt-kernel (auto-detects CPU)
+#    By default, the script cleans the local ./build directory before compiling (pass --no-clean to keep it).
+./install.sh build
+```
+
+Option B: One-step (deps + build)
+
+```bash
+# Simple one-command installation
+./install.sh            # same as: ./install.sh all
+# Skip deps step if you already installed them
+./install.sh all --skip-deps
+```
+
+The install script will:
 - Auto-detect CPU capabilities (AMX support)
-- Install `cmake` via conda (for the latest version)
+- Install `cmake` via conda (if available)
 - Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
 
 **What gets configured automatically:**
@@ -58,8 +80,8 @@ If you need specific build options (e.g., for LLAMAFILE backend, compatibility,
 export CPUINFER_CPU_INSTRUCT=AVX512  # Options: NATIVE, AVX512, AVX2
 export CPUINFER_ENABLE_AMX=OFF       # Options: ON, OFF
 
-# Run with manual mode
-./install.sh --manual
+# Run with manual mode (build only)
+./install.sh build --manual
 ```
 
 For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
diff --git a/kt-kernel/cpu_backend/shared_mem_buffer.cpp b/kt-kernel/cpu_backend/shared_mem_buffer.cpp
index 173243be..c6b04d00 100644
--- a/kt-kernel/cpu_backend/shared_mem_buffer.cpp
+++ b/kt-kernel/cpu_backend/shared_mem_buffer.cpp
@@ -12,6 +12,7 @@
 #include <numa.h>
 
 #include <cstdio>
+#include <errno.h>
 
 size_t MemoryRequest::total_size() {
   size_t total = 0;
@@ -53,12 +54,15 @@ void SharedMemBuffer::alloc(void* object, MemoryRequest requests) {
     if (buffer) {
       free(buffer);
     }
-    buffer = std::aligned_alloc(64, total_size);
-    if (!buffer) {
-      printf("cannot aligned alloc %ld bytes\n", total_size);
-      perror("aligned_alloc");  // errno == ENOMEM/EINVAL
+    void* newbuf = nullptr;
+    int rc = posix_memalign(&newbuf, 64, total_size);
+    if (rc != 0 || !newbuf) {
+      errno = rc;  // posix_memalign returns error code instead of setting errno
+      printf("cannot aligned alloc %zu bytes (align=%d)\n", (size_t)total_size, 64);
+      perror("posix_memalign");  // ENOMEM/EINVAL
       exit(1);
     }
+    buffer = newbuf;
     size = total_size;
     for (auto& req : object_requests) {
       req.update_base_ptr(buffer);
diff --git a/kt-kernel/examples/repro_llamafile_re.py b/kt-kernel/examples/repro_llamafile_re.py
new file mode 100644
index 00000000..bbaa9c72
--- /dev/null
+++ b/kt-kernel/examples/repro_llamafile_re.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""
+Minimal LLAMAFILE repro harness to catch intermittent RuntimeError/RE.
+
+Requirements:
+- kt_kernel_ext built with LLAMAFILE (and CUDA stream integration)
+- Valid GGUF weights directory (WEIGHT_PATH)
+
+Usage:
+  WEIGHT_PATH=/path/to/gguf python examples/repro_llamafile_re.py
+
+Optional env:
+  DEVICE=cuda|cpu           # default: auto (cuda if available)
+  N_ITERS=1000              # iterations
+  BATCH=4                   # batch size
+  H=2048                    # hidden size
+  EXPERTS=128               # total experts
+  TOPK=8                    # experts per token
+  INTER=768                 # intermediate size (must be divisible by 256)
+  GPU_EXPERTS=100           # num experts on GPU side
+  TP=2                      # threadpool_count
+  CPU_THREADS=32            # cpuinfer_threads
+  MAX_DEFER=2               # max_deferred_experts_per_token
+  MODE=split|forward        # split=submit+sync, forward=wrapper.forward
+  SEED=1                    # random seed
+
+Debug tips:
+  - Set CUDA_LAUNCH_BLOCKING=1 to catch async errors deterministically.
+  - Try varying N_ITERS, BATCH, TOPK, MAX_DEFER.
+  - Capture stdout/stderr for failure iteration index.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import faulthandler
+import torch
+
+from kt_kernel import KTMoEWrapper
+
+
+def getenv_int(name: str, default: int) -> int:  # env var `name` parsed as int; `default` if unset or unparsable
+    try:
+        return int(os.environ.get(name, default))
+    except ValueError:  # empty or non-numeric value: fall back instead of failing
+        return default
+
+
+def get_stream_for(device: torch.device | str):  # current stream's `.cuda_stream` for a usable CUDA device, else 0
+    dev = torch.device(device)
+    on_gpu = dev.type == "cuda" and torch.cuda.is_available()
+    # 0 is returned whenever the device is not CUDA or CUDA is unavailable
+    return torch.cuda.current_stream(dev).cuda_stream if on_gpu else 0
+
+
+def main() -> int:  # exit status: 0 = every iteration passed, 1 = an iteration raised, 2 = bad configuration
+    faulthandler.enable()  # dump Python tracebacks if the process dies on a fatal signal
+
+    weight_path = (os.environ.get("WEIGHT_PATH") or "").strip()
+    if not weight_path:
+        print("ERROR: WEIGHT_PATH env is required.")
+        return 2
+    if not os.path.exists(weight_path):
+        print(f"ERROR: WEIGHT_PATH does not exist: {weight_path}")
+        return 2
+
+    device_str = os.environ.get("DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device(device_str)
+
+    n_iters = getenv_int("N_ITERS", 1000)
+    batch = getenv_int("BATCH", 4)
+    hidden = getenv_int("H", 2048)
+    experts = getenv_int("EXPERTS", 128)
+    topk = getenv_int("TOPK", 8)
+    inter = getenv_int("INTER", 768)
+    gpu_experts = getenv_int("GPU_EXPERTS", 100)
+    tp = getenv_int("TP", 2)
+    cpu_threads = getenv_int("CPU_THREADS", 32)
+    max_defer = getenv_int("MAX_DEFER", 2)
+    seed = getenv_int("SEED", 1)
+    mode = (os.environ.get("MODE") or "split").lower()
+
+    if inter % 256 != 0:
+        print(f"ERROR: INTER must be divisible by 256 for LLAMAFILE (got {inter}).")
+        return 2
+
+    print(
+        f"LLAMAFILE Repro: device={device}, iters={n_iters}, batch={batch}, H={hidden}, topk={topk}, E={experts}, inter={inter}, GPU_EXPERTS={gpu_experts}, TP={tp}, CPU_THREADS={cpu_threads}, MAX_DEFER={max_defer}, seed={seed}, mode={mode}"
+    )
+    print(f"Weights: {weight_path}")
+
+    torch.manual_seed(seed)
+
+    # Create wrapper and load weights once; every iteration below reuses this instance
+    wrapper = KTMoEWrapper(
+        layer_idx=0,
+        num_experts=experts,
+        num_experts_per_tok=topk,
+        hidden_size=hidden,
+        moe_intermediate_size=inter,
+        num_gpu_experts=gpu_experts,
+        cpuinfer_threads=cpu_threads,
+        threadpool_count=tp,
+        weight_path=weight_path,
+        chunked_prefill_size=512,
+        method="LLAMAFILE",
+        max_deferred_experts_per_token=max_defer,
+    )
+    wrapper.load_weights()
+
+    # Optional capture of small batch sizes
+    KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])
+
+    stream = get_stream_for(device)
+
+    # Allocate once and reuse to reduce allocator noise
+    hidden_states = torch.empty(batch, hidden, dtype=torch.bfloat16, device=device)
+    topk_ids = torch.empty(batch, topk, dtype=torch.long, device=device)
+    topk_weights = torch.empty(batch, topk, dtype=torch.float32, device=device)
+
+    def fill_random():  # refill the three reused input tensors in place
+        hidden_states.normal_(mean=0.0, std=1.0)
+        topk_ids.random_(0, experts)  # NOTE(review): ids are drawn independently, so a row may repeat an expert - confirm that is acceptable
+        topk_weights.uniform_()
+        topk_weights.div_(topk_weights.sum(dim=-1, keepdim=True) + 1e-6)  # rows sum to ~1; epsilon guards a zero sum
+
+    # Warmup (outside the try below, so a failure here propagates as an uncaught exception)
+    fill_random()
+    _ = wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+
+    # Main loop: stop at the first iteration that raises and report its index
+    for i in range(n_iters):
+        try:
+            fill_random()
+            if mode == "forward":
+                _ = wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
+            else:  # any MODE other than "forward" (including typos) takes the split path
+                wrapper.submit_forward(hidden_states, topk_ids, topk_weights, stream)
+                # Optional small GPU op to put work on the same stream
+                if device.type == "cuda":
+                    hidden_states.add_(0)  # no-op but enqueued on current stream
+                _ = wrapper.sync_forward(hidden_states, stream)
+
+            if (i + 1) % 50 == 0:
+                print(f"ok: iter {i + 1}/{n_iters}")
+                if device.type == "cuda":
+                    torch.cuda.synchronize(device)
+
+        except Exception as e:
+            print(f"FAIL at iter {i}: {e!r}")
+            # Flush GPU work: an asynchronous CUDA error may only surface at this synchronize, so report it too
+            if device.type == "cuda":
+                try:
+                    torch.cuda.synchronize(device)
+                except Exception as sync_err:
+                    print(f"  synchronize after failure also raised: {sync_err!r}")
+            return 1
+
+    print("All iterations completed without error.")
+    return 0
+
+
+if __name__ == "__main__":  # run the harness only when executed as a script
+    sys.exit(main())  # main()'s return value becomes the process exit status
+
diff --git a/kt-kernel/install.sh b/kt-kernel/install.sh
index 297aa55d..2ee30bff 100755
--- a/kt-kernel/install.sh
+++ b/kt-kernel/install.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-set -e
+set -euo pipefail
 
 install_dependencies() {
   echo "Checking and installing system dependencies..."
@@ -65,14 +65,21 @@ install_dependencies
 
 usage() {
   cat <<EOF
-Usage: $0 [OPTIONS]
+Usage: $0 [SUBCOMMAND] [BUILD_OPTIONS]
 
-This script builds kt-kernel with optimal settings for your CPU.
+Two-step installation in one file. Choose a subcommand:
 
-OPTIONS:
-  (none)          Auto-detect CPU and configure automatically (recommended)
+SUBCOMMANDS:
+  deps            Install system prerequisites only
+  build           Build and install kt-kernel (no dependency install)
+  all             Run deps then build (default when no subcommand)
   -h, --help      Show this help message
+
+BUILD_OPTIONS (for "build" or "all"):
+  (none)          Auto-detect CPU and configure automatically (recommended)
   --manual        Skip auto-detection, use manual configuration (see below)
+  --skip-deps     Skip deps step even with subcommand "all"
+  --no-clean      Do not delete local build/ before building (default cleans)
 
 AUTO-DETECTION (Default):
   The script will automatically detect your CPU capabilities and configure:
@@ -115,6 +122,66 @@ EOF
   exit 1
 }
 
+install_dependencies() {  # install cmake (via conda, if present) and hwloc/pkg-config (via the system package manager)
+  echo "Checking and installing system dependencies..."
+  # NOTE(review): an earlier install_dependencies() (and apparently a top-level call to it) still sits near the top of this script; confirm both are removed, or deps run on every invocation.
+  # Determine if we need to use sudo
+  SUDO=""
+  if [ "${EUID:-0}" -ne 0 ]; then
+    if command -v sudo &> /dev/null; then
+      SUDO="sudo"
+    else
+      echo "Warning: Not running as root and sudo not found. Package installation may fail."
+      echo "Please run as root or install sudo."
+    fi
+  fi
+
+  if command -v conda &> /dev/null; then
+    echo "Installing cmake via conda..."
+    conda install -y cmake
+  else
+    echo "Warning: conda not found. Skipping cmake installation via conda."
+    echo "Please install conda or manually install cmake."
+  fi
+
+  # Detect OS type (sourcing os-release defines ID, among other variables, in this shell)
+  if [ -f /etc/os-release ]; then
+    . /etc/os-release
+    OS="${ID:-linux}"  # ID is optional in os-release and defaults to "linux"; a bare $ID would abort under set -u
+  elif [ -f /etc/debian_version ]; then
+    OS="debian"
+  elif [ -f /etc/redhat-release ]; then
+    OS="rhel"
+  else
+    echo "Warning: Unable to detect OS type. Skipping dependency installation."
+    return 0
+  fi
+
+  # Install dependencies based on OS ($SUDO is left unquoted on purpose: when empty it must expand to nothing)
+  case "$OS" in
+    debian|ubuntu|linuxmint|pop)
+      echo "Detected Debian-based system. Installing libhwloc-dev and pkg-config..."
+      $SUDO apt-get update
+      $SUDO apt-get install -y libhwloc-dev pkg-config
+      ;;
+    fedora|rhel|centos|rocky|almalinux)
+      echo "Detected Red Hat-based system. Installing hwloc-devel and pkgconfig..."
+      $SUDO dnf install -y hwloc-devel pkgconfig || $SUDO yum install -y hwloc-devel pkgconfig
+      ;;
+    arch|manjaro)
+      echo "Detected Arch-based system. Installing hwloc and pkgconf..."
+      $SUDO pacman -S --noconfirm hwloc pkgconf
+      ;;
+    opensuse*|sles)
+      echo "Detected openSUSE-based system. Installing hwloc-devel and pkg-config..."
+      $SUDO zypper install -y hwloc-devel pkg-config
+      ;;
+    *)
+      echo "Warning: Unsupported OS '$OS'. Please manually install libhwloc-dev and pkg-config."
+      ;;
+  esac
+}
+
 # Function to detect CPU features
 detect_cpu_features() {
   local has_amx=0
@@ -132,18 +199,33 @@ detect_cpu_features() {
   echo "$has_amx"
 }
 
-# Check if user requested help
-if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
-  usage
-fi
+build_step() {
+  # Parse build-only flags from arguments to this function
+  local MANUAL_MODE=0
+  local CLEAN_BUILD=1
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --manual) MANUAL_MODE=1; shift ;;
+      --skip-deps) shift ;; # ignore here
+      --no-clean) CLEAN_BUILD=0; shift ;;
+      -h|--help) usage ;;
+      *) break ;;
+    esac
+  done
 
-# Check if manual mode
-MANUAL_MODE=0
-if [ "$1" = "--manual" ]; then
-  MANUAL_MODE=1
-fi
+  # Clean local build directory to ensure a fresh CMake/configure
+  local REPO_ROOT
+  REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  if [[ "$CLEAN_BUILD" -eq 1 ]]; then
+    if [[ -d "$REPO_ROOT/build" ]]; then
+      echo "Cleaning previous build directory: $REPO_ROOT/build"
+      rm -rf "$REPO_ROOT/build"
+    fi
+  else
+    echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
+  fi
 
-if [ "$MANUAL_MODE" = "0" ]; then
+  if [ "$MANUAL_MODE" = "0" ]; then
   # Auto-detection mode
   echo "=========================================="
   echo "Auto-detecting CPU capabilities..."
@@ -172,7 +254,7 @@ if [ "$MANUAL_MODE" = "0" ]; then
   echo ""
   echo "To use manual configuration instead, run: $0 --manual"
   echo ""
-else
-  # Manual mode - validate user configuration (no exports)
-  if [ -z "$CPUINFER_CPU_INSTRUCT" ] || [ -z "$CPUINFER_ENABLE_AMX" ]; then
+  else
+  # Manual mode - validate user configuration (no exports); ${VAR:-} keeps "set -u" from aborting before the error below
+  if [ -z "${CPUINFER_CPU_INSTRUCT:-}" ] || [ -z "${CPUINFER_ENABLE_AMX:-}" ]; then
     echo "Error: Manual mode requires CPUINFER_CPU_INSTRUCT and CPUINFER_ENABLE_AMX to be set."
@@ -216,7 +298,9 @@ else
       fi
     fi
   fi
-fi
+
+# Close MANUAL_MODE conditional
+  fi
 
 # Set defaults for optional variables
 export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
@@ -232,9 +316,31 @@ echo "  CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
 echo ""
 
 pip install . -v
+}
 
+SUBCMD="all" # subcommand dispatcher: default to "all" when no arguments are given
+if [[ $# -gt 0 ]]; then
+  case "$1" in
+    deps|build|all) SUBCMD="$1"; shift ;; # explicit subcommand: consume it; the rest are build options
+    -h|--help) usage ;;
+    -*) SUBCMD="build" ;; # backward compatibility: flags-only => build
+    *) echo "Error: unknown subcommand '$1'" >&2; usage ;; # e.g. a typo like "dep" must not fall through to a build
+  esac
+fi
 
-echo "Successfully built and installed kt-kernel! with configuration:"
-echo "  CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
-echo "  CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
-echo "  CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
\ No newline at end of file
+# Run the selected step(s); any remaining arguments are forwarded to build_step
+case "$SUBCMD" in
+  deps)
+    install_dependencies
+    ;;
+  build)
+    build_step "$@"
+    ;;
+  all)
+    # deps first unless --skip-deps appears anywhere in the arguments; the build always runs
+    if [[ " ${*:-} " != *" --skip-deps "* ]]; then
+      install_dependencies
+    fi
+    build_step "$@"
+    ;;
+esac