diff --git a/.github/actions/get-tag-name/action.yml b/.github/actions/get-tag-name/action.yml new file mode 100644 index 000000000..7ace23b2a --- /dev/null +++ b/.github/actions/get-tag-name/action.yml @@ -0,0 +1,22 @@ +name: "Determine tag name" +description: "Determine the tag name to use for a release" +outputs: + name: + description: "The name of the tag" + value: ${{ steps.tag.outputs.name }} + +runs: + using: "composite" + steps: + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml new file mode 100644 index 000000000..5575caeca --- /dev/null +++ b/.github/actions/windows-setup-cuda/action.yml @@ -0,0 +1,67 @@ +name: "Windows - Setup CUDA Toolkit" +description: "Setup CUDA Toolkit for Windows" +inputs: + cuda_version: + description: "CUDA toolkit version" + required: true + +runs: + using: "composite" + steps: + - name: Install Cuda Toolkit 11.7 + if: ${{ inputs.cuda_version == '11.7' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install Cuda Toolkit 12.4 + if: ${{ inputs.cuda_version == '12.4' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..38c8ea439 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,710 @@ +name: Create Release + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + branches: + - master + paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + contents: write # for creating release + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" + +jobs: + macOS-arm64: + runs-on: macos-14 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-arm64 + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_RPC=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ 
steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: llama-bin-macos-arm64.zip + + macOS-x64: + runs-on: macos-13 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-x64 + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Build + id: cmake_build + run: | + sysctl -a + # Metal is disabled due to intermittent failures with Github runners not having a GPU: + # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_METAL=OFF \ + -DGGML_RPC=ON + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: llama-bin-macos-x64.zip + + ubuntu-22-cpu: + strategy: + matrix: + include: + - build: 'x64' + os: ubuntu-22.04 + - build: 'arm64' + os: ubuntu-22.04-arm + + runs-on: ${{ matrix.os }} + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-cpu-cmake + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libcurl4-openssl-dev + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip + name: llama-bin-ubuntu-${{ matrix.build }}.zip + + ubuntu-22-vulkan: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-22-cmake-vulkan + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - + sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DGGML_VULKAN=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ 
steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip + name: llama-bin-ubuntu-vulkan-x64.zip + + windows: + runs-on: windows-latest + + env: + OPENBLAS_VERSION: 0.3.23 + VULKAN_VERSION: 1.4.309.0 + + strategy: + matrix: + include: + - build: 'cpu-x64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' + #- build: 'openblas-x64' + # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + - build: 'vulkan-x64' + defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' + - build: 'cpu-arm64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF' + - build: 'opencl-adreno-arm64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-${{ matrix.build }} + variant: ccache + evict-old-files: 1d + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'openblas-x64' }} + run: | + curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" + curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" + mkdir $env:RUNNER_TEMP/openblas + tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas + $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) + $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) + $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') + & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'vulkan-x64' }} + run: | + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install + Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" + Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'opencl-adreno-arm64' }} + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + cmake -B build ` + -DBUILD_TESTING=OFF ` + 
-DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + cmake -B build-arm64-release ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build-arm64-release --target install --config release + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cmake -S . -B build ${{ matrix.defines }} ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + + - name: Add libopenblas.dll + id: add_libopenblas_dll + if: ${{ matrix.build == 'openblas-x64' }} + run: | + cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll + cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip + name: llama-bin-win-${{ matrix.build }}.zip + + windows-cuda: + runs-on: windows-2019 + + strategy: + matrix: + cuda: ['12.4', '11.7'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-cuda-${{ matrix.cuda }} + variant: ccache + evict-old-files: 1d + + - name: Install Cuda Toolkit + uses: ./.github/actions/windows-setup-cuda + with: + cuda_version: ${{ matrix.cuda }} + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + shell: cmd + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -S . 
-B build -G "Ninja Multi-Config" ^ + -DGGML_NATIVE=OFF ^ + -DGGML_BACKEND_DL=ON ^ + -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_CUDA=ON ^ + -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ + ${{ env.CMAKE_ARGS }} + set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 + cmake --build build --config Release -j %NINJA_JOBS% -t ggml + cmake --build build --config Release + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip + + - name: Copy and pack Cuda runtime + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + run: | + echo "Cuda install location: ${{ env.CUDA_PATH }}" + $dst='.\build\bin\cudart\' + robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* + + - name: Upload Cuda runtime + uses: actions/upload-artifact@v4 + with: + path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + + windows-sycl: + runs-on: windows-latest + + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-sycl + variant: ccache + evict-old-files: 1d + + - name: Install + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Build the release package + id: pack_artifacts + run: | + echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" 
./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + + echo "cp oneAPI running time dll files to ./build/bin done" + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + + - name: Upload the release package + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: llama-bin-win-sycl-x64.zip + + windows-hip: + runs-on: windows-latest + + strategy: + matrix: + gpu_target: [gfx1100, gfx1101, gfx1030] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-hip-release + evict-old-files: 1d + + - name: Install + id: depends + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + write-host "Installing AMD HIP SDK" + Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + id: verify + run: | + & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake -G "Unix Makefiles" -B build -S . 
` + -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` + -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` + -DCMAKE_BUILD_TYPE=Release ` + -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` + -DGGML_HIP=ON ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + md "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + + ios-xcode-build: + runs-on: macos-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + - name: xcodebuild for swift package + id: xcodebuild + run: | + ./build-xcframework.sh + + - name: Build Xcode project + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-xcframework.zip + name: llama-${{ steps.tag.outputs.name }}-xcframework + + release: + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + + runs-on: ubuntu-latest + + needs: + - ubuntu-22-cpu + - ubuntu-22-vulkan + - windows + - windows-cuda + - windows-sycl + - windows-hip + - macOS-arm64 + - macOS-x64 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Download artifacts + id: download-artifact + uses: actions/download-artifact@v4 + with: + path: ./artifact + + - name: Move artifacts + id: move_artifacts + run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release + + - name: Create release + id: create_release + uses: ggml-org/action-create-release@v1 + env: + 
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.tag.outputs.name }} + + - name: Upload release + id: upload_release + uses: actions/github-script@v3 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const path = require('path'); + const fs = require('fs'); + const release_id = '${{ steps.create_release.outputs.id }}'; + for (let file of await fs.readdirSync('./artifact/release')) { + if (path.extname(file) === '.zip') { + console.log('uploadReleaseAsset', file); + await github.repos.uploadReleaseAsset({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: release_id, + name: file, + data: await fs.readFileSync(`./artifact/release/${file}`) + }); + } + } diff --git a/common/arg.cpp b/common/arg.cpp index bbb70ccbe..d41c66611 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2098,13 +2098,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); - add_opt(common_arg( - {"--perplexity", "--all-logits"}, - string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](common_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", diff --git a/common/chat.cpp b/common/chat.cpp index bbc5f087c..ad3d4aa99 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -125,7 +125,9 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa msgs.push_back(msg); } } catch (const std::exception & e) { - throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2)); + // @ngxson : disable otherwise it's bloating the API response + // printf("%s\n", std::string("; messages = ") + messages.dump(2)); + throw std::runtime_error("Failed to parse messages: " + std::string(e.what())); } return msgs; diff --git a/common/common.cpp b/common/common.cpp index ea71ec104..21ad0a8eb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1103,7 +1103,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_threads = params.cpuparams.n_threads; cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
params.cpuparams.n_threads : params.cpuparams_batch.n_threads; - cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; diff --git a/common/common.h b/common/common.h index ff6225389..dfaaa6026 100644 --- a/common/common.h +++ b/common/common.h @@ -320,7 +320,6 @@ struct common_params { bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6aaf8834..bf6bc6838 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1388,10 +1388,10 @@ class BaichuanModel(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] @@ -1512,10 +1512,10 @@ class XverseModel(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1828,10 +1828,10 @@ class LlamaModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + 
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2206,10 +2206,10 @@ class DeciModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2449,10 +2449,10 @@ class MiniCPMModel(TextModel): logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - if self.hparams.get("rope_scaling") is not None: - if self.hparams["rope_scaling"].get("type") == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -2597,11 +2597,11 @@ class Qwen2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self.hf_arch == "Qwen2Model": @@ -2763,11 +2763,11 @@ class Qwen2MoeModel(TextModel): logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") # YaRN is not enabled by default # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - 
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None @@ -3035,7 +3035,7 @@ class Phi3MiniModel(TextModel): scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get('type', '').lower() + rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') @@ -3347,10 +3347,10 @@ class InternLM2Model(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] @@ -3425,10 +3425,10 @@ class InternLM3Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] @@ -4866,12 +4866,12 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - 
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) _experts: list[dict[str, Tensor]] | None = None @@ -5363,11 +5363,11 @@ class Glm4Model(TextModel): super().set_gguf_parameters() rope_dim = self.hparams["head_dim"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") @@ -5600,10 +5600,10 @@ class ExaoneModel(TextModel): rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: - if hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): @@ -5706,10 +5706,11 @@ class BailingMoeModel(TextModel): rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]: + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + 
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) else: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index a84a7ffde..594439ea5 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -559,7 +559,6 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll draft_model_params.use_mlock = base_model_params.use_mlock; draft_model_params.n_gpu_layers = draft_gpulayers; //layers offload the speculative model. draft_ctx_params.n_ctx = base_ctx_params.n_ctx; - draft_ctx_params.logits_all = false; draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv; draft_model_params.main_gpu = base_model_params.main_gpu; draft_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; @@ -2147,7 +2146,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } llama_ctx_params.offload_kqv = !inputs.low_vram; - llama_ctx_params.logits_all = false; model_params.use_mmap = inputs.use_mmap; model_params.use_mlock = inputs.use_mlock; model_params.n_gpu_layers = inputs.gpulayers; diff --git a/include/llama.h b/include/llama.h index 0d35441c6..eb881b035 100644 --- a/include/llama.h +++ b/include/llama.h @@ -353,19 +353,17 @@ extern "C" { enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. - // TODO: move at the end of the struct - bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embeddings; // if true, extract embeddings (together with logits) - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - // Abort callback // if it returns true, execution of llama_decode() will be aborted // currently works only with CPU execution ggml_abort_callback abort_callback; void * abort_callback_data; + + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. + bool embeddings; // if true, extract embeddings (together with logits) + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool no_perf; // whether to measure performance timings }; // model quantization parameters @@ -926,14 +924,19 @@ extern "C" { // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); - // Processes a batch of tokens with the ecoder part of the encoder-decoder model. - // Stores the encoder output internally for later use by the decoder cross-attention layers. + // Process a batch of tokens. + // In contrast to llama_decode() - this call does not use KV cache. + // For encode-decoder contexts, processes the batch using the encoder. + // Can store the encoder output internally for later use by the decoder's cross-attention layers. // 0 - success // < 0 - error. 
the KV cache state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); + // Process a batch of tokens. + // Requires KV cache. + // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp index 428bbc633..6797a3d86 100644 --- a/otherarch/embeddings_adapter.cpp +++ b/otherarch/embeddings_adapter.cpp @@ -136,7 +136,6 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs) ctx_params.embeddings = true; ctx_params.n_ubatch = ctx_params.n_ubatch = max_batchsize; //max size, must fit ctx_params.n_ctx = max_batchsize; - ctx_params.logits_all = false; ctx_params.offload_kqv = true; ctx_params.n_threads = nthreads; ctx_params.n_threads_batch = nthreads; diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 336d70e9f..57e50b319 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -533,7 +533,6 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; tts_ctx_params.n_ctx = 8192; - tts_ctx_params.logits_all = false; tts_ctx_params.offload_kqv = true; tts_ctx_params.n_batch = 8192; tts_ctx_params.n_ubatch = 512; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2fb16ec9e..9171d8ad6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -116,8 +116,6 @@ llama_context::llama_context( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - logits_all = params.logits_all; - if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -253,7 +251,7 @@ llama_context::llama_context( } // reserve worst-case graph - if (!hparams.vocab_only) { + if (!hparams.vocab_only && memory) { const uint32_t n_seqs = 1; // TODO: worst-case number of sequences const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) { t_compute_start_us = ggml_time_us(); } + embd_seq.clear(); + n_queued_tokens += n_tokens; const int64_t n_embd = hparams.n_embd; @@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings + GGML_ASSERT(embd != nullptr); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); } break; @@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = 
ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) { } int llama_context::decode(llama_batch & inp_batch) { + if (!memory) { + LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__); + return encode(inp_batch); + } + if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) { for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } - } else if (logits_all || embd_pooled) { + } else if (embd_pooled) { n_outputs_all = n_tokens_all; } else { // keep last output only @@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() { /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, - /*.logits_all =*/ false, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index cf41ac57b..5a080e67f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -187,9 +187,6 @@ private: std::unique_ptr memory; - // TODO: remove - bool logits_all = false; - // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits float * logits = nullptr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ae187b93f..2a5d2abd2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1666,8 +1666,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); if (std::regex_search(tensor_name, pattern)) { - LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); buft = overrides->buft; + LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", + tensor_name.c_str(), + ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), + ggml_backend_buft_name(buft)); break; } } @@ -12952,6 +12955,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_memory_i * res; switch (arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: + { + res = nullptr; + } break; case LLM_ARCH_MAMBA: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 0251edca5..d3c88c45e 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -100,14 +100,6 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.logits_all) { - LOG_ERR("************\n"); - LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - LOG_ERR("************\n\n"); - - return 0; - } - if (params.embedding) { LOG_ERR("************\n"); LOG_ERR("%s: please use the 'embedding' 
tool for embedding calculations\n", __func__); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d68b65b8b..1f86a8b2d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3151,7 +3151,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; n_patches = n_per_side_2d_pool * n_per_side_2d_pool; } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - n_patches /= params.proj_scale_factor; + n_patches /= (params.proj_scale_factor * params.proj_scale_factor); } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { int n_merge = params.spatial_merge_size; int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 82d2e3b97..3153b6159 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e0e99eafc..06788bbdc 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3214,7 +3214,14 @@ struct server_context { batch.logits + i, }; - const int ret = llama_decode(ctx, batch_view); + int ret = 0; + + if (params_base.embedding || params_base.reranking) { + ret = llama_encode(ctx, batch_view); + } else { + ret = llama_decode(ctx, batch_view); + } + metrics.on_decoded(slots); if (ret != 0) { @@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) { const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( server_task_type type, json & data, - std::function is_connection_closed, + const std::function & is_connection_closed, httplib::Response & res, oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index b2e3cf94a..2c23a7580 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -21,6 +21,8 @@ "postcss": "^8.4.49", "react": "^18.3.1", "react-dom": "^18.3.1", + "react-dropzone": "^14.3.8", + "react-hot-toast": "^2.5.2", "react-markdown": "^9.0.3", "react-router": "^7.1.5", "rehype-highlight": "^7.0.2", @@ -2058,6 +2060,15 @@ "dev": true, "license": "Python-2.0" }, + "node_modules/attr-accept": { + "version": "2.2.5", + "resolved": "https://registry.npmjs.org/attr-accept/-/attr-accept-2.2.5.tgz", + "integrity": "sha512-0bDNnY/u6pPwHDMoF0FieU354oBi0a8rD9FcsLwzcGWbc8KS8KPIi7y+s13OlVY+gMWc/9xEMUgNE6Qm8ZllYQ==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/autoprefixer": { "version": "10.4.20", "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", @@ -2804,6 +2815,18 @@ "node": ">=16.0.0" } }, + "node_modules/file-selector": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/file-selector/-/file-selector-2.1.2.tgz", + "integrity": "sha512-QgXo+mXTe8ljeqUFaX3QVHc5osSItJ/Km+xpocx0aSqWGMSCf6qYs/VnzZgS864Pjn5iceMRFigeAV7AfTlaig==", + "license": "MIT", + "dependencies": { + "tslib": "^2.7.0" + }, + "engines": { + "node": ">= 12" + } + }, "node_modules/fill-range": { "version": "7.1.1", "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", @@ -2917,6 +2940,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/goober": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/goober/-/goober-2.1.16.tgz", + "integrity": 
"sha512-erjk19y1U33+XAMe1VTvIONHYoSqE4iS7BYUZfHaqeohLmnC0FdxEh7rQU+6MZ4OajItzjZFSRtVANrQwNq6/g==", + "license": "MIT", + "peerDependencies": { + "csstype": "^3.0.10" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -4674,6 +4706,15 @@ "node": ">=0.10.0" } }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -4872,6 +4913,17 @@ "url": "https://github.com/prettier/prettier?sponsor=1" } }, + "node_modules/prop-types": { + "version": "15.8.1", + "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", + "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.4.0", + "object-assign": "^4.1.1", + "react-is": "^16.13.1" + } + }, "node_modules/property-information": { "version": "6.5.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", @@ -4938,6 +4990,46 @@ "react": "^18.3.1" } }, + "node_modules/react-dropzone": { + "version": "14.3.8", + "resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz", + "integrity": "sha512-sBgODnq+lcA4P296DY4wacOZz3JFpD99fp+hb//iBO2HHnyeZU3FwWyXJ6salNpqQdsZrgMrotuko/BdJMV8Ug==", + "license": "MIT", + "dependencies": { + "attr-accept": "^2.2.4", + "file-selector": "^2.1.0", + "prop-types": "^15.8.1" + }, + "engines": { + "node": ">= 10.13" + }, + "peerDependencies": { + "react": ">= 16.8 || 18.0.0" + } + }, + "node_modules/react-hot-toast": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.5.2.tgz", + "integrity": "sha512-Tun3BbCxzmXXM7C+NI4qiv6lT0uwGh4oAfeJyNOjYUejTsm35mK9iCaYLGv8cBz9L5YxZLx/2ii7zsIwPtPUdw==", + "license": "MIT", + "dependencies": { + "csstype": "^3.1.3", + "goober": "^2.1.16" + }, + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "react": ">=16", + "react-dom": ">=16" + } + }, + "node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", + "license": "MIT" + }, "node_modules/react-markdown": { "version": "9.0.3", "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz", @@ -5814,7 +5906,6 @@ "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "devOptional": true, "license": "0BSD" }, "node_modules/turbo-stream": { diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json index 6ac06b1a4..ab1b920bd 100644 --- a/tools/server/webui/package.json +++ b/tools/server/webui/package.json @@ -24,6 +24,8 @@ "postcss": "^8.4.49", "react": "^18.3.1", "react-dom": "^18.3.1", + "react-dropzone": "^14.3.8", + "react-hot-toast": "^2.5.2", "react-markdown": "^9.0.3", "react-router": "^7.1.5", "rehype-highlight": "^7.0.2", diff --git 
a/tools/server/webui/src/App.tsx b/tools/server/webui/src/App.tsx index cc4659e15..3b00a8f90 100644 --- a/tools/server/webui/src/App.tsx +++ b/tools/server/webui/src/App.tsx @@ -4,6 +4,7 @@ import Sidebar from './components/Sidebar'; import { AppContextProvider, useAppContext } from './utils/app.context'; import ChatScreen from './components/ChatScreen'; import SettingDialog from './components/SettingDialog'; +import { Toaster } from 'react-hot-toast'; function App() { return ( @@ -40,6 +41,7 @@ function AppLayout() { onClose={() => setShowSettings(false)} /> } + ); } diff --git a/tools/server/webui/src/Config.ts b/tools/server/webui/src/Config.ts index dd1cc0e10..5eef608cb 100644 --- a/tools/server/webui/src/Config.ts +++ b/tools/server/webui/src/Config.ts @@ -12,7 +12,7 @@ export const CONFIG_DEFAULT = { // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. // Do not use nested objects, keep it single level. Prefix the key if you need to group them. apiKey: '', - systemMessage: 'You are a helpful assistant.', + systemMessage: '', showTokensPerSecond: false, showThoughtInProgress: false, excludeThoughtOnReq: true, diff --git a/tools/server/webui/src/components/ChatInputExtraContextItem.tsx b/tools/server/webui/src/components/ChatInputExtraContextItem.tsx new file mode 100644 index 000000000..ac416fa90 --- /dev/null +++ b/tools/server/webui/src/components/ChatInputExtraContextItem.tsx @@ -0,0 +1,92 @@ +import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline'; +import { MessageExtra } from '../utils/types'; +import { useState } from 'react'; +import { classNames } from '../utils/misc'; + +export default function ChatInputExtraContextItem({ + items, + removeItem, + clickToShow, +}: { + items?: MessageExtra[]; + removeItem?: (index: number) => void; + clickToShow?: boolean; +}) { + const [show, setShow] = useState(-1); + const showingItem = show >= 0 ? items?.[show] : undefined; + + if (!items) return null; + + return ( +
+ {items.map((item, i) => ( +
clickToShow && setShow(i)} + > + {removeItem && ( +
+ +
+ )} + +
+ {item.type === 'imageFile' ? ( + <> + {item.name} + + ) : ( + <> +
+ +
+ +
+ {item.name ?? 'Extra content'} +
+ + )} +
+
+ ))} + + {showingItem && ( + +
+
+ {showingItem.name ?? 'Extra content'} + +
+ {showingItem.type === 'imageFile' ? ( + {showingItem.name} + ) : ( +
+
+                  {showingItem.content}
+                
+
+ )} +
+
setShow(-1)}>
+
+ )} +
+ ); +} diff --git a/tools/server/webui/src/components/ChatMessage.tsx b/tools/server/webui/src/components/ChatMessage.tsx index 40ea74711..08eb42352 100644 --- a/tools/server/webui/src/components/ChatMessage.tsx +++ b/tools/server/webui/src/components/ChatMessage.tsx @@ -3,7 +3,14 @@ import { useAppContext } from '../utils/app.context'; import { Message, PendingMessage } from '../utils/types'; import { classNames } from '../utils/misc'; import MarkdownDisplay, { CopyButton } from './MarkdownDisplay'; -import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline'; +import { + ArrowPathIcon, + ChevronLeftIcon, + ChevronRightIcon, + PencilSquareIcon, +} from '@heroicons/react/24/outline'; +import ChatInputExtraContextItem from './ChatInputExtraContextItem'; +import { BtnWithTooltips } from '../utils/common'; interface SplitMessage { content: PendingMessage['content']; @@ -85,10 +92,14 @@ export default function ChatMessage({ 'chat-end': msg.role === 'user', })} > + {msg.extra && msg.extra.length > 0 && ( + + )} +
{/* textarea for editing message */} @@ -133,59 +144,11 @@ export default function ChatMessage({ {/* render message as markdown */}
{thought && ( -
- - {isPending && isThinking ? ( - - - Thinking - - ) : ( - Thought Process - )} - -
- -
-
- )} - - {msg.extra && msg.extra.length > 0 && ( -
- - Extra content - -
- {msg.extra.map( - (extra, i) => - extra.type === 'textFile' ? ( -
- {extra.name} -
{extra.content}
-
- ) : extra.type === 'context' ? ( -
-
{extra.content}
-
- ) : null // TODO: support other extra types - )} -
-
+ )} setEditingContent(msg.content)} disabled={msg.content === null} + tooltipsContent="Edit message" > - ✍️ Edit - + + )} {/* assistant message */} {msg.role === 'assistant' && ( <> {!isPending && ( - + + )} )}
@@ -294,3 +259,44 @@ export default function ChatMessage({
); } + +function ThoughtProcess({ + isThinking, + content, + open, +}: { + isThinking: boolean; + content: string; + open: boolean; +}) { + return ( +
+ +
+
+ {isThinking ? ( + + + Thinking + + ) : ( + <>Thought Process + )} +
+
+
+
+ +
+
+
+ ); +} diff --git a/tools/server/webui/src/components/ChatScreen.tsx b/tools/server/webui/src/components/ChatScreen.tsx index a2e3ee997..b645a494d 100644 --- a/tools/server/webui/src/components/ChatScreen.tsx +++ b/tools/server/webui/src/components/ChatScreen.tsx @@ -1,12 +1,25 @@ -import { useEffect, useMemo, useState } from 'react'; +import { useEffect, useMemo, useRef, useState } from 'react'; import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context'; import ChatMessage from './ChatMessage'; import { CanvasType, Message, PendingMessage } from '../utils/types'; -import { classNames, cleanCurrentUrl, throttle } from '../utils/misc'; +import { classNames, cleanCurrentUrl } from '../utils/misc'; import CanvasPyInterpreter from './CanvasPyInterpreter'; import StorageUtils from '../utils/storage'; import { useVSCodeContext } from '../utils/llama-vscode'; import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts'; +import { + ArrowUpIcon, + StopIcon, + PaperClipIcon, +} from '@heroicons/react/24/solid'; +import { + ChatExtraContextApi, + useChatExtraContext, +} from './useChatExtraContext.tsx'; +import Dropzone from 'react-dropzone'; +import toast from 'react-hot-toast'; +import ChatInputExtraContextItem from './ChatInputExtraContextItem.tsx'; +import { scrollToBottom, useChatScroll } from './useChatScroll.tsx'; /** * A message display is a message node with additional information for rendering. @@ -72,24 +85,6 @@ function getListMessageDisplay( return res; } -const scrollToBottom = throttle( - (requiresNearBottom: boolean, delay: number = 80) => { - const mainScrollElem = document.getElementById('main-scroll'); - if (!mainScrollElem) return; - const spaceToBottom = - mainScrollElem.scrollHeight - - mainScrollElem.scrollTop - - mainScrollElem.clientHeight; - if (!requiresNearBottom || spaceToBottom < 50) { - setTimeout( - () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }), - delay - ); - } - }, - 80 -); - export default function ChatScreen() { const { viewingChat, @@ -102,10 +97,11 @@ export default function ChatScreen() { } = useAppContext(); const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content()); + const extraContext = useChatExtraContext(); + useVSCodeContext(textarea, extraContext); - const { extraContext, clearExtraContext } = useVSCodeContext(textarea); - // TODO: improve this when we have "upload file" feature - const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined; + const msgListRef = useRef(null); + useChatScroll(msgListRef); // keep track of leaf node for rendering const [currNodeId, setCurrNodeId] = useState(-1); @@ -129,13 +125,15 @@ export default function ChatScreen() { if (currLeafNodeId) { setCurrNodeId(currLeafNodeId); } - scrollToBottom(true); + // useChatScroll will handle the auto scroll }; const sendNewMessage = async () => { const lastInpMsg = textarea.value(); - if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) + if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? 
'')) { + toast.error('Please enter a message'); return; + } textarea.setValue(''); scrollToBottom(false); setCurrNodeId(-1); @@ -146,7 +144,7 @@ export default function ChatScreen() { currConvId, lastMsgNodeId, lastInpMsg, - currExtra, + extraContext.items, onChunk )) ) { @@ -154,7 +152,7 @@ export default function ChatScreen() { textarea.setValue(lastInpMsg); } // OK - clearExtraContext(); + extraContext.clearItems(); }; // for vscode context @@ -234,10 +232,17 @@ export default function ChatScreen() { })} > {/* chat messages */} -
-
+
+
{/* placeholder to shift the message to the bottom */} - {viewingChat ? '' : 'Send a message to start'} + {viewingChat ? ( + '' + ) : ( + <> +
Send a message to start
+ + + )}
{[...messages, ...pendingMsgDisplay].map((msg) => ( ))}
{/* chat input */} -
- - - {isGenerating(currConvId ?? '') ? ( - - ) : ( - - )} -
+ stopGenerating(currConvId ?? '')} + isGenerating={isGenerating(currConvId ?? '')} + />
{canvasData?.type === CanvasType.PY_INTERPRETER && ( @@ -297,3 +275,129 @@ export default function ChatScreen() {
); } + +function ServerInfo() { + const { serverProps } = useAppContext(); + return ( +
+
+ Server Info +

+ Model: {serverProps?.model_path?.split(/(\\|\/)/).pop()} +
+ Build: {serverProps?.build_info} +
+

+
+
+ ); +} + +function ChatInput({ + textarea, + extraContext, + onSend, + onStop, + isGenerating, +}: { + textarea: ChatTextareaApi; + extraContext: ChatExtraContextApi; + onSend: () => void; + onStop: () => void; + isGenerating: boolean; +}) { + const [isDrag, setIsDrag] = useState(false); + + return ( +
+ { + setIsDrag(false); + extraContext.onFileAdded(files); + }} + onDragEnter={() => setIsDrag(true)} + onDragLeave={() => setIsDrag(false)} + multiple={true} + > + {({ getRootProps, getInputProps }) => ( +
+ {!isGenerating && ( + + )} + +
+ + + {/* buttons area */} +
+ + + {isGenerating ? ( + + ) : ( + + )} +
+
+
+ )} +
+
+ ); +} diff --git a/tools/server/webui/src/components/Header.tsx b/tools/server/webui/src/components/Header.tsx index 4c6b291e6..45775ff7a 100644 --- a/tools/server/webui/src/components/Header.tsx +++ b/tools/server/webui/src/components/Header.tsx @@ -4,10 +4,13 @@ import { useAppContext } from '../utils/app.context'; import { classNames } from '../utils/misc'; import daisyuiThemes from 'daisyui/theme/object'; import { THEMES } from '../Config'; -import { useNavigate } from 'react-router'; +import { + Cog8ToothIcon, + MoonIcon, + Bars3Icon, +} from '@heroicons/react/24/outline'; export default function Header() { - const navigate = useNavigate(); const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme()); const { setShowSettings } = useAppContext(); @@ -24,105 +27,21 @@ export default function Header() { ); }, [selectedTheme]); - const { isGenerating, viewingChat } = useAppContext(); - const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? ''); - - const removeConversation = () => { - if (isCurrConvGenerating || !viewingChat) return; - const convId = viewingChat?.conv.id; - if (window.confirm('Are you sure to delete this conversation?')) { - StorageUtils.remove(convId); - navigate('/'); - } - }; - - const downloadConversation = () => { - if (isCurrConvGenerating || !viewingChat) return; - const convId = viewingChat?.conv.id; - const conversationJson = JSON.stringify(viewingChat, null, 2); - const blob = new Blob([conversationJson], { type: 'application/json' }); - const url = URL.createObjectURL(blob); - const a = document.createElement('a'); - a.href = url; - a.download = `conversation_${convId}.json`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - }; - return (
{/* open sidebar button */}
llama.cpp
{/* action buttons (top right) */}
- {viewingChat && ( -
- {/* "..." button */} - - {/* dropdown menu */} - -
- )} -
@@ -130,16 +49,7 @@ export default function Header() {
- - - +
    - + {canRunCode && ( )} @@ -101,16 +106,17 @@ export const CopyButton = ({ }) => { const [copied, setCopied] = useState(false); return ( - + + ); }; @@ -124,7 +130,7 @@ export const RunPyCodeButton = ({ const { setCanvasData } = useAppContext(); return ( <> - + + ); }; diff --git a/tools/server/webui/src/components/Sidebar.tsx b/tools/server/webui/src/components/Sidebar.tsx index 34727c623..1a6c8a327 100644 --- a/tools/server/webui/src/components/Sidebar.tsx +++ b/tools/server/webui/src/components/Sidebar.tsx @@ -1,13 +1,25 @@ -import { useEffect, useState } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { classNames } from '../utils/misc'; import { Conversation } from '../utils/types'; import StorageUtils from '../utils/storage'; import { useNavigate, useParams } from 'react-router'; +import { + ArrowDownTrayIcon, + EllipsisVerticalIcon, + PencilIcon, + TrashIcon, + XMarkIcon, +} from '@heroicons/react/24/outline'; +import { BtnWithTooltips } from '../utils/common'; +import { useAppContext } from '../utils/app.context'; +import toast from 'react-hot-toast'; export default function Sidebar() { const params = useParams(); const navigate = useNavigate(); + const { isGenerating } = useAppContext(); + const [conversations, setConversations] = useState([]); const [currConv, setCurrConv] = useState(null); @@ -26,6 +38,11 @@ export default function Sidebar() { }; }, []); + const groupedConv = useMemo( + () => groupConversationsByDate(conversations), + [conversations] + ); + return ( <> - - - +
- {/* list of conversations */} + {/* new conversation button */}
navigate('/')} > + New conversation
- {conversations.map((conv) => ( -
navigate(`/chat/${conv.id}`)} - dir="auto" - > - {conv.name} + + {/* list of conversations */} + {groupedConv.map((group) => ( +
+ {/* group name (by date) */} + {group.title ? ( + {group.title} + ) : ( +
+ )} + + {group.conversations.map((conv) => ( + { + navigate(`/chat/${conv.id}`); + }} + onDelete={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot delete conversation while generating' + ); + return; + } + if ( + window.confirm( + 'Are you sure you want to delete this conversation?' + ) + ) { + toast.success('Conversation deleted'); + StorageUtils.remove(conv.id); + navigate('/'); + } + }} + onDownload={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot download conversation while generating' + ); + return; + } + const conversationJson = JSON.stringify(conv, null, 2); + const blob = new Blob([conversationJson], { + type: 'application/json', + }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `conversation_${conv.id}.json`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + }} + onRename={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot rename conversation while generating' + ); + return; + } + const newName = window.prompt( + 'Enter new name for the conversation', + conv.name + ); + if (newName && newName.trim().length > 0) { + StorageUtils.updateConversationName(conv.id, newName); + } + }} + /> + ))}
))} -
+
Conversations are saved to browser's IndexedDB
@@ -94,3 +161,170 @@ export default function Sidebar() { ); } + +function ConversationItem({ + conv, + isCurrConv, + onSelect, + onDelete, + onDownload, + onRename, +}: { + conv: Conversation; + isCurrConv: boolean; + onSelect: () => void; + onDelete: () => void; + onDownload: () => void; + onRename: () => void; +}) { + return ( +
+
+ {conv.name} +
+
+ + {/* dropdown menu */} + +
+
+ ); +} + +// WARN: vibe code below + +export interface GroupedConversations { + title?: string; + conversations: Conversation[]; +} + +// TODO @ngxson : add test for this function +// Group conversations by date +// - "Previous 7 Days" +// - "Previous 30 Days" +// - "Month Year" (e.g., "April 2023") +export function groupConversationsByDate( + conversations: Conversation[] +): GroupedConversations[] { + const now = new Date(); + const today = new Date(now.getFullYear(), now.getMonth(), now.getDate()); // Start of today + + const sevenDaysAgo = new Date(today); + sevenDaysAgo.setDate(today.getDate() - 7); + + const thirtyDaysAgo = new Date(today); + thirtyDaysAgo.setDate(today.getDate() - 30); + + const groups: { [key: string]: Conversation[] } = { + Today: [], + 'Previous 7 Days': [], + 'Previous 30 Days': [], + }; + const monthlyGroups: { [key: string]: Conversation[] } = {}; // Key format: "Month Year" e.g., "April 2023" + + // Sort conversations by lastModified date in descending order (newest first) + // This helps when adding to groups, but the final output order of groups is fixed. + const sortedConversations = [...conversations].sort( + (a, b) => b.lastModified - a.lastModified + ); + + for (const conv of sortedConversations) { + const convDate = new Date(conv.lastModified); + + if (convDate >= today) { + groups['Today'].push(conv); + } else if (convDate >= sevenDaysAgo) { + groups['Previous 7 Days'].push(conv); + } else if (convDate >= thirtyDaysAgo) { + groups['Previous 30 Days'].push(conv); + } else { + const monthName = convDate.toLocaleString('default', { month: 'long' }); + const year = convDate.getFullYear(); + const monthYearKey = `${monthName} ${year}`; + if (!monthlyGroups[monthYearKey]) { + monthlyGroups[monthYearKey] = []; + } + monthlyGroups[monthYearKey].push(conv); + } + } + + const result: GroupedConversations[] = []; + + if (groups['Today'].length > 0) { + result.push({ + title: undefined, // no title for Today + conversations: groups['Today'], + }); + } + + if (groups['Previous 7 Days'].length > 0) { + result.push({ + title: 'Previous 7 Days', + conversations: groups['Previous 7 Days'], + }); + } + + if (groups['Previous 30 Days'].length > 0) { + result.push({ + title: 'Previous 30 Days', + conversations: groups['Previous 30 Days'], + }); + } + + // Sort monthly groups by date (most recent month first) + const sortedMonthKeys = Object.keys(monthlyGroups).sort((a, b) => { + const dateA = new Date(a); // "Month Year" can be parsed by Date constructor + const dateB = new Date(b); + return dateB.getTime() - dateA.getTime(); + }); + + for (const monthKey of sortedMonthKeys) { + if (monthlyGroups[monthKey].length > 0) { + result.push({ title: monthKey, conversations: monthlyGroups[monthKey] }); + } + } + + return result; +} diff --git a/tools/server/webui/src/components/useChatExtraContext.tsx b/tools/server/webui/src/components/useChatExtraContext.tsx new file mode 100644 index 000000000..866401db9 --- /dev/null +++ b/tools/server/webui/src/components/useChatExtraContext.tsx @@ -0,0 +1,174 @@ +import { useState } from 'react'; +import { MessageExtra } from '../utils/types'; +import toast from 'react-hot-toast'; +import { useAppContext } from '../utils/app.context'; + +// Interface describing the API returned by the hook +export interface ChatExtraContextApi { + items?: MessageExtra[]; // undefined if empty, similar to Message['extra'] + addItems: (items: MessageExtra[]) => void; + removeItem: (idx: number) => void; + clearItems: () => void; + onFileAdded: (files: 
File[]) => void; // used by "upload" button +} + +export function useChatExtraContext(): ChatExtraContextApi { + const { serverProps } = useAppContext(); + const [items, setItems] = useState([]); + + const addItems = (newItems: MessageExtra[]) => { + setItems((prev) => [...prev, ...newItems]); + }; + + const removeItem = (idx: number) => { + setItems((prev) => prev.filter((_, i) => i !== idx)); + }; + + const clearItems = () => { + setItems([]); + }; + + const onFileAdded = (files: File[]) => { + for (const file of files) { + const mimeType = file.type; + console.debug({ mimeType, file }); + if (file.size > 10 * 1024 * 1024) { + toast.error('File is too large. Maximum size is 10MB.'); + break; + } + + if (mimeType.startsWith('image/') && mimeType !== 'image/svg+xml') { + if (!serverProps?.has_multimodal) { + toast.error('Multimodal is not supported by this server or model.'); + break; + } + const reader = new FileReader(); + reader.onload = (event) => { + if (event.target?.result) { + addItems([ + { + type: 'imageFile', + name: file.name, + base64Url: event.target.result as string, + }, + ]); + } + }; + reader.readAsDataURL(file); + } else if ( + mimeType.startsWith('video/') || + mimeType.startsWith('audio/') + ) { + toast.error('Video and audio files are not supported yet.'); + break; + } else if (mimeType.startsWith('application/pdf')) { + toast.error('PDF files are not supported yet.'); + break; + } else { + // Because there can be many text file types (like code file), we will not check the mime type + // and will just check if the file is not binary. + const reader = new FileReader(); + reader.onload = (event) => { + if (event.target?.result) { + const content = event.target.result as string; + if (!isLikelyNotBinary(content)) { + toast.error('File is binary. Please upload a text file.'); + return; + } + addItems([ + { + type: 'textFile', + name: file.name, + content, + }, + ]); + } + }; + reader.readAsText(file); + } + } + }; + + return { + items: items.length > 0 ? items : undefined, + addItems, + removeItem, + clearItems, + onFileAdded, + }; +} + +// WARN: vibe code below +// This code is a heuristic to determine if a string is likely not binary. +// It is necessary because input file can have various mime types which we don't have time to investigate. +// For example, a python file can be text/plain, application/x-python, etc. +export function isLikelyNotBinary(str: string): boolean { + const options = { + prefixLength: 1024 * 10, // Check the first 10KB of the string + suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars + maxAbsoluteNullBytes: 2, + }; + + if (!str) { + return true; // Empty string is considered "not binary" or trivially text. + } + + const sampleLength = Math.min(str.length, options.prefixLength); + if (sampleLength === 0) { + return true; // Effectively an empty string after considering prefixLength. + } + + let suspiciousCharCount = 0; + let nullByteCount = 0; + + for (let i = 0; i < sampleLength; i++) { + const charCode = str.charCodeAt(i); + + // 1. Check for Unicode Replacement Character (U+FFFD) + // This is a strong indicator if the string was created from decoding bytes as UTF-8. + if (charCode === 0xfffd) { + suspiciousCharCount++; + continue; + } + + // 2. Check for Null Bytes (U+0000) + if (charCode === 0x0000) { + nullByteCount++; + // We also count nulls towards the general suspicious character count, + // as they are less common in typical text files. + suspiciousCharCount++; + continue; + } + + // 3. 
Check for C0 Control Characters (U+0001 to U+001F) + // Exclude common text control characters: TAB (9), LF (10), CR (13). + // We can also be a bit lenient with BEL (7) and BS (8) which sometimes appear in logs. + if (charCode < 32) { + if ( + charCode !== 9 && // TAB + charCode !== 10 && // LF + charCode !== 13 && // CR + charCode !== 7 && // BEL (Bell) - sometimes in logs + charCode !== 8 // BS (Backspace) - less common, but possible + ) { + suspiciousCharCount++; + } + } + // Characters from 32 (space) up to 126 (~) are printable ASCII. + // Characters 127 (DEL) is a control character. + // Characters >= 128 are extended ASCII / multi-byte Unicode. + // If they resulted in U+FFFD, we caught it. Otherwise, they are valid + // (though perhaps unusual) Unicode characters from JS's perspective. + // The main concern is if those higher characters came from misinterpreting + // a single-byte encoding as UTF-8, which again, U+FFFD would usually flag. + } + + // Check absolute null byte count + if (nullByteCount > options.maxAbsoluteNullBytes) { + return false; // Too many null bytes is a strong binary indicator + } + + // Check ratio of suspicious characters + const ratio = suspiciousCharCount / sampleLength; + return ratio <= options.suspiciousCharThresholdRatio; +} diff --git a/tools/server/webui/src/components/useChatScroll.tsx b/tools/server/webui/src/components/useChatScroll.tsx new file mode 100644 index 000000000..25ea02234 --- /dev/null +++ b/tools/server/webui/src/components/useChatScroll.tsx @@ -0,0 +1,34 @@ +import React, { useEffect } from 'react'; +import { throttle } from '../utils/misc'; + +export const scrollToBottom = (requiresNearBottom: boolean, delay?: number) => { + const mainScrollElem = document.getElementById('main-scroll'); + if (!mainScrollElem) return; + const spaceToBottom = + mainScrollElem.scrollHeight - + mainScrollElem.scrollTop - + mainScrollElem.clientHeight; + if (!requiresNearBottom || spaceToBottom < 100) { + setTimeout( + () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }), + delay ?? 80 + ); + } +}; + +const scrollToBottomThrottled = throttle(scrollToBottom, 80); + +export function useChatScroll(msgListRef: React.RefObject) { + useEffect(() => { + if (!msgListRef.current) return; + + const resizeObserver = new ResizeObserver((_) => { + scrollToBottomThrottled(true, 10); + }); + + resizeObserver.observe(msgListRef.current); + return () => { + resizeObserver.disconnect(); + }; + }, [msgListRef]); +} diff --git a/tools/server/webui/src/components/useChatTextarea.ts b/tools/server/webui/src/components/useChatTextarea.ts index a3223f4fd..c2f865203 100644 --- a/tools/server/webui/src/components/useChatTextarea.ts +++ b/tools/server/webui/src/components/useChatTextarea.ts @@ -1,35 +1,39 @@ import { useEffect, useRef, useState, useCallback } from 'react'; +import { throttle } from '../utils/misc'; // Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint) const LARGE_SCREEN_MQ = '(min-width: 1024px)'; // Calculates and sets the textarea height based on its scrollHeight -const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => { - if (!textarea) return; +const adjustTextareaHeight = throttle( + (textarea: HTMLTextAreaElement | null) => { + if (!textarea) return; - // Only perform auto-sizing on large screens - if (!window.matchMedia(LARGE_SCREEN_MQ).matches) { - // On small screens, reset inline height and max-height styles. 
- // This allows CSS (e.g., `rows` attribute or classes) to control the height, - // and enables manual resizing if `resize-vertical` is set. - textarea.style.height = ''; // Use 'auto' or '' to reset - textarea.style.maxHeight = ''; - return; // Do not adjust height programmatically on small screens - } + // Only perform auto-sizing on large screens + if (!window.matchMedia(LARGE_SCREEN_MQ).matches) { + // On small screens, reset inline height and max-height styles. + // This allows CSS (e.g., `rows` attribute or classes) to control the height, + // and enables manual resizing if `resize-vertical` is set. + textarea.style.height = ''; // Use 'auto' or '' to reset + textarea.style.maxHeight = ''; + return; // Do not adjust height programmatically on small screens + } - const computedStyle = window.getComputedStyle(textarea); - // Get the max-height specified by CSS (e.g., from `lg:max-h-48`) - const currentMaxHeight = computedStyle.maxHeight; + const computedStyle = window.getComputedStyle(textarea); + // Get the max-height specified by CSS (e.g., from `lg:max-h-48`) + const currentMaxHeight = computedStyle.maxHeight; - // Temporarily remove max-height to allow scrollHeight to be calculated correctly - textarea.style.maxHeight = 'none'; - // Reset height to 'auto' to measure the actual scrollHeight needed - textarea.style.height = 'auto'; - // Set the height to the calculated scrollHeight - textarea.style.height = `${textarea.scrollHeight}px`; - // Re-apply the original max-height from CSS to enforce the limit - textarea.style.maxHeight = currentMaxHeight; -}; + // Temporarily remove max-height to allow scrollHeight to be calculated correctly + textarea.style.maxHeight = 'none'; + // Reset height to 'auto' to measure the actual scrollHeight needed + textarea.style.height = 'auto'; + // Set the height to the calculated scrollHeight + textarea.style.height = `${textarea.scrollHeight}px`; + // Re-apply the original max-height from CSS to enforce the limit + textarea.style.maxHeight = currentMaxHeight; + }, + 100 +); // Throttle to prevent excessive calls // Interface describing the API returned by the hook export interface ChatTextareaApi { @@ -65,6 +69,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi { } }, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue + // On input change, we adjust the height of the textarea const handleInput = useCallback( (event: React.FormEvent) => { // Call adjustTextareaHeight on every input - it will decide whether to act @@ -94,6 +99,6 @@ export function useChatTextarea(initValue: string): ChatTextareaApi { }, ref: textareaRef, refOnSubmit: onSubmitRef, - onInput: handleInput, + onInput: handleInput, // for adjusting height on input }; } diff --git a/tools/server/webui/src/index.scss b/tools/server/webui/src/index.scss index a18f09454..563e7a461 100644 --- a/tools/server/webui/src/index.scss +++ b/tools/server/webui/src/index.scss @@ -22,12 +22,15 @@ html { all: revert; } pre { - @apply whitespace-pre-wrap rounded-lg p-2; + @apply whitespace-pre-wrap rounded-lg p-2 mb-3; border: 1px solid currentColor; } p { @apply mb-2; } + hr { + @apply my-4 border-base-content/20 border-1; + } /* TODO: fix markdown table */ } @@ -35,7 +38,7 @@ html { @apply md:opacity-0 md:group-hover:opacity-100; } .btn-mini { - @apply cursor-pointer hover:shadow-md; + @apply cursor-pointer; } .chat-screen { max-width: 900px; diff --git a/tools/server/webui/src/utils/app.context.tsx b/tools/server/webui/src/utils/app.context.tsx index 
54bb65b6e..96cffd95a 100644 --- a/tools/server/webui/src/utils/app.context.tsx +++ b/tools/server/webui/src/utils/app.context.tsx @@ -3,6 +3,7 @@ import { APIMessage, CanvasData, Conversation, + LlamaCppServerProps, Message, PendingMessage, ViewingChat, @@ -12,9 +13,11 @@ import { filterThoughtFromMsgs, normalizeMsgsForAPI, getSSEStreamAsync, + getServerProps, } from './misc'; import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config'; import { matchPath, useLocation, useNavigate } from 'react-router'; +import toast from 'react-hot-toast'; interface AppContextValue { // conversations and messages @@ -46,6 +49,9 @@ interface AppContextValue { saveConfig: (config: typeof CONFIG_DEFAULT) => void; showSettings: boolean; setShowSettings: (show: boolean) => void; + + // props + serverProps: LlamaCppServerProps | null; } // this callback is used for scrolling to the bottom of the chat and switching to the last node @@ -74,6 +80,9 @@ export const AppContextProvider = ({ const params = matchPath('/chat/:convId', pathname); const convId = params?.params?.convId; + const [serverProps, setServerProps] = useState( + null + ); const [viewingChat, setViewingChat] = useState(null); const [pendingMessages, setPendingMessages] = useState< Record @@ -85,6 +94,20 @@ export const AppContextProvider = ({ const [canvasData, setCanvasData] = useState(null); const [showSettings, setShowSettings] = useState(false); + // get server props + useEffect(() => { + getServerProps(BASE_URL, config.apiKey) + .then((props) => { + console.debug('Server props:', props); + setServerProps(props); + }) + .catch((err) => { + console.error(err); + toast.error('Failed to fetch server props'); + }); + // eslint-disable-next-line + }, []); + // handle change when the convId from URL is changed useEffect(() => { // also reset the canvas data @@ -260,7 +283,7 @@ export const AppContextProvider = ({ } else { console.error(err); // eslint-disable-next-line @typescript-eslint/no-explicit-any - alert((err as any)?.message ?? 'Unknown error'); + toast.error((err as any)?.message ?? 'Unknown error'); throw err; // rethrow } } @@ -377,6 +400,7 @@ export const AppContextProvider = ({ saveConfig, showSettings, setShowSettings, + serverProps, }} > {children} diff --git a/tools/server/webui/src/utils/common.tsx b/tools/server/webui/src/utils/common.tsx index 09b08b5c9..372f464a2 100644 --- a/tools/server/webui/src/utils/common.tsx +++ b/tools/server/webui/src/utils/common.tsx @@ -36,3 +36,32 @@ export const OpenInNewTab = ({ {children} ); + +export function BtnWithTooltips({ + className, + onClick, + onMouseLeave, + children, + tooltipsContent, + disabled, +}: { + className?: string; + onClick: () => void; + onMouseLeave?: () => void; + children: React.ReactNode; + tooltipsContent: string; + disabled?: boolean; +}) { + return ( +
+ +
+ ); +} diff --git a/tools/server/webui/src/utils/llama-vscode.ts b/tools/server/webui/src/utils/llama-vscode.ts index 55ebdcffc..0ad8f8042 100644 --- a/tools/server/webui/src/utils/llama-vscode.ts +++ b/tools/server/webui/src/utils/llama-vscode.ts @@ -1,6 +1,6 @@ -import { useEffect, useState } from 'react'; -import { MessageExtraContext } from './types'; +import { useEffect } from 'react'; import { ChatTextareaApi } from '../components/useChatTextarea.ts'; +import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx'; // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe // Ref: https://github.com/ggml-org/llama.cpp/pull/11940 @@ -15,11 +15,10 @@ interface SetTextEvData { * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*'); */ -export const useVSCodeContext = (textarea: ChatTextareaApi) => { - const [extraContext, setExtraContext] = useState<MessageExtraContext | null>( - null - ); - +export const useVSCodeContext = ( + textarea: ChatTextareaApi, + extraContext: ChatExtraContextApi +) => { // Accept setText message from a parent window and set inputMsg and extraContext useEffect(() => { const handleMessage = (event: MessageEvent) => { @@ -27,10 +26,14 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { const data: SetTextEvData = event.data; textarea.setValue(data?.text); if (data?.context && data.context.length > 0) { - setExtraContext({ - type: 'context', - content: data.context, - }); + extraContext.clearItems(); + extraContext.addItems([ + { + type: 'context', + name: 'Extra context', + content: data.context, + }, + ]); } textarea.focus(); setTimeout(() => { @@ -41,7 +44,7 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { window.addEventListener('message', handleMessage); return () => window.removeEventListener('message', handleMessage); - }, [textarea]); + }, [textarea, extraContext]); // Add a keydown listener that sends the "escapePressed" message to the parent window useEffect(() => { @@ -55,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { return () => window.removeEventListener('keydown', handleKeyDown); }, []); - return { - extraContext, - // call once the user message is sent, to clear the extra context - clearExtraContext: () => setExtraContext(null), - }; + return {}; }; diff --git a/tools/server/webui/src/utils/misc.ts index 87f55b2af..ba760e83b 100644 --- a/tools/server/webui/src/utils/misc.ts +++ b/tools/server/webui/src/utils/misc.ts @@ -1,6 +1,11 @@ // @ts-expect-error this package does not have typing import TextLineStream from 'textlinestream'; -import { APIMessage, Message } from './types'; +import { + APIMessage, + APIMessageContentPart, + LlamaCppServerProps, + Message, +} from './types'; // ponyfill for missing ReadableStream asyncIterator on Safari import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator'; @@ -57,19 +62,47 @@ export const copyStr = (textToCopy: string) => { */ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) { return messages.map((msg) => { - let newContent = ''; + if (msg.role !== 'user' || !msg.extra) { + return { + role: msg.role, + content: msg.content, + } as APIMessage; + } + + // extra content first, then user text message in the end + // this allows re-using the same cache prefix for long context + const contentArr: APIMessageContentPart[] = []; for (const extra of msg.extra ?? 
[]) { if (extra.type === 'context') { - newContent += `${extra.content}\n\n`; + contentArr.push({ + type: 'text', + text: extra.content, + }); + } else if (extra.type === 'textFile') { + contentArr.push({ + type: 'text', + text: `File: ${extra.name}\nContent:\n\n${extra.content}`, + }); + } else if (extra.type === 'imageFile') { + contentArr.push({ + type: 'image_url', + image_url: { url: extra.base64Url }, + }); + } else { + throw new Error('Unknown extra type'); + } } - newContent += msg.content; + // add user message to the end + contentArr.push({ + type: 'text', + text: msg.content, + }); return { role: msg.role, - content: newContent, + content: contentArr, }; }) as APIMessage[]; } @@ -78,13 +111,19 @@ * recommended for DeepSeek-R1, filter out content between <think> and </think> tags */ export function filterThoughtFromMsgs(messages: APIMessage[]) { + console.debug({ messages }); return messages.map((msg) => { + if (msg.role !== 'assistant') { + return msg; + } + // assistant message is always a string + const contentStr = msg.content as string; return { role: msg.role, content: msg.role === 'assistant' - ? msg.content.split('</think>').at(-1)!.trim() - : msg.content, + ? contentStr.split('</think>').at(-1)!.trim() + : contentStr, } as APIMessage; }); } @@ -126,3 +165,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => { }); window.history.replaceState({}, '', url.toString()); }; + +export const getServerProps = async ( + baseUrl: string, + apiKey?: string +): Promise<LlamaCppServerProps> => { + try { + const response = await fetch(`${baseUrl}/props`, { + headers: { + 'Content-Type': 'application/json', + ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}), + }, + }); + if (!response.ok) { + throw new Error('Failed to fetch server props'); + } + const data = await response.json(); + return data as LlamaCppServerProps; + } catch (error) { + console.error('Error fetching server props:', error); + throw error; + } +}; diff --git a/tools/server/webui/src/utils/storage.ts index 1dfc9d979..505693e92 100644 --- a/tools/server/webui/src/utils/storage.ts +++ b/tools/server/webui/src/utils/storage.ts @@ -116,6 +116,16 @@ const StorageUtils = { }); return conv; }, + /** + * update the name of a conversation + */ + async updateConversationName(convId: string, name: string): Promise<void> { + await db.conversations.update(convId, { + name, + lastModified: Date.now(), + }); + dispatchConversationChange(convId); + }, /** * if convId does not exist, throw an error */ diff --git a/tools/server/webui/src/utils/types.ts index 0eb774001..add48be4c 100644 --- a/tools/server/webui/src/utils/types.ts +++ b/tools/server/webui/src/utils/types.ts @@ -48,7 +48,10 @@ export interface Message { children: Message['id'][]; } -type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future +export type MessageExtra = + | MessageExtraTextFile + | MessageExtraImageFile + | MessageExtraContext; export interface MessageExtraTextFile { type: 'textFile'; @@ -56,12 +59,32 @@ export interface MessageExtraTextFile { content: string; } +export interface MessageExtraImageFile { + type: 'imageFile'; + name: string; + base64Url: string; +} + export interface MessageExtraContext { type: 'context'; + name: string; content: string; } -export type APIMessage = Pick<Message, 'role' | 'content'>; +export type APIMessageContentPart = + | { + type: 'text'; + text: string; + } + | { + type: 'image_url'; + image_url: { url: 
string }; + }; + +export type APIMessage = { + role: Message['role']; + content: string | APIMessageContentPart[]; +}; export interface Conversation { id: string; // format: `conv-{timestamp}` @@ -89,3 +112,12 @@ } export type CanvasData = CanvasPyInterpreter; + +// an incomplete list of props, containing only the ones we need +export interface LlamaCppServerProps { + build_info: string; + model_path: string; + n_ctx: number; + has_multimodal: boolean; + // TODO: support params +} diff --git a/tools/server/webui/vite.config.ts index b8a0f03d9..366df3b75 100644 --- a/tools/server/webui/vite.config.ts +++ b/tools/server/webui/vite.config.ts @@ -71,6 +71,7 @@ export default defineConfig({ server: { proxy: { '/v1': 'http://localhost:8080', + '/props': 'http://localhost:8080', }, headers: { 'Cross-Origin-Embedder-Policy': 'require-corp',