diff --git a/.github/actions/get-tag-name/action.yml b/.github/actions/get-tag-name/action.yml new file mode 100644 index 000000000..7ace23b2a --- /dev/null +++ b/.github/actions/get-tag-name/action.yml @@ -0,0 +1,22 @@ +name: "Determine tag name" +description: "Determine the tag name to use for a release" +outputs: + name: + description: "The name of the tag" + value: ${{ steps.tag.outputs.name }} + +runs: + using: "composite" + steps: + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml new file mode 100644 index 000000000..5575caeca --- /dev/null +++ b/.github/actions/windows-setup-cuda/action.yml @@ -0,0 +1,67 @@ +name: "Windows - Setup CUDA Toolkit" +description: "Setup CUDA Toolkit for Windows" +inputs: + cuda_version: + description: "CUDA toolkit version" + required: true + +runs: + using: "composite" + steps: + - name: Install Cuda Toolkit 11.7 + if: ${{ inputs.cuda_version == '11.7' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install Cuda Toolkit 12.4 + if: ${{ inputs.cuda_version == '12.4' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..38c8ea439 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,710 @@ +name: Create Release + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + branches: + - master + paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +# Fine-grant permission +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token +permissions: + contents: write # for creating release + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" + +jobs: + macOS-arm64: + runs-on: macos-14 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-arm64 + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_RPC=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ 
steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: llama-bin-macos-arm64.zip + + macOS-x64: + runs-on: macos-13 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-x64 + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Build + id: cmake_build + run: | + sysctl -a + # Metal is disabled due to intermittent failures with Github runners not having a GPU: + # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_METAL=OFF \ + -DGGML_RPC=ON + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: llama-bin-macos-x64.zip + + ubuntu-22-cpu: + strategy: + matrix: + include: + - build: 'x64' + os: ubuntu-22.04 + - build: 'arm64' + os: ubuntu-22.04-arm + + runs-on: ${{ matrix.os }} + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-cpu-cmake + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libcurl4-openssl-dev + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip + name: llama-bin-ubuntu-${{ matrix.build }}.zip + + ubuntu-22-vulkan: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-22-cmake-vulkan + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - + sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DGGML_VULKAN=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ 
steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip + name: llama-bin-ubuntu-vulkan-x64.zip + + windows: + runs-on: windows-latest + + env: + OPENBLAS_VERSION: 0.3.23 + VULKAN_VERSION: 1.4.309.0 + + strategy: + matrix: + include: + - build: 'cpu-x64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' + #- build: 'openblas-x64' + # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + - build: 'vulkan-x64' + defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' + - build: 'cpu-arm64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF' + - build: 'opencl-adreno-arm64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-${{ matrix.build }} + variant: ccache + evict-old-files: 1d + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'openblas-x64' }} + run: | + curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" + curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" + mkdir $env:RUNNER_TEMP/openblas + tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas + $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) + $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) + $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') + & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'vulkan-x64' }} + run: | + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install + Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" + Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'opencl-adreno-arm64' }} + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + cmake -B build ` + -DBUILD_TESTING=OFF ` + 
-DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + cmake -B build-arm64-release ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build-arm64-release --target install --config release + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cmake -S . -B build ${{ matrix.defines }} ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + + - name: Add libopenblas.dll + id: add_libopenblas_dll + if: ${{ matrix.build == 'openblas-x64' }} + run: | + cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll + cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip + name: llama-bin-win-${{ matrix.build }}.zip + + windows-cuda: + runs-on: windows-2019 + + strategy: + matrix: + cuda: ['12.4', '11.7'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-cuda-${{ matrix.cuda }} + variant: ccache + evict-old-files: 1d + + - name: Install Cuda Toolkit + uses: ./.github/actions/windows-setup-cuda + with: + cuda_version: ${{ matrix.cuda }} + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + shell: cmd + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -S . 
-B build -G "Ninja Multi-Config" ^ + -DGGML_NATIVE=OFF ^ + -DGGML_BACKEND_DL=ON ^ + -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_CUDA=ON ^ + -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ + ${{ env.CMAKE_ARGS }} + set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 + cmake --build build --config Release -j %NINJA_JOBS% -t ggml + cmake --build build --config Release + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip + + - name: Copy and pack Cuda runtime + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + run: | + echo "Cuda install location: ${{ env.CUDA_PATH }}" + $dst='.\build\bin\cudart\' + robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* + + - name: Upload Cuda runtime + uses: actions/upload-artifact@v4 + with: + path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + + windows-sycl: + runs-on: windows-latest + + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-sycl + variant: ccache + evict-old-files: 1d + + - name: Install + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Build the release package + id: pack_artifacts + run: | + echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" 
./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + + echo "cp oneAPI running time dll files to ./build/bin done" + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + + - name: Upload the release package + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: llama-bin-win-sycl-x64.zip + + windows-hip: + runs-on: windows-latest + + strategy: + matrix: + gpu_target: [gfx1100, gfx1101, gfx1030] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-hip-release + evict-old-files: 1d + + - name: Install + id: depends + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + write-host "Installing AMD HIP SDK" + Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + id: verify + run: | + & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + + - name: Build + id: cmake_build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake -G "Unix Makefiles" -B build -S . 
` + -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` + -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` + -DCMAKE_BUILD_TYPE=Release ` + -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` + -DGGML_HIP=ON ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + md "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + + ios-xcode-build: + runs-on: macos-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + - name: xcodebuild for swift package + id: xcodebuild + run: | + ./build-xcframework.sh + + - name: Build Xcode project + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-xcframework.zip + name: llama-${{ steps.tag.outputs.name }}-xcframework + + release: + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + + runs-on: ubuntu-latest + + needs: + - ubuntu-22-cpu + - ubuntu-22-vulkan + - windows + - windows-cuda + - windows-sycl + - windows-hip + - macOS-arm64 + - macOS-x64 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Download artifacts + id: download-artifact + uses: actions/download-artifact@v4 + with: + path: ./artifact + + - name: Move artifacts + id: move_artifacts + run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release + + - name: Create release + id: create_release + uses: ggml-org/action-create-release@v1 + env: + 
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.tag.outputs.name }} + + - name: Upload release + id: upload_release + uses: actions/github-script@v3 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const path = require('path'); + const fs = require('fs'); + const release_id = '${{ steps.create_release.outputs.id }}'; + for (let file of await fs.readdirSync('./artifact/release')) { + if (path.extname(file) === '.zip') { + console.log('uploadReleaseAsset', file); + await github.repos.uploadReleaseAsset({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: release_id, + name: file, + data: await fs.readFileSync(`./artifact/release/${file}`) + }); + } + } diff --git a/common/arg.cpp b/common/arg.cpp index bbb70ccbe..d41c66611 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2098,13 +2098,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); - add_opt(common_arg( - {"--perplexity", "--all-logits"}, - string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](common_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", diff --git a/common/chat.cpp b/common/chat.cpp index bbc5f087c..ad3d4aa99 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -125,7 +125,9 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa msgs.push_back(msg); } } catch (const std::exception & e) { - throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2)); + // @ngxson : disable otherwise it's bloating the API response + // printf("%s\n", std::string("; messages = ") + messages.dump(2)); + throw std::runtime_error("Failed to parse messages: " + std::string(e.what())); } return msgs; diff --git a/common/common.cpp b/common/common.cpp index ea71ec104..21ad0a8eb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1103,7 +1103,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_threads = params.cpuparams.n_threads; cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
params.cpuparams.n_threads : params.cpuparams_batch.n_threads; - cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; diff --git a/common/common.h b/common/common.h index ff6225389..dfaaa6026 100644 --- a/common/common.h +++ b/common/common.h @@ -320,7 +320,6 @@ struct common_params { bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6aaf8834..bf6bc6838 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1388,10 +1388,10 @@ class BaichuanModel(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] @@ -1512,10 +1512,10 @@ class XverseModel(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1828,10 +1828,10 @@ class LlamaModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + 
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2206,10 +2206,10 @@ class DeciModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2449,10 +2449,10 @@ class MiniCPMModel(TextModel): logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - if self.hparams.get("rope_scaling") is not None: - if self.hparams["rope_scaling"].get("type") == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -2597,11 +2597,11 @@ class Qwen2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self.hf_arch == "Qwen2Model": @@ -2763,11 +2763,11 @@ class Qwen2MoeModel(TextModel): logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") # YaRN is not enabled by default # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - 
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None @@ -3035,7 +3035,7 @@ class Phi3MiniModel(TextModel): scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get('type', '').lower() + rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') @@ -3347,10 +3347,10 @@ class InternLM2Model(TextModel): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] @@ -3425,10 +3425,10 @@ class InternLM3Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] @@ -4866,12 +4866,12 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - 
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) _experts: list[dict[str, Tensor]] | None = None @@ -5363,11 +5363,11 @@ class Glm4Model(TextModel): super().set_gguf_parameters() rope_dim = self.hparams["head_dim"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") @@ -5600,10 +5600,10 @@ class ExaoneModel(TextModel): rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: - if hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): @@ -5706,10 +5706,11 @@ class BailingMoeModel(TextModel): rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]: + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + 
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) else: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index a84a7ffde..594439ea5 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -559,7 +559,6 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll draft_model_params.use_mlock = base_model_params.use_mlock; draft_model_params.n_gpu_layers = draft_gpulayers; //layers offload the speculative model. draft_ctx_params.n_ctx = base_ctx_params.n_ctx; - draft_ctx_params.logits_all = false; draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv; draft_model_params.main_gpu = base_model_params.main_gpu; draft_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; @@ -2147,7 +2146,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } llama_ctx_params.offload_kqv = !inputs.low_vram; - llama_ctx_params.logits_all = false; model_params.use_mmap = inputs.use_mmap; model_params.use_mlock = inputs.use_mlock; model_params.n_gpu_layers = inputs.gpulayers; diff --git a/include/llama.h b/include/llama.h index 0d35441c6..eb881b035 100644 --- a/include/llama.h +++ b/include/llama.h @@ -353,19 +353,17 @@ extern "C" { enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. - // TODO: move at the end of the struct - bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embeddings; // if true, extract embeddings (together with logits) - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - // Abort callback // if it returns true, execution of llama_decode() will be aborted // currently works only with CPU execution ggml_abort_callback abort_callback; void * abort_callback_data; + + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. + bool embeddings; // if true, extract embeddings (together with logits) + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool no_perf; // whether to measure performance timings }; // model quantization parameters @@ -926,14 +924,19 @@ extern "C" { // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); - // Processes a batch of tokens with the ecoder part of the encoder-decoder model. - // Stores the encoder output internally for later use by the decoder cross-attention layers. + // Process a batch of tokens. + // In contrast to llama_decode() - this call does not use KV cache. + // For encode-decoder contexts, processes the batch using the encoder. + // Can store the encoder output internally for later use by the decoder's cross-attention layers. // 0 - success // < 0 - error. 
the KV cache state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); + // Process a batch of tokens. + // Requires KV cache. + // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp index 428bbc633..6797a3d86 100644 --- a/otherarch/embeddings_adapter.cpp +++ b/otherarch/embeddings_adapter.cpp @@ -136,7 +136,6 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs) ctx_params.embeddings = true; ctx_params.n_ubatch = ctx_params.n_ubatch = max_batchsize; //max size, must fit ctx_params.n_ctx = max_batchsize; - ctx_params.logits_all = false; ctx_params.offload_kqv = true; ctx_params.n_threads = nthreads; ctx_params.n_threads_batch = nthreads; diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 336d70e9f..57e50b319 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -533,7 +533,6 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; tts_ctx_params.n_ctx = 8192; - tts_ctx_params.logits_all = false; tts_ctx_params.offload_kqv = true; tts_ctx_params.n_batch = 8192; tts_ctx_params.n_ubatch = 512; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2fb16ec9e..9171d8ad6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -116,8 +116,6 @@ llama_context::llama_context( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - logits_all = params.logits_all; - if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -253,7 +251,7 @@ llama_context::llama_context( } // reserve worst-case graph - if (!hparams.vocab_only) { + if (!hparams.vocab_only && memory) { const uint32_t n_seqs = 1; // TODO: worst-case number of sequences const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) { t_compute_start_us = ggml_time_us(); } + embd_seq.clear(); + n_queued_tokens += n_tokens; const int64_t n_embd = hparams.n_embd; @@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings + GGML_ASSERT(embd != nullptr); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); } break; @@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = 
ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) { } int llama_context::decode(llama_batch & inp_batch) { + if (!memory) { + LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__); + return encode(inp_batch); + } + if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) { for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } - } else if (logits_all || embd_pooled) { + } else if (embd_pooled) { n_outputs_all = n_tokens_all; } else { // keep last output only @@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() { /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, - /*.logits_all =*/ false, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index cf41ac57b..5a080e67f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -187,9 +187,6 @@ private: std::unique_ptr memory; - // TODO: remove - bool logits_all = false; - // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits float * logits = nullptr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ae187b93f..2a5d2abd2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1666,8 +1666,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); if (std::regex_search(tensor_name, pattern)) { - LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); buft = overrides->buft; + LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", + tensor_name.c_str(), + ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), + ggml_backend_buft_name(buft)); break; } } @@ -12952,6 +12955,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_memory_i * res; switch (arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: + { + res = nullptr; + } break; case LLM_ARCH_MAMBA: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 0251edca5..d3c88c45e 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -100,14 +100,6 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.logits_all) { - LOG_ERR("************\n"); - LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - LOG_ERR("************\n\n"); - - return 0; - } - if (params.embedding) { LOG_ERR("************\n"); LOG_ERR("%s: please use the 'embedding' 
tool for embedding calculations\n", __func__); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d68b65b8b..1f86a8b2d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3151,7 +3151,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; n_patches = n_per_side_2d_pool * n_per_side_2d_pool; } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - n_patches /= params.proj_scale_factor; + n_patches /= (params.proj_scale_factor * params.proj_scale_factor); } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { int n_merge = params.spatial_merge_size; int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 82d2e3b97..3153b6159 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e0e99eafc..06788bbdc 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3214,7 +3214,14 @@ struct server_context { batch.logits + i, }; - const int ret = llama_decode(ctx, batch_view); + int ret = 0; + + if (params_base.embedding || params_base.reranking) { + ret = llama_encode(ctx, batch_view); + } else { + ret = llama_decode(ctx, batch_view); + } + metrics.on_decoded(slots); if (ret != 0) { @@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) { const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( server_task_type type, json & data, - std::function is_connection_closed, + const std::function & is_connection_closed, httplib::Response & res, oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index b2e3cf94a..2c23a7580 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -21,6 +21,8 @@ "postcss": "^8.4.49", "react": "^18.3.1", "react-dom": "^18.3.1", + "react-dropzone": "^14.3.8", + "react-hot-toast": "^2.5.2", "react-markdown": "^9.0.3", "react-router": "^7.1.5", "rehype-highlight": "^7.0.2", @@ -2058,6 +2060,15 @@ "dev": true, "license": "Python-2.0" }, + "node_modules/attr-accept": { + "version": "2.2.5", + "resolved": "https://registry.npmjs.org/attr-accept/-/attr-accept-2.2.5.tgz", + "integrity": "sha512-0bDNnY/u6pPwHDMoF0FieU354oBi0a8rD9FcsLwzcGWbc8KS8KPIi7y+s13OlVY+gMWc/9xEMUgNE6Qm8ZllYQ==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/autoprefixer": { "version": "10.4.20", "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", @@ -2804,6 +2815,18 @@ "node": ">=16.0.0" } }, + "node_modules/file-selector": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/file-selector/-/file-selector-2.1.2.tgz", + "integrity": "sha512-QgXo+mXTe8ljeqUFaX3QVHc5osSItJ/Km+xpocx0aSqWGMSCf6qYs/VnzZgS864Pjn5iceMRFigeAV7AfTlaig==", + "license": "MIT", + "dependencies": { + "tslib": "^2.7.0" + }, + "engines": { + "node": ">= 12" + } + }, "node_modules/fill-range": { "version": "7.1.1", "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", @@ -2917,6 +2940,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/goober": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/goober/-/goober-2.1.16.tgz", + "integrity": 
"sha512-erjk19y1U33+XAMe1VTvIONHYoSqE4iS7BYUZfHaqeohLmnC0FdxEh7rQU+6MZ4OajItzjZFSRtVANrQwNq6/g==", + "license": "MIT", + "peerDependencies": { + "csstype": "^3.0.10" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -4674,6 +4706,15 @@ "node": ">=0.10.0" } }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -4872,6 +4913,17 @@ "url": "https://github.com/prettier/prettier?sponsor=1" } }, + "node_modules/prop-types": { + "version": "15.8.1", + "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", + "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.4.0", + "object-assign": "^4.1.1", + "react-is": "^16.13.1" + } + }, "node_modules/property-information": { "version": "6.5.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", @@ -4938,6 +4990,46 @@ "react": "^18.3.1" } }, + "node_modules/react-dropzone": { + "version": "14.3.8", + "resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz", + "integrity": "sha512-sBgODnq+lcA4P296DY4wacOZz3JFpD99fp+hb//iBO2HHnyeZU3FwWyXJ6salNpqQdsZrgMrotuko/BdJMV8Ug==", + "license": "MIT", + "dependencies": { + "attr-accept": "^2.2.4", + "file-selector": "^2.1.0", + "prop-types": "^15.8.1" + }, + "engines": { + "node": ">= 10.13" + }, + "peerDependencies": { + "react": ">= 16.8 || 18.0.0" + } + }, + "node_modules/react-hot-toast": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.5.2.tgz", + "integrity": "sha512-Tun3BbCxzmXXM7C+NI4qiv6lT0uwGh4oAfeJyNOjYUejTsm35mK9iCaYLGv8cBz9L5YxZLx/2ii7zsIwPtPUdw==", + "license": "MIT", + "dependencies": { + "csstype": "^3.1.3", + "goober": "^2.1.16" + }, + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "react": ">=16", + "react-dom": ">=16" + } + }, + "node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", + "license": "MIT" + }, "node_modules/react-markdown": { "version": "9.0.3", "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz", @@ -5814,7 +5906,6 @@ "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "devOptional": true, "license": "0BSD" }, "node_modules/turbo-stream": { diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json index 6ac06b1a4..ab1b920bd 100644 --- a/tools/server/webui/package.json +++ b/tools/server/webui/package.json @@ -24,6 +24,8 @@ "postcss": "^8.4.49", "react": "^18.3.1", "react-dom": "^18.3.1", + "react-dropzone": "^14.3.8", + "react-hot-toast": "^2.5.2", "react-markdown": "^9.0.3", "react-router": "^7.1.5", "rehype-highlight": "^7.0.2", diff --git 
a/tools/server/webui/src/App.tsx b/tools/server/webui/src/App.tsx index cc4659e15..3b00a8f90 100644 --- a/tools/server/webui/src/App.tsx +++ b/tools/server/webui/src/App.tsx @@ -4,6 +4,7 @@ import Sidebar from './components/Sidebar'; import { AppContextProvider, useAppContext } from './utils/app.context'; import ChatScreen from './components/ChatScreen'; import SettingDialog from './components/SettingDialog'; +import { Toaster } from 'react-hot-toast'; function App() { return ( @@ -40,6 +41,7 @@ function AppLayout() { onClose={() => setShowSettings(false)} /> } + ); } diff --git a/tools/server/webui/src/Config.ts b/tools/server/webui/src/Config.ts index dd1cc0e10..5eef608cb 100644 --- a/tools/server/webui/src/Config.ts +++ b/tools/server/webui/src/Config.ts @@ -12,7 +12,7 @@ export const CONFIG_DEFAULT = { // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. // Do not use nested objects, keep it single level. Prefix the key if you need to group them. apiKey: '', - systemMessage: 'You are a helpful assistant.', + systemMessage: '', showTokensPerSecond: false, showThoughtInProgress: false, excludeThoughtOnReq: true, diff --git a/tools/server/webui/src/components/ChatInputExtraContextItem.tsx b/tools/server/webui/src/components/ChatInputExtraContextItem.tsx new file mode 100644 index 000000000..ac416fa90 --- /dev/null +++ b/tools/server/webui/src/components/ChatInputExtraContextItem.tsx @@ -0,0 +1,92 @@ +import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline'; +import { MessageExtra } from '../utils/types'; +import { useState } from 'react'; +import { classNames } from '../utils/misc'; + +export default function ChatInputExtraContextItem({ + items, + removeItem, + clickToShow, +}: { + items?: MessageExtra[]; + removeItem?: (index: number) => void; + clickToShow?: boolean; +}) { + const [show, setShow] = useState(-1); + const showingItem = show >= 0 ? items?.[show] : undefined; + + if (!items) return null; + + return ( +
+ {items.map((item, i) => ( +
clickToShow && setShow(i)} + > + {removeItem && ( +
+ +
+ )} + +
+ {item.type === 'imageFile' ? ( + <> + {item.name} + + ) : ( + <> +
+ +
+ +
+ {item.name ?? 'Extra content'} +
+ + )} +
+
+ ))} + + {showingItem && ( + +
+
+ {showingItem.name ?? 'Extra content'} + +
+ {showingItem.type === 'imageFile' ? ( + {showingItem.name} + ) : ( +
+
+                  {showingItem.content}
+                
+
+ )} +
+
setShow(-1)}>
+
+ )} +
+ ); +} diff --git a/tools/server/webui/src/components/ChatMessage.tsx b/tools/server/webui/src/components/ChatMessage.tsx index 40ea74711..08eb42352 100644 --- a/tools/server/webui/src/components/ChatMessage.tsx +++ b/tools/server/webui/src/components/ChatMessage.tsx @@ -3,7 +3,14 @@ import { useAppContext } from '../utils/app.context'; import { Message, PendingMessage } from '../utils/types'; import { classNames } from '../utils/misc'; import MarkdownDisplay, { CopyButton } from './MarkdownDisplay'; -import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline'; +import { + ArrowPathIcon, + ChevronLeftIcon, + ChevronRightIcon, + PencilSquareIcon, +} from '@heroicons/react/24/outline'; +import ChatInputExtraContextItem from './ChatInputExtraContextItem'; +import { BtnWithTooltips } from '../utils/common'; interface SplitMessage { content: PendingMessage['content']; @@ -85,10 +92,14 @@ export default function ChatMessage({ 'chat-end': msg.role === 'user', })} > + {msg.extra && msg.extra.length > 0 && ( + + )} +
{/* textarea for editing message */} @@ -133,59 +144,11 @@ export default function ChatMessage({ {/* render message as markdown */}
{thought && ( -
- - {isPending && isThinking ? ( - - - Thinking - - ) : ( - Thought Process - )} - -
- -
-
- )} - - {msg.extra && msg.extra.length > 0 && ( -
- - Extra content - -
- {msg.extra.map( - (extra, i) => - extra.type === 'textFile' ? ( -
- {extra.name} -
{extra.content}
-
- ) : extra.type === 'context' ? ( -
-
{extra.content}
-
- ) : null // TODO: support other extra types - )} -
-
+ )} setEditingContent(msg.content)} disabled={msg.content === null} + tooltipsContent="Edit message" > - ✍️ Edit - + + )} {/* assistant message */} {msg.role === 'assistant' && ( <> {!isPending && ( - + + )} )}
@@ -294,3 +259,44 @@ export default function ChatMessage({
); } + +function ThoughtProcess({ + isThinking, + content, + open, +}: { + isThinking: boolean; + content: string; + open: boolean; +}) { + return ( +
+ +
+
+ {isThinking ? ( + + + Thinking + + ) : ( + <>Thought Process + )} +
+
+
+
+ +
+
+
+ ); +} diff --git a/tools/server/webui/src/components/ChatScreen.tsx b/tools/server/webui/src/components/ChatScreen.tsx index a2e3ee997..b645a494d 100644 --- a/tools/server/webui/src/components/ChatScreen.tsx +++ b/tools/server/webui/src/components/ChatScreen.tsx @@ -1,12 +1,25 @@ -import { useEffect, useMemo, useState } from 'react'; +import { useEffect, useMemo, useRef, useState } from 'react'; import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context'; import ChatMessage from './ChatMessage'; import { CanvasType, Message, PendingMessage } from '../utils/types'; -import { classNames, cleanCurrentUrl, throttle } from '../utils/misc'; +import { classNames, cleanCurrentUrl } from '../utils/misc'; import CanvasPyInterpreter from './CanvasPyInterpreter'; import StorageUtils from '../utils/storage'; import { useVSCodeContext } from '../utils/llama-vscode'; import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts'; +import { + ArrowUpIcon, + StopIcon, + PaperClipIcon, +} from '@heroicons/react/24/solid'; +import { + ChatExtraContextApi, + useChatExtraContext, +} from './useChatExtraContext.tsx'; +import Dropzone from 'react-dropzone'; +import toast from 'react-hot-toast'; +import ChatInputExtraContextItem from './ChatInputExtraContextItem.tsx'; +import { scrollToBottom, useChatScroll } from './useChatScroll.tsx'; /** * A message display is a message node with additional information for rendering. @@ -72,24 +85,6 @@ function getListMessageDisplay( return res; } -const scrollToBottom = throttle( - (requiresNearBottom: boolean, delay: number = 80) => { - const mainScrollElem = document.getElementById('main-scroll'); - if (!mainScrollElem) return; - const spaceToBottom = - mainScrollElem.scrollHeight - - mainScrollElem.scrollTop - - mainScrollElem.clientHeight; - if (!requiresNearBottom || spaceToBottom < 50) { - setTimeout( - () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }), - delay - ); - } - }, - 80 -); - export default function ChatScreen() { const { viewingChat, @@ -102,10 +97,11 @@ export default function ChatScreen() { } = useAppContext(); const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content()); + const extraContext = useChatExtraContext(); + useVSCodeContext(textarea, extraContext); - const { extraContext, clearExtraContext } = useVSCodeContext(textarea); - // TODO: improve this when we have "upload file" feature - const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined; + const msgListRef = useRef(null); + useChatScroll(msgListRef); // keep track of leaf node for rendering const [currNodeId, setCurrNodeId] = useState(-1); @@ -129,13 +125,15 @@ export default function ChatScreen() { if (currLeafNodeId) { setCurrNodeId(currLeafNodeId); } - scrollToBottom(true); + // useChatScroll will handle the auto scroll }; const sendNewMessage = async () => { const lastInpMsg = textarea.value(); - if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) + if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? 
'')) { + toast.error('Please enter a message'); return; + } textarea.setValue(''); scrollToBottom(false); setCurrNodeId(-1); @@ -146,7 +144,7 @@ export default function ChatScreen() { currConvId, lastMsgNodeId, lastInpMsg, - currExtra, + extraContext.items, onChunk )) ) { @@ -154,7 +152,7 @@ export default function ChatScreen() { textarea.setValue(lastInpMsg); } // OK - clearExtraContext(); + extraContext.clearItems(); }; // for vscode context @@ -234,10 +232,17 @@ export default function ChatScreen() { })} > {/* chat messages */} -
-
+
+
{/* placeholder to shift the message to the bottom */} - {viewingChat ? '' : 'Send a message to start'} + {viewingChat ? ( + '' + ) : ( + <> +
Send a message to start
+ + + )}
{[...messages, ...pendingMsgDisplay].map((msg) => ( ))}
{/* chat input */} -
- - - {isGenerating(currConvId ?? '') ? ( - - ) : ( - - )} -
+ stopGenerating(currConvId ?? '')} + isGenerating={isGenerating(currConvId ?? '')} + />
{canvasData?.type === CanvasType.PY_INTERPRETER && ( @@ -297,3 +275,129 @@ export default function ChatScreen() {
); } + +function ServerInfo() { + const { serverProps } = useAppContext(); + return ( +
+
+ Server Info +

+ Model: {serverProps?.model_path?.split(/(\\|\/)/).pop()} +
+ Build: {serverProps?.build_info} +
+

+
+
+ ); +} + +function ChatInput({ + textarea, + extraContext, + onSend, + onStop, + isGenerating, +}: { + textarea: ChatTextareaApi; + extraContext: ChatExtraContextApi; + onSend: () => void; + onStop: () => void; + isGenerating: boolean; +}) { + const [isDrag, setIsDrag] = useState(false); + + return ( +
+ { + setIsDrag(false); + extraContext.onFileAdded(files); + }} + onDragEnter={() => setIsDrag(true)} + onDragLeave={() => setIsDrag(false)} + multiple={true} + > + {({ getRootProps, getInputProps }) => ( +
+ {!isGenerating && ( + + )} + +
+ + + {/* buttons area */} +
+ + + {isGenerating ? ( + + ) : ( + + )} +
+
+
+ )} +
+
+ ); +} diff --git a/tools/server/webui/src/components/Header.tsx b/tools/server/webui/src/components/Header.tsx index 4c6b291e6..45775ff7a 100644 --- a/tools/server/webui/src/components/Header.tsx +++ b/tools/server/webui/src/components/Header.tsx @@ -4,10 +4,13 @@ import { useAppContext } from '../utils/app.context'; import { classNames } from '../utils/misc'; import daisyuiThemes from 'daisyui/theme/object'; import { THEMES } from '../Config'; -import { useNavigate } from 'react-router'; +import { + Cog8ToothIcon, + MoonIcon, + Bars3Icon, +} from '@heroicons/react/24/outline'; export default function Header() { - const navigate = useNavigate(); const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme()); const { setShowSettings } = useAppContext(); @@ -24,105 +27,21 @@ export default function Header() { ); }, [selectedTheme]); - const { isGenerating, viewingChat } = useAppContext(); - const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? ''); - - const removeConversation = () => { - if (isCurrConvGenerating || !viewingChat) return; - const convId = viewingChat?.conv.id; - if (window.confirm('Are you sure to delete this conversation?')) { - StorageUtils.remove(convId); - navigate('/'); - } - }; - - const downloadConversation = () => { - if (isCurrConvGenerating || !viewingChat) return; - const convId = viewingChat?.conv.id; - const conversationJson = JSON.stringify(viewingChat, null, 2); - const blob = new Blob([conversationJson], { type: 'application/json' }); - const url = URL.createObjectURL(blob); - const a = document.createElement('a'); - a.href = url; - a.download = `conversation_${convId}.json`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - }; - return (
{/* open sidebar button */}
llama.cpp
{/* action buttons (top right) */}
- {viewingChat && ( -
- {/* "..." button */} - - {/* dropdown menu */} - -
- )} -
@@ -130,16 +49,7 @@ export default function Header() {
- - - +
    - + {canRunCode && ( )} @@ -101,16 +106,17 @@ export const CopyButton = ({ }) => { const [copied, setCopied] = useState(false); return ( - + + ); }; @@ -124,7 +130,7 @@ export const RunPyCodeButton = ({ const { setCanvasData } = useAppContext(); return ( <> - + + ); }; diff --git a/tools/server/webui/src/components/Sidebar.tsx b/tools/server/webui/src/components/Sidebar.tsx index 34727c623..1a6c8a327 100644 --- a/tools/server/webui/src/components/Sidebar.tsx +++ b/tools/server/webui/src/components/Sidebar.tsx @@ -1,13 +1,25 @@ -import { useEffect, useState } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { classNames } from '../utils/misc'; import { Conversation } from '../utils/types'; import StorageUtils from '../utils/storage'; import { useNavigate, useParams } from 'react-router'; +import { + ArrowDownTrayIcon, + EllipsisVerticalIcon, + PencilIcon, + TrashIcon, + XMarkIcon, +} from '@heroicons/react/24/outline'; +import { BtnWithTooltips } from '../utils/common'; +import { useAppContext } from '../utils/app.context'; +import toast from 'react-hot-toast'; export default function Sidebar() { const params = useParams(); const navigate = useNavigate(); + const { isGenerating } = useAppContext(); + const [conversations, setConversations] = useState([]); const [currConv, setCurrConv] = useState(null); @@ -26,6 +38,11 @@ export default function Sidebar() { }; }, []); + const groupedConv = useMemo( + () => groupConversationsByDate(conversations), + [conversations] + ); + return ( <> - - - +
- {/* list of conversations */} + {/* new conversation button */}
navigate('/')} > + New conversation
- {conversations.map((conv) => ( -
navigate(`/chat/${conv.id}`)} - dir="auto" - > - {conv.name} + + {/* list of conversations */} + {groupedConv.map((group) => ( +
+ {/* group name (by date) */} + {group.title ? ( + {group.title} + ) : ( +
+ )} + + {group.conversations.map((conv) => ( + { + navigate(`/chat/${conv.id}`); + }} + onDelete={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot delete conversation while generating' + ); + return; + } + if ( + window.confirm( + 'Are you sure you want to delete this conversation?' + ) + ) { + toast.success('Conversation deleted'); + StorageUtils.remove(conv.id); + navigate('/'); + } + }} + onDownload={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot download conversation while generating' + ); + return; + } + const conversationJson = JSON.stringify(conv, null, 2); + const blob = new Blob([conversationJson], { + type: 'application/json', + }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `conversation_${conv.id}.json`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + }} + onRename={() => { + if (isGenerating(conv.id)) { + toast.error( + 'Cannot rename conversation while generating' + ); + return; + } + const newName = window.prompt( + 'Enter new name for the conversation', + conv.name + ); + if (newName && newName.trim().length > 0) { + StorageUtils.updateConversationName(conv.id, newName); + } + }} + /> + ))}
))} -
+
Conversations are saved to browser's IndexedDB
@@ -94,3 +161,170 @@ export default function Sidebar() { ); } + +function ConversationItem({ + conv, + isCurrConv, + onSelect, + onDelete, + onDownload, + onRename, +}: { + conv: Conversation; + isCurrConv: boolean; + onSelect: () => void; + onDelete: () => void; + onDownload: () => void; + onRename: () => void; +}) { + return ( +
+
+ {conv.name} +
+
+ + {/* dropdown menu */} + +
+
+ ); +} + +// WARN: vibe code below + +export interface GroupedConversations { + title?: string; + conversations: Conversation[]; +} + +// TODO @ngxson : add test for this function +// Group conversations by date +// - "Previous 7 Days" +// - "Previous 30 Days" +// - "Month Year" (e.g., "April 2023") +export function groupConversationsByDate( + conversations: Conversation[] +): GroupedConversations[] { + const now = new Date(); + const today = new Date(now.getFullYear(), now.getMonth(), now.getDate()); // Start of today + + const sevenDaysAgo = new Date(today); + sevenDaysAgo.setDate(today.getDate() - 7); + + const thirtyDaysAgo = new Date(today); + thirtyDaysAgo.setDate(today.getDate() - 30); + + const groups: { [key: string]: Conversation[] } = { + Today: [], + 'Previous 7 Days': [], + 'Previous 30 Days': [], + }; + const monthlyGroups: { [key: string]: Conversation[] } = {}; // Key format: "Month Year" e.g., "April 2023" + + // Sort conversations by lastModified date in descending order (newest first) + // This helps when adding to groups, but the final output order of groups is fixed. + const sortedConversations = [...conversations].sort( + (a, b) => b.lastModified - a.lastModified + ); + + for (const conv of sortedConversations) { + const convDate = new Date(conv.lastModified); + + if (convDate >= today) { + groups['Today'].push(conv); + } else if (convDate >= sevenDaysAgo) { + groups['Previous 7 Days'].push(conv); + } else if (convDate >= thirtyDaysAgo) { + groups['Previous 30 Days'].push(conv); + } else { + const monthName = convDate.toLocaleString('default', { month: 'long' }); + const year = convDate.getFullYear(); + const monthYearKey = `${monthName} ${year}`; + if (!monthlyGroups[monthYearKey]) { + monthlyGroups[monthYearKey] = []; + } + monthlyGroups[monthYearKey].push(conv); + } + } + + const result: GroupedConversations[] = []; + + if (groups['Today'].length > 0) { + result.push({ + title: undefined, // no title for Today + conversations: groups['Today'], + }); + } + + if (groups['Previous 7 Days'].length > 0) { + result.push({ + title: 'Previous 7 Days', + conversations: groups['Previous 7 Days'], + }); + } + + if (groups['Previous 30 Days'].length > 0) { + result.push({ + title: 'Previous 30 Days', + conversations: groups['Previous 30 Days'], + }); + } + + // Sort monthly groups by date (most recent month first) + const sortedMonthKeys = Object.keys(monthlyGroups).sort((a, b) => { + const dateA = new Date(a); // "Month Year" can be parsed by Date constructor + const dateB = new Date(b); + return dateB.getTime() - dateA.getTime(); + }); + + for (const monthKey of sortedMonthKeys) { + if (monthlyGroups[monthKey].length > 0) { + result.push({ title: monthKey, conversations: monthlyGroups[monthKey] }); + } + } + + return result; +} diff --git a/tools/server/webui/src/components/useChatExtraContext.tsx b/tools/server/webui/src/components/useChatExtraContext.tsx new file mode 100644 index 000000000..866401db9 --- /dev/null +++ b/tools/server/webui/src/components/useChatExtraContext.tsx @@ -0,0 +1,174 @@ +import { useState } from 'react'; +import { MessageExtra } from '../utils/types'; +import toast from 'react-hot-toast'; +import { useAppContext } from '../utils/app.context'; + +// Interface describing the API returned by the hook +export interface ChatExtraContextApi { + items?: MessageExtra[]; // undefined if empty, similar to Message['extra'] + addItems: (items: MessageExtra[]) => void; + removeItem: (idx: number) => void; + clearItems: () => void; + onFileAdded: (files: 
File[]) => void; // used by "upload" button +} + +export function useChatExtraContext(): ChatExtraContextApi { + const { serverProps } = useAppContext(); + const [items, setItems] = useState([]); + + const addItems = (newItems: MessageExtra[]) => { + setItems((prev) => [...prev, ...newItems]); + }; + + const removeItem = (idx: number) => { + setItems((prev) => prev.filter((_, i) => i !== idx)); + }; + + const clearItems = () => { + setItems([]); + }; + + const onFileAdded = (files: File[]) => { + for (const file of files) { + const mimeType = file.type; + console.debug({ mimeType, file }); + if (file.size > 10 * 1024 * 1024) { + toast.error('File is too large. Maximum size is 10MB.'); + break; + } + + if (mimeType.startsWith('image/') && mimeType !== 'image/svg+xml') { + if (!serverProps?.has_multimodal) { + toast.error('Multimodal is not supported by this server or model.'); + break; + } + const reader = new FileReader(); + reader.onload = (event) => { + if (event.target?.result) { + addItems([ + { + type: 'imageFile', + name: file.name, + base64Url: event.target.result as string, + }, + ]); + } + }; + reader.readAsDataURL(file); + } else if ( + mimeType.startsWith('video/') || + mimeType.startsWith('audio/') + ) { + toast.error('Video and audio files are not supported yet.'); + break; + } else if (mimeType.startsWith('application/pdf')) { + toast.error('PDF files are not supported yet.'); + break; + } else { + // Because there can be many text file types (like code file), we will not check the mime type + // and will just check if the file is not binary. + const reader = new FileReader(); + reader.onload = (event) => { + if (event.target?.result) { + const content = event.target.result as string; + if (!isLikelyNotBinary(content)) { + toast.error('File is binary. Please upload a text file.'); + return; + } + addItems([ + { + type: 'textFile', + name: file.name, + content, + }, + ]); + } + }; + reader.readAsText(file); + } + } + }; + + return { + items: items.length > 0 ? items : undefined, + addItems, + removeItem, + clearItems, + onFileAdded, + }; +} + +// WARN: vibe code below +// This code is a heuristic to determine if a string is likely not binary. +// It is necessary because input file can have various mime types which we don't have time to investigate. +// For example, a python file can be text/plain, application/x-python, etc. +export function isLikelyNotBinary(str: string): boolean { + const options = { + prefixLength: 1024 * 10, // Check the first 10KB of the string + suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars + maxAbsoluteNullBytes: 2, + }; + + if (!str) { + return true; // Empty string is considered "not binary" or trivially text. + } + + const sampleLength = Math.min(str.length, options.prefixLength); + if (sampleLength === 0) { + return true; // Effectively an empty string after considering prefixLength. + } + + let suspiciousCharCount = 0; + let nullByteCount = 0; + + for (let i = 0; i < sampleLength; i++) { + const charCode = str.charCodeAt(i); + + // 1. Check for Unicode Replacement Character (U+FFFD) + // This is a strong indicator if the string was created from decoding bytes as UTF-8. + if (charCode === 0xfffd) { + suspiciousCharCount++; + continue; + } + + // 2. Check for Null Bytes (U+0000) + if (charCode === 0x0000) { + nullByteCount++; + // We also count nulls towards the general suspicious character count, + // as they are less common in typical text files. + suspiciousCharCount++; + continue; + } + + // 3. 
Check for C0 Control Characters (U+0001 to U+001F) + // Exclude common text control characters: TAB (9), LF (10), CR (13). + // We can also be a bit lenient with BEL (7) and BS (8) which sometimes appear in logs. + if (charCode < 32) { + if ( + charCode !== 9 && // TAB + charCode !== 10 && // LF + charCode !== 13 && // CR + charCode !== 7 && // BEL (Bell) - sometimes in logs + charCode !== 8 // BS (Backspace) - less common, but possible + ) { + suspiciousCharCount++; + } + } + // Characters from 32 (space) up to 126 (~) are printable ASCII. + // Characters 127 (DEL) is a control character. + // Characters >= 128 are extended ASCII / multi-byte Unicode. + // If they resulted in U+FFFD, we caught it. Otherwise, they are valid + // (though perhaps unusual) Unicode characters from JS's perspective. + // The main concern is if those higher characters came from misinterpreting + // a single-byte encoding as UTF-8, which again, U+FFFD would usually flag. + } + + // Check absolute null byte count + if (nullByteCount > options.maxAbsoluteNullBytes) { + return false; // Too many null bytes is a strong binary indicator + } + + // Check ratio of suspicious characters + const ratio = suspiciousCharCount / sampleLength; + return ratio <= options.suspiciousCharThresholdRatio; +} diff --git a/tools/server/webui/src/components/useChatScroll.tsx b/tools/server/webui/src/components/useChatScroll.tsx new file mode 100644 index 000000000..25ea02234 --- /dev/null +++ b/tools/server/webui/src/components/useChatScroll.tsx @@ -0,0 +1,34 @@ +import React, { useEffect } from 'react'; +import { throttle } from '../utils/misc'; + +export const scrollToBottom = (requiresNearBottom: boolean, delay?: number) => { + const mainScrollElem = document.getElementById('main-scroll'); + if (!mainScrollElem) return; + const spaceToBottom = + mainScrollElem.scrollHeight - + mainScrollElem.scrollTop - + mainScrollElem.clientHeight; + if (!requiresNearBottom || spaceToBottom < 100) { + setTimeout( + () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }), + delay ?? 80 + ); + } +}; + +const scrollToBottomThrottled = throttle(scrollToBottom, 80); + +export function useChatScroll(msgListRef: React.RefObject) { + useEffect(() => { + if (!msgListRef.current) return; + + const resizeObserver = new ResizeObserver((_) => { + scrollToBottomThrottled(true, 10); + }); + + resizeObserver.observe(msgListRef.current); + return () => { + resizeObserver.disconnect(); + }; + }, [msgListRef]); +} diff --git a/tools/server/webui/src/components/useChatTextarea.ts b/tools/server/webui/src/components/useChatTextarea.ts index a3223f4fd..c2f865203 100644 --- a/tools/server/webui/src/components/useChatTextarea.ts +++ b/tools/server/webui/src/components/useChatTextarea.ts @@ -1,35 +1,39 @@ import { useEffect, useRef, useState, useCallback } from 'react'; +import { throttle } from '../utils/misc'; // Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint) const LARGE_SCREEN_MQ = '(min-width: 1024px)'; // Calculates and sets the textarea height based on its scrollHeight -const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => { - if (!textarea) return; +const adjustTextareaHeight = throttle( + (textarea: HTMLTextAreaElement | null) => { + if (!textarea) return; - // Only perform auto-sizing on large screens - if (!window.matchMedia(LARGE_SCREEN_MQ).matches) { - // On small screens, reset inline height and max-height styles. 
- // This allows CSS (e.g., `rows` attribute or classes) to control the height, - // and enables manual resizing if `resize-vertical` is set. - textarea.style.height = ''; // Use 'auto' or '' to reset - textarea.style.maxHeight = ''; - return; // Do not adjust height programmatically on small screens - } + // Only perform auto-sizing on large screens + if (!window.matchMedia(LARGE_SCREEN_MQ).matches) { + // On small screens, reset inline height and max-height styles. + // This allows CSS (e.g., `rows` attribute or classes) to control the height, + // and enables manual resizing if `resize-vertical` is set. + textarea.style.height = ''; // Use 'auto' or '' to reset + textarea.style.maxHeight = ''; + return; // Do not adjust height programmatically on small screens + } - const computedStyle = window.getComputedStyle(textarea); - // Get the max-height specified by CSS (e.g., from `lg:max-h-48`) - const currentMaxHeight = computedStyle.maxHeight; + const computedStyle = window.getComputedStyle(textarea); + // Get the max-height specified by CSS (e.g., from `lg:max-h-48`) + const currentMaxHeight = computedStyle.maxHeight; - // Temporarily remove max-height to allow scrollHeight to be calculated correctly - textarea.style.maxHeight = 'none'; - // Reset height to 'auto' to measure the actual scrollHeight needed - textarea.style.height = 'auto'; - // Set the height to the calculated scrollHeight - textarea.style.height = `${textarea.scrollHeight}px`; - // Re-apply the original max-height from CSS to enforce the limit - textarea.style.maxHeight = currentMaxHeight; -}; + // Temporarily remove max-height to allow scrollHeight to be calculated correctly + textarea.style.maxHeight = 'none'; + // Reset height to 'auto' to measure the actual scrollHeight needed + textarea.style.height = 'auto'; + // Set the height to the calculated scrollHeight + textarea.style.height = `${textarea.scrollHeight}px`; + // Re-apply the original max-height from CSS to enforce the limit + textarea.style.maxHeight = currentMaxHeight; + }, + 100 +); // Throttle to prevent excessive calls // Interface describing the API returned by the hook export interface ChatTextareaApi { @@ -65,6 +69,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi { } }, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue + // On input change, we adjust the height of the textarea const handleInput = useCallback( (event: React.FormEvent) => { // Call adjustTextareaHeight on every input - it will decide whether to act @@ -94,6 +99,6 @@ export function useChatTextarea(initValue: string): ChatTextareaApi { }, ref: textareaRef, refOnSubmit: onSubmitRef, - onInput: handleInput, + onInput: handleInput, // for adjusting height on input }; } diff --git a/tools/server/webui/src/index.scss b/tools/server/webui/src/index.scss index a18f09454..563e7a461 100644 --- a/tools/server/webui/src/index.scss +++ b/tools/server/webui/src/index.scss @@ -22,12 +22,15 @@ html { all: revert; } pre { - @apply whitespace-pre-wrap rounded-lg p-2; + @apply whitespace-pre-wrap rounded-lg p-2 mb-3; border: 1px solid currentColor; } p { @apply mb-2; } + hr { + @apply my-4 border-base-content/20 border-1; + } /* TODO: fix markdown table */ } @@ -35,7 +38,7 @@ html { @apply md:opacity-0 md:group-hover:opacity-100; } .btn-mini { - @apply cursor-pointer hover:shadow-md; + @apply cursor-pointer; } .chat-screen { max-width: 900px; diff --git a/tools/server/webui/src/utils/app.context.tsx b/tools/server/webui/src/utils/app.context.tsx index 
54bb65b6e..96cffd95a 100644 --- a/tools/server/webui/src/utils/app.context.tsx +++ b/tools/server/webui/src/utils/app.context.tsx @@ -3,6 +3,7 @@ import { APIMessage, CanvasData, Conversation, + LlamaCppServerProps, Message, PendingMessage, ViewingChat, @@ -12,9 +13,11 @@ import { filterThoughtFromMsgs, normalizeMsgsForAPI, getSSEStreamAsync, + getServerProps, } from './misc'; import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config'; import { matchPath, useLocation, useNavigate } from 'react-router'; +import toast from 'react-hot-toast'; interface AppContextValue { // conversations and messages @@ -46,6 +49,9 @@ interface AppContextValue { saveConfig: (config: typeof CONFIG_DEFAULT) => void; showSettings: boolean; setShowSettings: (show: boolean) => void; + + // props + serverProps: LlamaCppServerProps | null; } // this callback is used for scrolling to the bottom of the chat and switching to the last node @@ -74,6 +80,9 @@ export const AppContextProvider = ({ const params = matchPath('/chat/:convId', pathname); const convId = params?.params?.convId; + const [serverProps, setServerProps] = useState( + null + ); const [viewingChat, setViewingChat] = useState(null); const [pendingMessages, setPendingMessages] = useState< Record @@ -85,6 +94,20 @@ export const AppContextProvider = ({ const [canvasData, setCanvasData] = useState(null); const [showSettings, setShowSettings] = useState(false); + // get server props + useEffect(() => { + getServerProps(BASE_URL, config.apiKey) + .then((props) => { + console.debug('Server props:', props); + setServerProps(props); + }) + .catch((err) => { + console.error(err); + toast.error('Failed to fetch server props'); + }); + // eslint-disable-next-line + }, []); + // handle change when the convId from URL is changed useEffect(() => { // also reset the canvas data @@ -260,7 +283,7 @@ export const AppContextProvider = ({ } else { console.error(err); // eslint-disable-next-line @typescript-eslint/no-explicit-any - alert((err as any)?.message ?? 'Unknown error'); + toast.error((err as any)?.message ?? 'Unknown error'); throw err; // rethrow } } @@ -377,6 +400,7 @@ export const AppContextProvider = ({ saveConfig, showSettings, setShowSettings, + serverProps, }} > {children} diff --git a/tools/server/webui/src/utils/common.tsx b/tools/server/webui/src/utils/common.tsx index 09b08b5c9..372f464a2 100644 --- a/tools/server/webui/src/utils/common.tsx +++ b/tools/server/webui/src/utils/common.tsx @@ -36,3 +36,32 @@ export const OpenInNewTab = ({ {children} ); + +export function BtnWithTooltips({ + className, + onClick, + onMouseLeave, + children, + tooltipsContent, + disabled, +}: { + className?: string; + onClick: () => void; + onMouseLeave?: () => void; + children: React.ReactNode; + tooltipsContent: string; + disabled?: boolean; +}) { + return ( +
+ +
+ ); +} diff --git a/tools/server/webui/src/utils/llama-vscode.ts b/tools/server/webui/src/utils/llama-vscode.ts index 55ebdcffc..0ad8f8042 100644 --- a/tools/server/webui/src/utils/llama-vscode.ts +++ b/tools/server/webui/src/utils/llama-vscode.ts @@ -1,6 +1,6 @@ -import { useEffect, useState } from 'react'; -import { MessageExtraContext } from './types'; +import { useEffect } from 'react'; import { ChatTextareaApi } from '../components/useChatTextarea.ts'; +import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx'; // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe // Ref: https://github.com/ggml-org/llama.cpp/pull/11940 @@ -15,11 +15,10 @@ interface SetTextEvData { * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*'); */ -export const useVSCodeContext = (textarea: ChatTextareaApi) => { - const [extraContext, setExtraContext] = useState<MessageExtraContext | null>( - null - ); - +export const useVSCodeContext = ( + textarea: ChatTextareaApi, + extraContext: ChatExtraContextApi +) => { // Accept setText message from a parent window and set inputMsg and extraContext useEffect(() => { const handleMessage = (event: MessageEvent) => { @@ -27,10 +26,14 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { const data: SetTextEvData = event.data; textarea.setValue(data?.text); if (data?.context && data.context.length > 0) { - setExtraContext({ - type: 'context', - content: data.context, - }); + extraContext.clearItems(); + extraContext.addItems([ + { + type: 'context', + name: 'Extra context', + content: data.context, + }, + ]); } textarea.focus(); setTimeout(() => { @@ -41,7 +44,7 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { window.addEventListener('message', handleMessage); return () => window.removeEventListener('message', handleMessage); - }, [textarea]); + }, [textarea, extraContext]); // Add a keydown listener that sends the "escapePressed" message to the parent window useEffect(() => { @@ -55,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => { return () => window.removeEventListener('keydown', handleKeyDown); }, []); - return { - extraContext, - // call once the user message is sent, to clear the extra context - clearExtraContext: () => setExtraContext(null), - }; + return {}; }; diff --git a/tools/server/webui/src/utils/misc.ts index 87f55b2af..ba760e83b 100644 --- a/tools/server/webui/src/utils/misc.ts +++ b/tools/server/webui/src/utils/misc.ts @@ -1,6 +1,11 @@ // @ts-expect-error this package does not have typing import TextLineStream from 'textlinestream'; -import { APIMessage, Message } from './types'; +import { + APIMessage, + APIMessageContentPart, + LlamaCppServerProps, + Message, +} from './types'; // ponyfill for missing ReadableStream asyncIterator on Safari import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator'; @@ -57,19 +62,47 @@ export const copyStr = (textToCopy: string) => { */ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) { return messages.map((msg) => { - let newContent = ''; + if (msg.role !== 'user' || !msg.extra) { + return { + role: msg.role, + content: msg.content, + } as APIMessage; + } + + // extra content first, then user text message in the end + // this allows re-using the same cache prefix for long context + const contentArr: APIMessageContentPart[] = []; for (const extra of msg.extra ?? 
[]) { if (extra.type === 'context') { - newContent += `${extra.content}\n\n`; + contentArr.push({ + type: 'text', + text: extra.content, + }); + } else if (extra.type === 'textFile') { + contentArr.push({ + type: 'text', + text: `File: ${extra.name}\nContent:\n\n${extra.content}`, + }); + } else if (extra.type === 'imageFile') { + contentArr.push({ + type: 'image_url', + image_url: { url: extra.base64Url }, + }); + } else { + throw new Error('Unknown extra type'); + } } - newContent += msg.content; + // add user message to the end + contentArr.push({ + type: 'text', + text: msg.content, + }); return { role: msg.role, - content: newContent, + content: contentArr, }; }) as APIMessage[]; } @@ -78,13 +111,19 @@ * recommended for DeepSeek-R1, filter out content between <think> and </think> tags */ export function filterThoughtFromMsgs(messages: APIMessage[]) { + console.debug({ messages }); return messages.map((msg) => { + if (msg.role !== 'assistant') { + return msg; + } + // assistant message is always a string + const contentStr = msg.content as string; return { role: msg.role, content: msg.role === 'assistant' - ? msg.content.split('</think>').at(-1)!.trim() - : msg.content, + ? contentStr.split('</think>').at(-1)!.trim() + : contentStr, } as APIMessage; }); } @@ -126,3 +165,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => { }); window.history.replaceState({}, '', url.toString()); }; + +export const getServerProps = async ( + baseUrl: string, + apiKey?: string +): Promise<LlamaCppServerProps> => { + try { + const response = await fetch(`${baseUrl}/props`, { + headers: { + 'Content-Type': 'application/json', + ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}), + }, + }); + if (!response.ok) { + throw new Error('Failed to fetch server props'); + } + const data = await response.json(); + return data as LlamaCppServerProps; + } catch (error) { + console.error('Error fetching server props:', error); + throw error; + } +}; diff --git a/tools/server/webui/src/utils/storage.ts index 1dfc9d979..505693e92 100644 --- a/tools/server/webui/src/utils/storage.ts +++ b/tools/server/webui/src/utils/storage.ts @@ -116,6 +116,16 @@ const StorageUtils = { }); return conv; }, + /** + * update the name of a conversation + */ + async updateConversationName(convId: string, name: string): Promise<void> { + await db.conversations.update(convId, { + name, + lastModified: Date.now(), + }); + dispatchConversationChange(convId); + }, /** * if convId does not exist, throw an error */ diff --git a/tools/server/webui/src/utils/types.ts index 0eb774001..add48be4c 100644 --- a/tools/server/webui/src/utils/types.ts +++ b/tools/server/webui/src/utils/types.ts @@ -48,7 +48,10 @@ export interface Message { children: Message['id'][]; } -type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future +export type MessageExtra = + | MessageExtraTextFile + | MessageExtraImageFile + | MessageExtraContext; export interface MessageExtraTextFile { type: 'textFile'; @@ -56,12 +59,32 @@ export interface MessageExtraTextFile { content: string; } +export interface MessageExtraImageFile { + type: 'imageFile'; + name: string; + base64Url: string; +} + export interface MessageExtraContext { type: 'context'; + name: string; content: string; } -export type APIMessage = Pick<Message, 'role' | 'content'>; +export type APIMessageContentPart = + | { + type: 'text'; + text: string; + } + | { + type: 'image_url'; + image_url: { url: 
string }; + }; + +export type APIMessage = { + role: Message['role']; + content: string | APIMessageContentPart[]; +}; export interface Conversation { id: string; // format: `conv-{timestamp}` @@ -89,3 +112,12 @@ } export type CanvasData = CanvasPyInterpreter; + +// an incomplete list of props, containing only the ones we need +export interface LlamaCppServerProps { + build_info: string; + model_path: string; + n_ctx: number; + has_multimodal: boolean; + // TODO: support params +} diff --git a/tools/server/webui/vite.config.ts index b8a0f03d9..366df3b75 100644 --- a/tools/server/webui/vite.config.ts +++ b/tools/server/webui/vite.config.ts @@ -71,6 +71,7 @@ export default defineConfig({ server: { proxy: { '/v1': 'http://localhost:8080', + '/props': 'http://localhost:8080', }, headers: { 'Cross-Origin-Embedder-Policy': 'require-corp',