# koboldcpp/.github/workflows/server-self-hosted.yml

name: Server (self-hosted)

# Triggers: a manual dispatch, or a push to master that touches build or
# source files.
on:
  # Manual run from the Actions UI or the API.
  workflow_dispatch:
    inputs:
      # Optional commit to check out; when left empty the Clone step falls
      # back to the commit/ref that triggered the run.
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      # Boolean toggle supplied at dispatch time.
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches: [master]
    # Only pushes that modify at least one matching path start a run.
    paths:
      - '.github/workflows/server-self-hosted.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'tools/server/**.*'

# Workflow-wide environment variables, visible to every step of every job.
# Values are quoted because environment variables are always strings.
# NOTE(review): presumably these configure the llama logger (colors, prefix,
# timestamps, verbosity) — confirm against the consuming binaries.
env:
  LLAMA_LOG_COLORS: "1"
  LLAMA_LOG_PREFIX: "1"
  LLAMA_LOG_TIMESTAMPS: "1"
  LLAMA_LOG_VERBOSITY: "10"

# Runs that share a group key are serialized, and a newer run cancels the one
# still in progress. github.head_ref is only populated for pull-request
# events, so for the triggers declared in this file (workflow_dispatch, push)
# the key ends in github.run_id.
concurrency:
  cancel-in-progress: true
  group: >-
    ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

jobs:
  # Builds llama-server on the self-hosted macOS/ARM64 runner and runs the
  # server integration tests once per matrix variant.
  server-metal:
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    name: server-metal (${{ matrix.wf_name }})
    strategy:
      matrix:
        build_type: [Release]
        wf_name: ["GPUx1"]
        # Every include entry carries a wf_name that differs from the base
        # value, so each one becomes its own job instead of extending the base
        # combination. extra_args is a space-separated list of NAME=VALUE
        # pairs exported before pytest runs; the base "GPUx1" job defines none.
        include:
          - build_type: Release
            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name:    "GPUx1, backend-sampling"
          - build_type: Release
            extra_args: "GGML_METAL_DEVICES=2"
            wf_name:    "GPUx2"
          - build_type: Release
            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name:    "GPUx2, backend-sampling"
      # Let the remaining variants finish even if one of them fails.
      fail-fast: false

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # First non-empty value wins: the manual `sha` input, a pull-request
          # head commit, then the triggering commit / ref.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: "24"
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

      # `--config` is ignored by single-config generators (Makefiles, Ninja),
      # so the build type is also passed at configure time to make
      # matrix.build_type effective regardless of the generator in use.
      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server

      # matrix.extra_args is handed over through the step environment and only
      # exported when non-empty: for the base variant it is undefined, and a
      # bare `export` would print the runner's whole exported environment into
      # the job log. The expansion is deliberately unquoted because it may hold
      # several NAME=VALUE words.
      # The `if` condition is kept as-is; no matrix entry sets disabled_on_pr
      # and this workflow has no pull_request trigger, so it evaluates to true.
      - name: Tests
        id: server_integration_tests
        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
        env:
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          if [ -n "${MATRIX_EXTRA_ARGS}" ]; then
            export ${MATRIX_EXTRA_ARGS}
          fi
          pytest -v -x -m "not slow"

      # Honors the `slow_tests` workflow_dispatch input, which is a string in
      # github.event.inputs. Reuses the virtualenv created by the Tests step
      # and drops the "not slow" marker filter.
      # NOTE(review): SLOW_TESTS=1 is assumed to be the opt-in switch read by
      # the test suite — confirm against tools/server/tests.
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ github.event.inputs.slow_tests == 'true' }}
        env:
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          if [ -n "${MATRIX_EXTRA_ARGS}" ]; then
            export ${MATRIX_EXTRA_ARGS}
          fi
          SLOW_TESTS=1 pytest -v -x

  # TODO: provision CUDA runner
  #  server-cuda:
  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
  #
  #    name: server-cuda (${{ matrix.wf_name }})
  #    strategy:
  #      matrix:
  #        build_type: [Release]
  #        wf_name: ["GPUx1"]
  #        include:
  #          - build_type: Release
  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
  #            wf_name:    "GPUx1, backend-sampling"
  #      fail-fast: false
  #
  #    steps:
  #      - name: Clone
  #        id: checkout
  #        uses: actions/checkout@v6
  #        with:
  #          fetch-depth: 0
  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
  #
  #      - name: Build
  #        id: cmake_build
  #        run: |
  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
  #          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
  #
  #      - name: Tests
  #        id: server_integration_tests
  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
  #        run: |
  #          cd tools/server/tests
  #          python3 -m venv venv
  #          source venv/bin/activate
  #          pip install -r requirements.txt
  #          export ${{ matrix.extra_args }}
  #          pytest -v -x -m "not slow"