Merge commit 'df270ef745' into concedo_experimental

# Conflicts:
#	Makefile
#	common/CMakeLists.txt
#	common/common.h
#	common/sampling.cpp
#	common/sampling.h
#	examples/infill/infill.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/quantize-stats/quantize-stats.cpp
#	examples/server/server.cpp
#	include/llama.h
#	src/llama-sampling.cpp
#	src/llama-sampling.h
#	src/llama.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tests/test-llama-grammar.cpp
#	tests/test-sampling.cpp
2025-09-11 17:44:38 +00:00 · 2024-09-09 17:10:08 +08:00 · 2024-09-09 17:10:08 +08:00 · 12fd16bfd4
commit 12fd16bfd4
parent 70cdb55cc9 df270ef745
86 changed files with 3406 additions and 7795 deletions
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -1,12 +0,0 @@
-# llama.cpp/examples/llama.swiftui
-
-Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
-point for more advanced projects.
-
-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
-
-![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
-
-Video demonstration:
-
-https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
+    private var sampling: UnsafeMutablePointer<llama_sampler>
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    var is_done: Bool = false
@@ -42,9 +43,15 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
+        let sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
    }

    deinit {
+        llama_sampler_free(sampling)
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
@@ -69,7 +76,6 @@ actor LlamaContext {
        print("Using \(n_threads) threads")

        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
        ctx_params.n_threads       = Int32(n_threads)
        ctx_params.n_threads_batch = Int32(n_threads)
@@ -144,20 +150,9 @@ actor LlamaContext {
    func completion_loop() -> String {
        var new_token_id: llama_token = 0

-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
-
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
+        llama_sampler_accept(sampling, new_token_id)

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")