prima.cpp/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Peter Sugihara afd997ab60
llama.swiftui : fix infinite loop, ouput timings, buff UI (#4674)
* fix infinite loop

* slight UI simplification, clearer UX

* clearer UI text, add timings to completion log
2023-12-29 15:58:56 +02:00

100 lines
3 KiB
Swift

import Foundation
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
@Published var cacheCleared = false
let NS_PER_S = 1_000_000_000.0
private var llamaContext: LlamaContext?
private var defaultModelUrl: URL? {
Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
}
init() {
do {
try loadModel(modelUrl: defaultModelUrl)
} catch {
messageLog += "Error!\n"
}
}
func loadModel(modelUrl: URL?) throws {
if let modelUrl {
messageLog += "Loading model...\n"
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Load a model from the list below\n"
}
}
func complete(text: String) async {
guard let llamaContext else {
return
}
let t_start = DispatchTime.now().uptimeNanoseconds
await llamaContext.completion_init(text: text)
let t_heat_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_heat_end - t_start) / NS_PER_S
messageLog += "\(text)"
while await llamaContext.n_cur < llamaContext.n_len {
let result = await llamaContext.completion_loop()
messageLog += "\(result)"
}
let t_end = DispatchTime.now().uptimeNanoseconds
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
await llamaContext.clear()
messageLog += """
\n
Done
Heat up took \(t_heat)s
Generated \(tokens_per_second) t/s\n
"""
}
func bench() async {
guard let llamaContext else {
return
}
messageLog += "\n"
messageLog += "Running benchmark...\n"
messageLog += "Model info: "
messageLog += await llamaContext.model_info() + "\n"
let t_start = DispatchTime.now().uptimeNanoseconds
let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
let t_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_end - t_start) / NS_PER_S
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
// if more than 5 seconds, then we're probably running on a slow device
if t_heat > 5.0 {
messageLog += "Heat up time is too long, aborting benchmark\n"
return
}
let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
messageLog += "\(result)"
messageLog += "\n"
}
func clear() async {
guard let llamaContext else {
return
}
await llamaContext.clear()
messageLog = ""
}
}