ProxyAI/src/main/java/ee/carlrobert/codegpt/completions/llama/LlamaModel.java

package ee.carlrobert.codegpt.completions.llama;

import static java.lang.String.format;
import static java.util.stream.Collectors.toSet;

import ee.carlrobert.codegpt.codecompletions.InfillPromptTemplate;
import ee.carlrobert.codegpt.completions.HuggingFaceModel;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import org.jetbrains.annotations.NotNull;

public enum LlamaModel {
  /**
   * Code Llama family. Chat prompts use {@link PromptTemplate#LLAMA}; infill prompts use
   * {@link InfillPromptTemplate#CODE_LLAMA}. The trailing list enumerates the selectable
   * Hugging Face variants of this family.
   */
  CODE_LLAMA(
      "Code Llama",
      "Code Llama is a family of large language models for code based on Llama 2 "
          + "providing state-of-the-art performance among open models, infilling capabilities, "
          + "support for large input contexts, and zero-shot instruction following ability for "
          + "programming tasks.",
      PromptTemplate.LLAMA,
      InfillPromptTemplate.CODE_LLAMA,
      List.of(
          HuggingFaceModel.CODE_LLAMA_7B_Q3,
          HuggingFaceModel.CODE_LLAMA_7B_Q4,
          HuggingFaceModel.CODE_LLAMA_7B_Q5,
          HuggingFaceModel.CODE_LLAMA_13B_Q3,
          HuggingFaceModel.CODE_LLAMA_13B_Q4,
          HuggingFaceModel.CODE_LLAMA_13B_Q5,
          HuggingFaceModel.CODE_LLAMA_34B_Q3,
          HuggingFaceModel.CODE_LLAMA_34B_Q4,
          HuggingFaceModel.CODE_LLAMA_34B_Q5)
  ),
  /**
   * CodeBooga family. Chat prompts use {@link PromptTemplate#ALPACA}. Declared through the
   * four-argument constructor, so {@link #getInfillPromptTemplate()} returns {@code null}.
   * The description embeds HTML list markup ({@code <ol>}/{@code <li>}).
   */
  CODE_BOOGA(
      "CodeBooga",
      "CodeBooga is a high-performing code instruct model created by merging two existing"
          + " code models: "
          + "<ol><li>Phind-CodeLlama-34B-v2</li><li>WizardCoder-Python-34B-V1.0</li></ol>",
      PromptTemplate.ALPACA,
      List.of(
          HuggingFaceModel.CODE_BOOGA_34B_Q3,
          HuggingFaceModel.CODE_BOOGA_34B_Q4,
          HuggingFaceModel.CODE_BOOGA_34B_Q5)),
  /**
   * Deepseek Coder family. Chat prompts use {@link PromptTemplate#ALPACA}; infill prompts use
   * {@link InfillPromptTemplate#DEEPSEEK_CODER}.
   */
  DEEPSEEK_CODER(
      "Deepseek Coder",
      "Deepseek Coder is composed of a series of code language models, each trained "
          + "from scratch on 2T tokens, with a composition of 87% code and 13% natural language "
          + "in both English and Chinese. It achieves state-of-the-art performance among "
          + "open-source code models on multiple programming languages and various benchmarks.",
      PromptTemplate.ALPACA,
      InfillPromptTemplate.DEEPSEEK_CODER,
      List.of(
          HuggingFaceModel.DEEPSEEK_CODER_1_3B_Q3,
          HuggingFaceModel.DEEPSEEK_CODER_1_3B_Q4,
          HuggingFaceModel.DEEPSEEK_CODER_1_3B_Q5,
          HuggingFaceModel.DEEPSEEK_CODER_6_7B_Q3,
          HuggingFaceModel.DEEPSEEK_CODER_6_7B_Q4,
          HuggingFaceModel.DEEPSEEK_CODER_6_7B_Q5,
          HuggingFaceModel.DEEPSEEK_CODER_33B_Q3,
          HuggingFaceModel.DEEPSEEK_CODER_33B_Q4,
          HuggingFaceModel.DEEPSEEK_CODER_33B_Q5)),
  /**
   * Deepseek R1 family. Chat prompts use {@link PromptTemplate#DEEPSEEK_R1}; infill prompts
   * reuse {@link InfillPromptTemplate#DEEPSEEK_CODER}.
   *
   * <p>NOTE(review): the infill template is shared with {@link #DEEPSEEK_CODER}; presumably the
   * listed R1 variants accept the same fill-in-the-middle format - confirm.
   */
  DEEPSEEK_R1(
      "Deepseek R1",
      "DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) "
          + "without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable "
          + "performance on reasoning. DeepSeek-R1 achieves performance comparable to OpenAI-o1 "
          + "across math, code, and reasoning tasks.",
      PromptTemplate.DEEPSEEK_R1,
      InfillPromptTemplate.DEEPSEEK_CODER,
      List.of(
          HuggingFaceModel.DEEPSEEK_R1_1_5B_Q6,
          HuggingFaceModel.DEEPSEEK_R1_7B_Q4,
          HuggingFaceModel.DEEPSEEK_R1_7B_Q6,
          HuggingFaceModel.DEEPSEEK_R1_14B_Q4,
          HuggingFaceModel.DEEPSEEK_R1_14B_Q6)),
  /**
   * Phind Code Llama family. Chat prompts use {@link PromptTemplate#ALPACA}. Declared through
   * the four-argument constructor, so {@link #getInfillPromptTemplate()} returns {@code null}.
   */
  PHIND_CODE_LLAMA(
      "Phind Code Llama",
      "This model is fine-tuned from Phind-CodeLlama-34B-v1 on an additional 1.5B tokens "
          + "high-quality programming-related data, achieving 73.8% pass@1 on HumanEval. "
          + "It's the current state-of-the-art amongst open-source models.",
      PromptTemplate.ALPACA,
      List.of(
          HuggingFaceModel.PHIND_CODE_LLAMA_34B_Q3,
          HuggingFaceModel.PHIND_CODE_LLAMA_34B_Q4,
          HuggingFaceModel.PHIND_CODE_LLAMA_34B_Q5)),
  /**
   * WizardCoder (Python) family. Chat prompts use {@link PromptTemplate#ALPACA}. Declared
   * through the four-argument constructor, so {@link #getInfillPromptTemplate()} returns
   * {@code null}.
   */
  WIZARD_CODER_PYTHON(
      "WizardCoder - Python",
      "WizardCoder, a Code Evol-Instruct fine-tuned Code LLM, which achieves "
          + "the 73.2 pass@1 and surpasses GPT4 (2023/03/15), ChatGPT-3.5, "
          + "and Claude2 on the HumanEval Benchmarks.",
      PromptTemplate.ALPACA,
      List.of(
          HuggingFaceModel.WIZARD_CODER_PYTHON_7B_Q3,
          HuggingFaceModel.WIZARD_CODER_PYTHON_7B_Q4,
          HuggingFaceModel.WIZARD_CODER_PYTHON_7B_Q5,
          HuggingFaceModel.WIZARD_CODER_PYTHON_13B_Q3,
          HuggingFaceModel.WIZARD_CODER_PYTHON_13B_Q4,
          HuggingFaceModel.WIZARD_CODER_PYTHON_13B_Q5,
          HuggingFaceModel.WIZARD_CODER_PYTHON_34B_Q3,
          HuggingFaceModel.WIZARD_CODER_PYTHON_34B_Q4,
          HuggingFaceModel.WIZARD_CODER_PYTHON_34B_Q5)),
  /**
   * Llama 3 family. Chat prompts use {@link PromptTemplate#LLAMA_3}. Declared through the
   * four-argument constructor, so {@link #getInfillPromptTemplate()} returns {@code null}.
   */
  LLAMA_3(
      "Llama 3",
      "Llama 3 is a family of large language models (LLMs), a collection of pretrained and "
          + "instruction tuned generative text models in 8 and 70B sizes. The Llama 3 instruction "
          + "tuned models are optimized for dialogue use cases and outperform many of the available"
          + " open source chat models on common industry benchmarks. Further, in developing these "
          + "models, we took great care to optimize helpfulness and safety.",
      PromptTemplate.LLAMA_3,
      List.of(
          HuggingFaceModel.LLAMA_3_8B_IQ3_M,
          HuggingFaceModel.LLAMA_3_8B_Q4_K_M,
          HuggingFaceModel.LLAMA_3_8B_Q5_K_M,
          HuggingFaceModel.LLAMA_3_8B_Q6_K,
          HuggingFaceModel.LLAMA_3_8B_Q8_0,
          HuggingFaceModel.LLAMA_3_70B_IQ1,
          HuggingFaceModel.LLAMA_3_70B_IQ2_XS,
          HuggingFaceModel.LLAMA_3_70B_Q4_K_M)),
  /**
   * Phi-3 Mini family. Chat prompts use {@link PromptTemplate#PHI_3}. Declared through the
   * four-argument constructor, so {@link #getInfillPromptTemplate()} returns {@code null}.
   */
  PHI_3(
      "Phi-3 Mini",
      "Phi-3 Mini is a 3.8B parameters, lightweight, state-of-the-art open model. "
          + "When assessed against benchmarks testing common sense, language understanding, math, "
          + "code, long context and logical reasoning, Phi-3 Mini-4K-Instruct showcased a robust "
          + "and state-of-the-art performance among models with less than 13 billion parameters.",
      PromptTemplate.PHI_3,
      List.of(
          HuggingFaceModel.PHI_3_3_8B_4K_IQ4_NL,
          HuggingFaceModel.PHI_3_3_8B_4K_Q5_K_M,
          HuggingFaceModel.PHI_3_3_8B_4K_Q6_K,
          HuggingFaceModel.PHI_3_3_8B_4K_Q8_0,
          HuggingFaceModel.PHI_3_3_8B_4K_FP16)),
  /**
   * Phi-3 Medium 128K family. Shares {@link PromptTemplate#PHI_3} with {@link #PHI_3}. Declared
   * through the four-argument constructor, so {@link #getInfillPromptTemplate()} returns
   * {@code null}.
   *
   * <p>Every line of the description text block ends with a line-continuation backslash, so
   * the resulting string contains no line breaks.
   */
  PHI_3_MEDIUM(
      "Phi-3 Medium 128K", """
      The Phi-3-Medium-128K-Instruct is a 14B parameters, lightweight, state-of-the-art open model \
      trained with the Phi-3 datasets that includes both synthetic data and the filtered publicly \
      available websites data with a focus on high-quality and reasoning dense properties. \
      The model has underwent a post-training process that incorporates both supervised fine-tuning\
       and direct preference optimization for the instruction following and safety measures. \
      When assessed against benchmarks testing common sense, language understanding, math, code, \
      long context and logical reasoning, Phi-3-Medium-128K-Instruct showcased a robust and \
      state-of-the-art performance among models of the same-size and next-size-up.""",
      PromptTemplate.PHI_3,
      List.of(
          HuggingFaceModel.PHI_3_14B_128K_IQ3_M,
          HuggingFaceModel.PHI_3_14B_128K_Q3_K_M,
          HuggingFaceModel.PHI_3_14B_128K_IQ4_NL,
          HuggingFaceModel.PHI_3_14B_128K_Q4_K_M,
          HuggingFaceModel.PHI_3_14B_128K_Q5_K_M,
          HuggingFaceModel.PHI_3_14B_128K_Q6_K,
          HuggingFaceModel.PHI_3_14B_128K_Q8_0)),
  /**
   * CodeGemma Instruct family. Chat prompts use {@link PromptTemplate#CODE_GEMMA}; infill
   * prompts use {@link InfillPromptTemplate#CODE_GEMMA}. The description contains an explicit
   * {@code \n} line break before its last sentence.
   */
  CODE_GEMMA(
      "CodeGemma Instruct",
      "CodeGemma Instruct is the first in a series of coding models released by Google. "
          + "As an instruct model, it specializes in being asked coding related questions, but can "
          + "also function as an autocomplete/fill-in-middle model for tools like co-pilot.\n"
          + "This model is perfect for general coding questions or code generation.",
      PromptTemplate.CODE_GEMMA,
      InfillPromptTemplate.CODE_GEMMA,
      List.of(
          HuggingFaceModel.CODE_GEMMA_7B_Q3_K_L,
          HuggingFaceModel.CODE_GEMMA_7B_Q4_K_M,
          HuggingFaceModel.CODE_GEMMA_7B_Q5_K_M,
          HuggingFaceModel.CODE_GEMMA_7B_Q6_K,
          HuggingFaceModel.CODE_GEMMA_7B_Q8_0)),
  /**
   * CodeQwen1.5 family. Chat prompts use {@link PromptTemplate#CODE_QWEN}; infill prompts use
   * {@link InfillPromptTemplate#CODE_QWEN}.
   *
   * <p>Every line of the description text block ends with a line-continuation backslash, so
   * the resulting string contains no line breaks.
   */
  CODE_QWEN(
      "CodeQwen1.5", """
          A specialized codeLLM built upon the Qwen1.5 language model. \
          CodeQwen1.5-7B has been pretrained with around 3 trillion tokens of code-related data. \
          It supports an extensive repertoire of 92 programming languages, and it exhibits \
          exceptional capacity in long-context understanding and generation with the ability to \
          process information of 64K tokens. In terms of performance, CodeQwen1.5 demonstrates \
          impressive capabilities in basic code generation, long-context modelling, code editing \
          and SQL. We believe this model can significantly enhance developer productivity and \
          streamline software development workflows within diverse technological environments.""",
      PromptTemplate.CODE_QWEN,
      InfillPromptTemplate.CODE_QWEN,
      List.of(
          HuggingFaceModel.CODE_QWEN_1_5_7B_Q3_K_M,
          HuggingFaceModel.CODE_QWEN_1_5_7B_Q4_K_M,
          HuggingFaceModel.CODE_QWEN_1_5_7B_Q5_K_M,
          HuggingFaceModel.CODE_QWEN_1_5_7B_Q6_K)),
  /**
   * Qwen2.5-Coder family. Chat prompts reuse {@link PromptTemplate#CODE_QWEN}; infill prompts
   * use the dedicated {@link InfillPromptTemplate#CODE_QWEN_2_5}.
   *
   * <p>The description text block keeps real line breaks (a blank line and a dash-prefixed
   * list) and, because its closing delimiter sits on its own line, ends with a newline.
   */
  CODE_QWEN2_5_CODER(
      "CodeQwen2.5 Coder", """
          Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models \
          (formerly known as CodeQwen).
          It brings the following improvements upon CodeQwen1.5:

          - Significantly improvements in code generation, code reasoning and code fixing. \
          Base on the strong Qwen2.5, we scale up the training tokens into 5.5 trillion including \
          source code, text-code grounding, Synthetic data, etc.
          - A more comprehensive foundation for real-world applications such as Code Agents. \
          Not only enhancing coding capabilities but also maintaining its strengths in \
          mathematics and general competencies.
          - Long-context Support up to 128K tokens.
          """,
      PromptTemplate.CODE_QWEN,
      InfillPromptTemplate.CODE_QWEN_2_5,
      List.of(
          HuggingFaceModel.CODE_QWEN_2_5_1_5B_Q6_K,
          HuggingFaceModel.CODE_QWEN_2_5_1_5B_Q8_0,
          HuggingFaceModel.CODE_QWEN_2_5_3B_Q4_K_M,
          HuggingFaceModel.CODE_QWEN_2_5_3B_Q6_K,
          HuggingFaceModel.CODE_QWEN_2_5_3B_Q8_0,
          HuggingFaceModel.CODE_QWEN_2_5_7B_Q4_K_M,
          HuggingFaceModel.CODE_QWEN_2_5_7B_Q6_K,
          HuggingFaceModel.CODE_QWEN_2_5_7B_Q8_0,
          HuggingFaceModel.CODE_QWEN_2_5_14B_Q4_K_M,
          HuggingFaceModel.CODE_QWEN_2_5_14B_Q6_K,
          HuggingFaceModel.CODE_QWEN_2_5_14B_Q8_0,
          HuggingFaceModel.CODE_QWEN_2_5_32B_Q4_K_M,
          HuggingFaceModel.CODE_QWEN_2_5_32B_Q6_K,
          HuggingFaceModel.CODE_QWEN_2_5_32B_Q8_0)),
  /**
   * Stable Code Instruct family. Chat prompts use {@link PromptTemplate#STABLE_CODE}; infill
   * prompts reuse {@link InfillPromptTemplate#CODE_QWEN}.
   *
   * <p>NOTE(review): the infill template is shared with {@link #CODE_QWEN}; presumably both
   * models accept the same fill-in-the-middle format - confirm.
   *
   * <p>The description text block contains one real line break (after the first paragraph).
   */
  STABLE_CODE(
      "Stable Code Instruct", """
      stable-code-instruct-3b is a 2.7B billion parameter decoder-only language model tuned from \
      stable-code-3b. This model was trained on a mix of publicly available datasets, synthetic \
      datasets using Direct Preference Optimization (DPO).
      This instruct tune demonstrates state-of-the-art performance (compared to models of similar \
      size) on the MultiPL-E metrics across multiple programming languages tested using BigCode's \
      Evaluation Harness, and on the code portions of MT Bench. The model is fine tuned to make it \
      usable in tasks like general purpose Code/Software Engineering like conversations and \
      SQL related generation and conversation.""",
      PromptTemplate.STABLE_CODE,
      InfillPromptTemplate.CODE_QWEN,
      List.of(
          HuggingFaceModel.STABLE_CODE_3B_Q3_K_M,
          HuggingFaceModel.STABLE_CODE_3B_Q4_K_M,
          HuggingFaceModel.STABLE_CODE_3B_Q5_K_M,
          HuggingFaceModel.STABLE_CODE_3B_Q6_K,
          HuggingFaceModel.STABLE_CODE_3B_Q8_0)),
  /**
   * Codestral family. Chat prompts use {@link PromptTemplate#MIXTRAL_INSTRUCT}; infill prompts
   * use {@link InfillPromptTemplate#CODESTRAL}.
   *
   * <p>Every line of the description text block ends with a line-continuation backslash, so
   * the resulting string contains no line breaks.
   */
  CODESTRAL(
      "Codestral", """
      Codestral is an open-weight generative AI model explicitly designed for code generation \
      tasks. It helps developers write and interact with code through a shared instruction and \
      completion API endpoint. As it masters code and English, it can be used to design advanced \
      AI applications for software developers. Codestral is trained on a diverse dataset of 80+ \
      programming languages. Codestral saves developers time and effort: it can complete coding \
      functions, write tests, and complete any partial code using a fill-in-the-middle mechanism. \
      Interacting with Codestral will help level up the developer’s coding game and reduce the \
      risk of errors and bugs.""",
      PromptTemplate.MIXTRAL_INSTRUCT,
      InfillPromptTemplate.CODESTRAL,
      List.of(
          HuggingFaceModel.CODESTRAL_22B_32K_Q3_K_M,
          HuggingFaceModel.CODESTRAL_22B_32K_Q4_K_M,
          HuggingFaceModel.CODESTRAL_22B_32K_Q5_K_M,
          HuggingFaceModel.CODESTRAL_22B_32K_Q6_K,
          HuggingFaceModel.CODESTRAL_22B_32K_Q8_0)),
  ;

  // Display name; used by both toString variants.
  private final String label;
  // Descriptive text for the family; some entries contain HTML markup or line breaks.
  private final String description;
  // Prompt template exposed through getPromptTemplate().
  private final PromptTemplate promptTemplate;
  // Infill template; null for entries declared through the four-argument constructor.
  private final InfillPromptTemplate infillPromptTemplate;
  // Variants of this family, in declaration order.
  private final List<HuggingFaceModel> huggingFaceModels;

  /**
   * Declares a model family that has no infill template; the infill template is stored as
   * {@code null}.
   *
   * @param label display name of the family
   * @param description descriptive text for the family
   * @param promptTemplate template exposed through {@code getPromptTemplate()}
   * @param huggingFaceModels variants belonging to this family
   */
  LlamaModel(
      String label,
      String description,
      PromptTemplate promptTemplate,
      List<HuggingFaceModel> huggingFaceModels) {
    this(label, description, promptTemplate, null, huggingFaceModels);
  }

  /**
   * Declares a model family.
   *
   * @param label display name of the family
   * @param description descriptive text for the family
   * @param promptTemplate template exposed through {@code getPromptTemplate()}
   * @param infillPromptTemplate infill template, or {@code null} when the family has none
   * @param huggingFaceModels variants belonging to this family; must be non-null and must not
   *     contain {@code null}
   */
  LlamaModel(
      String label,
      String description,
      PromptTemplate promptTemplate,
      InfillPromptTemplate infillPromptTemplate,
      List<HuggingFaceModel> huggingFaceModels) {
    this.label = label;
    this.description = description;
    this.promptTemplate = promptTemplate;
    this.infillPromptTemplate = infillPromptTemplate;
    // The list is handed out as-is by the accessor, so keep an unmodifiable snapshot.
    // List.copyOf returns the argument itself when it is already an unmodifiable List.of(...).
    this.huggingFaceModels = List.copyOf(huggingFaceModels);
  }

  /**
   * Finds the model family whose variant list contains the given Hugging Face model.
   *
   * @param huggingFaceModel the variant to look up
   * @return the first family, in declaration order, that lists {@code huggingFaceModel}
   * @throws IllegalArgumentException if {@code huggingFaceModel} is {@code null} or is not
   *     listed by any family
   */
  public static @NotNull LlamaModel findByHuggingFaceModel(HuggingFaceModel huggingFaceModel) {
    if (huggingFaceModel == null) {
      // Unmodifiable List.of(...) lists answer contains(null) with a message-less
      // NullPointerException; fail with an explicit message instead.
      throw new IllegalArgumentException("huggingFaceModel must not be null");
    }
    for (LlamaModel model : values()) {
      if (model.getHuggingFaceModels().contains(huggingFaceModel)) {
        return model;
      }
    }
    throw new IllegalArgumentException("Unable to find correct LLM for " + huggingFaceModel);
  }

  /**
   * Returns the variants of this family whose parameter size equals the selected size.
   *
   * @param selectedModelSize the size to match; may be {@code null}
   * @return the matching variants in list order, or an empty list when
   *     {@code selectedModelSize} is {@code null}
   */
  public @NotNull List<HuggingFaceModel> filterSelectedModelsBySize(ModelSize selectedModelSize) {
    if (selectedModelSize == null) {
      return List.of();
    }
    return getHuggingFaceModels().stream()
        .filter(candidate -> selectedModelSize.size() == candidate.getParameterSize())
        .toList();
  }

  /**
   * Tells whether at least one variant of this family reports itself as downloaded.
   *
   * @return {@code true} as soon as one variant's {@code isDownloaded()} is true
   */
  public boolean anyDownloaded() {
    for (HuggingFaceModel variant : huggingFaceModels) {
      if (variant.isDownloaded()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the marker for this family, based on whether any of its variants is downloaded.
   *
   * @return the result of {@link #getDownloadedMarker(boolean)} for {@link #anyDownloaded()}
   */
  public String getDownloadedMarker() {
    boolean hasDownloadedVariant = anyDownloaded();
    return getDownloadedMarker(hasDownloadedVariant);
  }

  /**
   * Maps a download state to its one-character marker.
   *
   * @param downloaded whether the item is downloaded
   * @return a check mark when downloaded, otherwise U+2001 (EM QUAD, a wide space) -
   *     presumably so that unmarked labels line up with marked ones
   */
  public static String getDownloadedMarker(boolean downloaded) {
    if (downloaded) {
      return "✓";
    }
    return "\u2001";
  }

  /**
   * Builds the path {@code .codegpt/models/gguf} under the directory named by the
   * {@code user.home} system property.
   *
   * @return the resolved path; this method does not create or check the directory
   */
  public static @NotNull Path getLlamaModelsPath() {
    String userHome = System.getProperty("user.home");
    return Paths.get(userHome, ".codegpt/models/gguf");
  }

  /**
   * Renders this family as {@code <marker> <label> <size range>}, separated by single spaces.
   */
  @Override
  public String toString() {
    String marker = getDownloadedMarker();
    String sizeRange = getFormattedModelSizeRange();
    return marker + " " + label + " " + sizeRange;
  }

  /**
   * Renders this family together with one variant as
   * {@code <label> <parameter size>B <quantization>-bit}, in the shape of
   * {@code CodeLlama 7B 4-bit}.
   *
   * @param hfm the variant whose parameter size and quantization are shown
   * @return the formatted text
   */
  public @NotNull String toString(@NotNull HuggingFaceModel hfm) {
    return format("%s %dB %d-bit", label, hfm.getParameterSize(), hfm.getQuantization());
  }

  /** Returns the display name of this family. */
  public String getLabel() {
    return this.label;
  }

  /** Returns the descriptive text of this family. */
  public String getDescription() {
    return this.description;
  }

  /** Returns the prompt template this family was declared with. */
  public PromptTemplate getPromptTemplate() {
    return this.promptTemplate;
  }

  /**
   * Returns the infill template, or {@code null} for families declared without one.
   */
  public InfillPromptTemplate getInfillPromptTemplate() {
    return this.infillPromptTemplate;
  }

  /** Returns the variants of this family; the stored list itself is returned, not a copy. */
  public List<HuggingFaceModel> getHuggingFaceModels() {
    return this.huggingFaceModels;
  }

  /**
   * Picks the downloaded variant with the largest parameter size; when several downloaded
   * variants share that size, the one listed earliest wins. Falls back to the first listed
   * variant when nothing is downloaded.
   *
   * @return the chosen variant
   */
  public HuggingFaceModel getLastExistingModelOrFirst() {
    Comparator<HuggingFaceModel> byParameterSize =
        Comparator.comparing(HuggingFaceModel::getParameterSize);
    HuggingFaceModel largestDownloaded = null;
    for (HuggingFaceModel candidate : huggingFaceModels) {
      if (!candidate.isDownloaded()) {
        continue;
      }
      // Replace only on a strictly larger size so that ties keep the earlier variant.
      if (largestDownloaded == null || byParameterSize.compare(largestDownloaded, candidate) < 0) {
        largestDownloaded = candidate;
      }
    }
    HuggingFaceModel firstListed = huggingFaceModels.get(0);
    return largestDownloaded != null ? largestDownloaded : firstListed;
  }

  /**
   * Summarizes the parameter sizes offered by this family.
   *
   * @return {@code <size>B} when every variant has the same size, otherwise
   *     {@code (<min>B - <max>B)}
   */
  public String getFormattedModelSizeRange() {
    var distinctSizes = huggingFaceModels.stream()
        .map(HuggingFaceModel::getParameterSize)
        .collect(toSet());
    boolean singleSize = distinctSizes.size() == 1;
    if (!singleSize) {
      var smallest = Collections.min(distinctSizes);
      var largest = Collections.max(distinctSizes);
      return format("(%dB - %dB)", smallest, largest);
    }
    return distinctSizes.iterator().next() + "B";
  }

  /**
   * Lists one {@link ModelSize} per distinct parameter size, in ascending size order.
   *
   * <p>Entries are sorted with the downloaded one first within each size, and only the first
   * entry per size is kept, so a size is flagged as downloaded when any variant of that size
   * is downloaded.
   *
   * @return an unmodifiable list of the unique sizes
   */
  public List<ModelSize> getSortedUniqueModelSizes() {
    List<ModelSize> ordered = huggingFaceModels.stream()
        .map(variant -> new ModelSize(variant.getParameterSize(), variant.isDownloaded()))
        .sorted()
        .toList();
    Set<ModelSize> onePerSize = new LinkedHashSet<>();
    BiConsumer<Set<ModelSize>, ModelSize> addIfSizeUnseen = ModelSize.skipSameSize();
    for (ModelSize entry : ordered) {
      addIfSizeUnseen.accept(onePerSize, entry);
    }
    return onePerSize.stream().toList();
  }

  /**
   * Returns all families ordered alphabetically by enum constant name (not by label).
   *
   * @return an unmodifiable list of every {@code LlamaModel}
   */
  public static List<LlamaModel> getSorted() {
    LlamaModel[] byName = values();
    Arrays.sort(byName, Comparator.comparing(Enum::name));
    return Arrays.stream(byName).toList();
  }

  /**
   * A parameter size paired with a downloaded flag, as built from a variant's
   * {@code getParameterSize()} and {@code isDownloaded()}.
   *
   * @param size parameter size, rendered with a trailing {@code B}
   * @param downloaded whether the originating variant is downloaded
   */
  public record ModelSize(int size, boolean downloaded) implements Comparable<ModelSize> {

    /**
     * Orders by ascending size; within one size the downloaded entry comes first:
     * [  7B, ✓ 13B,   13B,   34B].
     */
    @Override
    public int compareTo(@NotNull ModelSize other) {
      int bySize = Integer.compare(size, other.size);
      if (bySize != 0) {
        return bySize;
      }
      // Operands are swapped on purpose so that true (downloaded) sorts before false.
      return Boolean.compare(other.downloaded, downloaded);
    }

    /**
     * Creates an accumulator that adds an entry to the set only when the set holds no entry
     * of the same size yet, regardless of the downloaded flag.
     */
    private static @NotNull BiConsumer<Set<ModelSize>, ModelSize> skipSameSize() {
      return (seen, candidate) -> {
        boolean sizeAlreadyPresent = seen.stream()
            .anyMatch(existing -> existing.size() == candidate.size());
        if (!sizeAlreadyPresent) {
          seen.add(candidate);
        }
      };
    }

    /** Renders as {@code <marker> <size>B}, using the enclosing enum's downloaded marker. */
    @Override
    public String toString() {
      String marker = getDownloadedMarker(downloaded);
      return String.format("%s %dB", marker, size);
    }

  }
}