#include "ttssampler.h"

#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>
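
// A minimal usage sketch (illustrative only; assumes `smp` is a configured
// sampler and that n_output_heads / vocab_size are accessible). Logits are laid
// out as one contiguous block per output head, i.e. entry (i, ii) lives at
// logits[i * vocab_size + ii]:
//
//   std::vector<float> logits(smp.n_output_heads * smp.vocab_size);
//   // ... fill logits from the model ...
//   std::vector<uint32_t> tokens;
//   smp.sample(logits.data(), tokens); // appends one token id per output head
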
void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
    // assume that logits points to the start of the first head's token outputs.
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    // initialize the per-head repetition state before anything reads
    // last_token_ids / repetition_counts; max, softmax and topk all consult them
    // whenever a penalty is set.
    if (has_repetition_penalty && (last_token_ids.empty() || repetition_counts.empty())) {
        reset();
    }
    if (!do_sample) {
        return max(logits, output_tokens);
    }
    std::vector<uint32_t> max_vals;
    // max_head_probs is only used when top-p is applied; it addresses the case in
    // which top-k and top-p together leave the cumulative probability of the
    // nucleus less than or equal to top_p.
    std::vector<float> max_head_probs;

    // Finding each head's maximum first lets us perform an effective softmax
    // without logarithms or big-number calculations. Additionally, by avoiding
    // large-number division we drastically improve the stability of our softmax
    // implementation.
    max(logits, max_vals);
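    // (The identity relied on: softmax(x)_k = exp(x_k - m) / sum_j exp(x_j - m)
    // holds for any constant m; choosing m as the head's maximum keeps every
    // exponent <= 0, so expf can neither overflow nor lose the dominant term.)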

    std::vector<std::vector<size_t>> picks;
    bool use_nucleus_sampling = false;
    bool performed_softmax = false;

    if (top_p < 1.0) {
        // when nucleus sampling via top-p we must softmax before taking the top-k
        // samples so that we don't trim beyond top_p; without top-p it is more
        // efficient to softmax after the top-k nucleus has been selected.
        softmax(logits, picks, max_vals);
        performed_softmax = true;
    }
    if (top_k > 0 && top_k < vocab_size) {
        picks = topk(logits, performed_softmax);
        use_nucleus_sampling = true;
    }

    if (top_p >= 1.0) {
        softmax(logits, picks, max_vals);
        performed_softmax = true;
    }

    if (top_p < 1.0) {
        topp(logits, picks, max_head_probs);
        use_nucleus_sampling = true;
    }
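
    // Net effect of the branches above:
    //   top_p < 1:              softmax -> (optional) top-k -> top-p trim
    //   top_p >= 1, top-k set:  top-k on raw logits -> softmax over that nucleus
    //   neither:                softmax over the full vocabulary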

    std::minstd_rand gen(std::random_device{}());
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
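    // Inverse-CDF draw per head: walk the candidate list accumulating probability
    // until the draw falls inside a candidate's interval. For example, with
    // probabilities {0.5, 0.3, 0.2} a draw of 0.6 lands in the second candidate's
    // interval (0.5, 0.8].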
    for (int i = 0; i < n_output_heads; i++) {
        // when top-p trimmed the nucleus, rescale the draw so that it never
        // exceeds the nucleus's total probability mass.
        float assignment = top_p < 1.0 ? dist(gen) * max_head_probs[i] : dist(gen);
        float cumulative = 0.0f;
        size_t candidates = use_nucleus_sampling ? picks[i].size() : (size_t) vocab_size;
        for (size_t j = 0; j < candidates; j++) {
            int ii = use_nucleus_sampling ? (int) picks[i][j] : (int) j;
            cumulative += *(logits + (i * vocab_size + ii));
            // with top_k and top_p, floating point error can leave the assignment
            // above the final cumulative value, so always fall back to the last
            // candidate rather than indexing past the end.
            if (assignment <= cumulative || j + 1 >= candidates) {
                if (has_repetition_penalty) {
                    if (last_token_ids[i] != ii) {
                        repetition_counts[i] = 0;
                    }
                    last_token_ids[i] = ii;
                    repetition_counts[i] += 1;
                }
                output_tokens.push_back(ii);
                break;
            }
        }
    }
}
void sampler::reset() {
    if (repetition_penalty != 1.0f) {
        last_token_ids.clear();
        repetition_counts.clear();
        for (int i = 0; i < n_output_heads; i++) {
            last_token_ids.push_back(-1);
            repetition_counts.push_back(0);
        }
    }
}
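// For each head, the transform below computes, for candidate k:
//   p_k = exp(x'_k - m') / sum_j exp(x'_j - m')
// where x'_k is the raw logit after the repetition penalty (x_k / penalty^count
// when k was the head's last sampled token) and temperature scaling, and m' is
// the same transform applied to the head's maximum. Both scalings are
// order-preserving for temperature > 0, so m' stays the maximum of the x'.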
void sampler::softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices) {
    bool use_nucleus_sampling = picks.size() > 0;
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    bool has_temperature = temperature != 1.0f;
    for (int i = 0; i < n_output_heads; i++) {
        float cumsum = 0.0f;
        float max_val = logits[i*vocab_size + max_indices[i]];
        if (has_repetition_penalty && last_token_ids[i] == max_indices[i]) {
            max_val /= (pow(repetition_penalty, repetition_counts[i]));
        }
        if (has_temperature) {
            max_val /= temperature;
        }
        // first pass: exponentiate the shifted logits and accumulate the normalizer.
        size_t candidates = use_nucleus_sampling ? picks[i].size() : (size_t) vocab_size;
        for (size_t j = 0; j < candidates; j++) {
            int ii = use_nucleus_sampling ? (int) picks[i][j] : (int) j;
            int index = i * vocab_size + ii;
            float v = *(logits + index);
            if (has_repetition_penalty && last_token_ids[i] == ii) {
                v /= (pow(repetition_penalty, repetition_counts[i]));
            }
            if (has_temperature) {
                v /= temperature;
            }
            v = expf(v - max_val);
            cumsum += v;
            logits[index] = v;
        }
        // second pass: normalize in place so the candidates sum to 1.
        for (size_t j = 0; j < candidates; j++) {
            int ii = use_nucleus_sampling ? (int) picks[i][j] : (int) j;
            int index = i * vocab_size + ii;
            logits[index] = *(logits + index) / cumsum;
        }
    }
}
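// Top-p (nucleus) trimming: keep the smallest prefix of probability-sorted
// candidates whose cumulative mass reaches top_p. For example, with sorted
// probabilities {0.5, 0.3, 0.2} and top_p = 0.7 the nucleus is the first two
// candidates and max_head_probs records min(0.8, 0.7) = 0.7.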
void sampler::topp(float * logits, std::vector<std::vector<size_t>> & picks, std::vector<float> & max_head_probs) {
    if (picks.empty()) {
        // top-k didn't run, so we still need the softmaxed logits in sorted order.
        for (int i = 0; i < n_output_heads; i++) {
            std::vector<size_t> head_picks(vocab_size);
            std::iota(head_picks.begin(), head_picks.end(), 0);
            // the logits were already softmaxed with the repetition penalty applied,
            // so sorting the raw values cannot inadvertently trim our nucleus.
            std::sort(head_picks.begin(), head_picks.end(), [logits, i, this](size_t s1, size_t s2) {
                float v1 = logits[i*vocab_size+s1];
                float v2 = logits[i*vocab_size+s2];
                return v1 > v2;
            });

            picks.push_back(head_picks);
        }
    }
    // if we didn't already perform top-k, or if the probability mass of the top-k
    // nucleus is greater than top_p, then we need to trim.
    for (int i = 0; i < n_output_heads; i++) {
        float prob_sum = 0.0f;
        int trim_to = -1;
        for (size_t ii = 0; ii < picks[i].size(); ii++) {
            prob_sum += logits[i*vocab_size+picks[i][ii]];
            if (prob_sum >= top_p) {
                trim_to = (int) ii + 1;
                break;
            }
        }
        max_head_probs.push_back(std::min(prob_sum, top_p));
        if (trim_to > 0) {
            picks[i] = std::vector<size_t>(picks[i].begin(), picks[i].begin()+trim_to);
        }
    }
}
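// Top-k selection: sort each head's token indices by (possibly penalized) logit
// and keep the first top_k. When the logits have not been softmaxed yet, the
// repetition penalty must be applied inside the comparator; otherwise the
// penalized token could rank by its raw logit and displace a token that belongs
// in the nucleus.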
std::vector<std::vector<size_t>> sampler::topk(float * logits, bool performed_softmax) {
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    std::vector<std::vector<size_t>> head_picks;
    if (vocab_size < top_k) {
        // technically we should never get here, but let's be protective.
        for (int i = 0; i < n_output_heads; i++) {
            std::vector<size_t> picks(vocab_size);
            std::iota(picks.begin(), picks.end(), 0);
            head_picks.push_back(picks);
        }
        return head_picks;
    }
    for (int i = 0; i < n_output_heads; i++) {
        std::vector<size_t> picks(vocab_size);
        std::iota(picks.begin(), picks.end(), 0);
        // have to sort with the repetition penalty applied so as not to
        // inadvertently trim our nucleus.
        std::sort(picks.begin(), picks.end(), [logits, i, has_repetition_penalty, performed_softmax, this](size_t s1, size_t s2) {
            float v1 = logits[i*vocab_size+s1];
            float v2 = logits[i*vocab_size+s2];
            if (!performed_softmax) {
                if (has_repetition_penalty && last_token_ids[i] == s1) {
                    v1 /= (pow(repetition_penalty, repetition_counts[i]));
                } else if (has_repetition_penalty && last_token_ids[i] == s2) {
                    v2 /= (pow(repetition_penalty, repetition_counts[i]));
                }
            }
            return v1 > v2;
        });
        head_picks.push_back(std::vector<size_t>(picks.begin(), picks.begin() + top_k));
    }
    return head_picks;
}
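// Greedy per-head argmax. This doubles as the max-finder for softmax
// stabilization: sample() calls it with max_vals to record each head's maximum
// index before softmax runs.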
void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    for (int i = 0; i < n_output_heads; i++) {
        float max_val = -INFINITY;
        uint32_t token_id = 0;
        for (uint32_t ii = 0; ii < vocab_size; ii++) {
            float v = *(logits+i*vocab_size+ii);
            // greedy decoding itself never needs the repetition penalty, but this
            // function also finds the per-head maximum used to stabilize softmax,
            // and in that case the repetition counts may already be set.
            if (has_repetition_penalty && last_token_ids[i] == ii) {
                v /= (pow(repetition_penalty, repetition_counts[i]));
            }
            if (v > max_val) {
                max_val = v;
                token_id = ii;
            }
        }
        output_tokens.push_back(token_id);
    }
}