Merge branch 'master' into concedo

# Conflicts: # .github/workflows/build.yml # CMakeLists.txt # Makefile # README.md # main.cpp
2026-05-08 18:30:50 +00:00 · 2023-03-22 22:31:45 +08:00 · 2023-03-22 22:31:45 +08:00 · 86c7457e24
commit 86c7457e24
parent 5c475503ce ae44e23ee3
25 changed files with 3028 additions and 1944 deletions
--- a/expose.cpp
+++ b/expose.cpp
@ -39,31 +39,42 @@ extern "C" {
        char text[16384]; //16kb should be enough for any response
    };

-    gpt_params api_params;
-    gpt_vocab api_vocab;
-    llama_model api_model;    
-    int api_n_past = 0;
-    gpt_vocab::id old_embd_id = -1;
-    std::vector<float> api_logits;
-    std::vector<gpt_vocab::id> last_n_tokens;
-    size_t mem_per_token = 0;
    bool legacy_format = false;
+    llama_context_params ctx_params;
+    gpt_params params;
+    int n_past = 0;
+    llama_token old_embd_id = -1;
+    int n_threads = 4;
+    int n_batch = 8;
+    std::string model;
+    llama_context * ctx;
+    std::vector<llama_token> last_n_tokens;

    bool load_model(const load_model_inputs inputs)
    {
-        api_params.n_threads = inputs.threads;
-        api_params.n_ctx = inputs.max_context_length;
-        api_params.n_batch = inputs.batch_size;
-        api_params.model = inputs.model_filename;
+        ctx_params = llama_context_default_params();

-        int n_parts_overwrite =  inputs.n_parts_overwrite;
+        n_threads = inputs.threads;       
+        n_batch = inputs.batch_size;
+        model = inputs.model_filename;        

-        int loadresult = llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite);
-        if (!loadresult) {  
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
+        ctx_params.n_ctx      = inputs.max_context_length;
+        ctx_params.n_parts    = inputs.n_parts_overwrite;
+        ctx_params.seed       = -1;
+        ctx_params.f16_kv     = true;
+        ctx_params.logits_all = false;
+
+        ctx = llama_init_from_file(model.c_str(), ctx_params);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
            return false;
        }
-        legacy_format = (loadresult==2?true:false);
+
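+        //return val: 0=fail, 1=newformat, 2=legacy (NOTE(review): the assignment below sets legacy_format when the value is 1, which contradicts this mapping - confirm which one is correct)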
+        int fileformat = check_file_format(model.c_str());        
+        
+        legacy_format = (fileformat==1?true:false);
        if(legacy_format)
        {
            printf("\n---\nWarning: Your model is using an OUTDATED format. Please reconvert it for better results!\n");
@ -74,69 +85,76 @@ extern "C" {

    generation_outputs generate(const generation_inputs inputs, generation_outputs & output)
    {
-        api_params.prompt = inputs.prompt;
-        api_params.seed = inputs.seed;
-        api_params.n_predict = inputs.max_length;
-        api_params.top_k = inputs.top_k;
-        api_params.top_p = inputs.top_p;
-        api_params.temp = inputs.temperature;
-        api_params.repeat_last_n = inputs.rep_pen_range;
-        api_params.repeat_penalty = inputs.rep_pen;
-        api_params.n_ctx = inputs.max_context_length;
+        params.prompt = inputs.prompt;
+        params.seed = inputs.seed;
+        params.n_predict = inputs.max_length;
+        params.top_k = inputs.top_k;
+        params.top_p = inputs.top_p;
+        params.temp = inputs.temperature;
+        params.repeat_last_n = inputs.rep_pen_range;
+        params.repeat_penalty = inputs.rep_pen;
+        params.n_ctx = inputs.max_context_length;
+        params.n_batch = n_batch;
+        params.n_threads = n_threads;

        bool reset_state = inputs.reset_state;
-        if(api_n_past==0)
+        if(n_past==0)
        {
            reset_state = true;
        }
      
-        if(api_params.repeat_last_n<1)
+        if(params.repeat_last_n<1)
        {
-            api_params.repeat_last_n = 1;
+            params.repeat_last_n = 1;
        }
-        if(api_params.top_k<1)
+        if(params.top_k<1)
        {
-            api_params.top_k = 300; //to disable top_k we actually need to increase this value to a very high number
+            params.top_k = 300; //to disable top_k we actually need to increase this value to a very high number
        }
-        if (api_params.seed < 0)
+        if (params.seed <= 0)
        {
-            api_params.seed = time(NULL);
+            params.seed = time(NULL);
        }

+		if(reset_state)
+		{
+			params.prompt.insert(0, 1, ' ');
+		}
+
+	    // tokenize the prompt
+ 		std::vector<llama_token> embd_inp;
+		if(legacy_format)
+        {
+            embd_inp = ::legacy_llama_tokenize(ctx, params.prompt, true);
+        }else{
+            embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        }
+
+ 		//params.n_predict = std::min(params.n_predict, params.n_ctx - (int) embd_inp.size());
+        //drop tokens from the front of the prompt if it's too long
+        if (embd_inp.size() + params.n_predict > params.n_ctx) {
+            int offset = embd_inp.size() - params.n_ctx + params.n_predict;
+            embd_inp = std::vector<llama_token>(embd_inp.begin() + offset, embd_inp.end());
+        }	   
+   		std::vector<llama_token> embd;
+
+		int last_n_size = params.repeat_last_n;
+    	last_n_tokens.resize(last_n_size);
+
        //display usage
        // std::string tst = " ";
        // char * tst2 = (char*)tst.c_str();
-        // gpt_print_usage(1,&tst2,api_params);
-        
-        if(reset_state)
+        // gpt_print_usage(1,&tst2,params);
+
+		if(reset_state)
        {
-            api_params.prompt.insert(0, 1, ' ');
-        }
-        // tokenize the prompt
-        std::vector<gpt_vocab::id> embd_inp;
-        if(legacy_format)
-        {
-            embd_inp = ::legacy_llama_tokenize(api_vocab, api_params.prompt, true);
-        }else{
-            embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
-        }
-         
-        //api_params.n_predict = std::min(api_params.n_predict, api_model.hparams.n_ctx - (int)embd_inp.size());
-        //truncate to front of the prompt if its too long
-        if (embd_inp.size() + api_params.n_predict > api_model.hparams.n_ctx) {
-            int offset = embd_inp.size() - api_model.hparams.n_ctx + api_params.n_predict;
-            embd_inp = std::vector<gpt_vocab::id>(embd_inp.begin() + offset, embd_inp.end());
-        }
-        std::vector<gpt_vocab::id> embd;
-        
-        int last_n_size = api_params.repeat_last_n;
-        last_n_tokens.resize(last_n_size);
-        if(reset_state)
-        {
-            llama_eval(api_model, api_params.n_threads, 0, {0, 1, 2, 3}, api_logits, mem_per_token);
+			const std::vector<llama_token> tmp = { 0, 1, 2, 3 };
+	        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);            
            std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-            api_n_past = 0;
-        }else{
+            n_past = 0;
+        }
+        else
+        {
            //strip out the reset token (1) at the start of the embedding
            if(embd_inp.size()>0)
            {
@ -147,96 +165,97 @@ extern "C" {
                embd.push_back(old_embd_id);
            }
        }
-        
-        int remaining_tokens = api_params.n_predict;
-        int input_consumed = 0;
-        std::mt19937 api_rng(api_params.seed);
-        std::string concat_output = "";        
-       
-        bool startedsampling = false;
+		
+ 		int remaining_tokens = params.n_predict;
+		int input_consumed = 0;
+    	std::mt19937 rng(params.seed);   
+		std::string concat_output = "";  
+    	
+		bool startedsampling = false;
        printf("\nProcessing Prompt: ");
-        while (remaining_tokens > 0)
-        {
-            gpt_vocab::id id = 0;
-            // predict
-            if (embd.size() > 0)
-            {
+
+		while (remaining_tokens > 0) 
+		{
+			llama_token id = 0;
+	        // predict
+	        if (embd.size() > 0) 
+			{
+				printf("|");
                // for (auto i: embd) {                    
                //     std::cout << i << ',';
                // }
-                //printf("\nnp:%d embd:%d mem:%d",api_n_past,embd.size(),mem_per_token);
-                printf("|");
-                if (!llama_eval(api_model, api_params.n_threads, api_n_past, embd, api_logits, mem_per_token))
-                {
-                    fprintf(stderr, "Failed to predict\n");
+                // printf("\nnp:%d embd:%d",n_past,embd.size());
+	            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) 
+				{
+	                fprintf(stderr, "Failed to predict\n");
                    snprintf(output.text, sizeof(output.text), "%s", "");
                    output.status = 0;
                    return output;
-                }
-            }
+	            }
+	        }

-            api_n_past += embd.size();
-            embd.clear();            
-            if (embd_inp.size() <= input_consumed)
-            {
-                // out of user input, sample next token
-                const float top_k = api_params.top_k;
-                const float top_p = api_params.top_p;
-                const float temp = api_params.temp;
-                const float repeat_penalty = api_params.repeat_penalty;
-                const int n_vocab = api_model.hparams.n_vocab;
-                
-                if(!startedsampling)
+        	n_past += embd.size();
+       		embd.clear();
+        	if ((int) embd_inp.size() <= input_consumed) 
+			{
+	            // out of user input, sample next token
+	            const float top_k          = params.top_k;
+	            const float top_p          = params.top_p;
+	            const float temp           = params.temp;
+	            const float repeat_penalty = params.repeat_penalty;
+
+            	if(!startedsampling)
                {
                    startedsampling = true;
                    printf("\nGenerating: ");
                }

-                {
-                    // set the logit of the eos token (2) to zero to avoid sampling it
-                    api_logits[api_logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
-                    //set logits of opening square bracket to zero.
-                    api_logits[api_logits.size() - n_vocab + 518] = 0;
-                    api_logits[api_logits.size() - n_vocab + 29961] = 0;
+	            {
+	                auto logits = llama_get_logits(ctx);
+					// set the logit of the eos token (2) to zero to avoid sampling it
+	                logits[llama_token_eos()] = 0;
+					//set logits of opening square bracket to zero.
+					logits[518] = 0;
+					logits[29961] = 0;
+	
+	                id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
+	
+	                last_n_tokens.erase(last_n_tokens.begin());
+	                last_n_tokens.push_back(id);
+	            }
+
+	            // add it to the context
+				old_embd_id = id;
+	            embd.push_back(id);


-                    id = llama_sample_top_p_top_k(api_vocab, api_logits.data() + (api_logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, api_rng);
+	            // decrement remaining sampling budget
+	            --remaining_tokens;
+                //printf("\nid:%d word:%s\n",id,llama_token_to_str(ctx, id));
+				concat_output += llama_token_to_str(ctx, id);
+        	} 
+			else 
+			{
+	            // some user input remains from prompt or interaction, forward it to processing
+	            while ((int) embd_inp.size() > input_consumed) 
+				{
+					old_embd_id = embd_inp[input_consumed];
+	                embd.push_back(embd_inp[input_consumed]);
+	                last_n_tokens.erase(last_n_tokens.begin());
+	                last_n_tokens.push_back(embd_inp[input_consumed]);
+	                ++input_consumed;
+	                if ((int) embd.size() >= params.n_batch) 
+					{
+	                    break;
+	                }
+            	}
+        	}

-                    last_n_tokens.erase(last_n_tokens.begin());
-                    last_n_tokens.push_back(id);
-                }
-
-                // add it to the context
-                old_embd_id = id;
-                embd.push_back(id);
-
-                // decrement remaining sampling budget
-                --remaining_tokens;
-                //printf("\nid:%d word:%s\n",id,api_vocab.id_to_token[id].c_str());
-                concat_output += api_vocab.id_to_token[id].c_str();
-            }
-            else
-            {
-                // some user input remains from prompt or interaction, forward it to processing
-                while (embd_inp.size() > input_consumed)
-                {
-                    old_embd_id = embd_inp[input_consumed];
-                    embd.push_back(embd_inp[input_consumed]);
-                    last_n_tokens.erase(last_n_tokens.begin());
-                    last_n_tokens.push_back(embd_inp[input_consumed]);
-                    ++input_consumed;
-                    if (embd.size() > api_params.n_batch)
-                    {
-                        break;
-                    }
-                }
-            }
-            
-        }
-
-        //printf("output: %s",concat_output.c_str());
-        output.status = 1;
+		}
+       		
+		output.status = 1;
        snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
        return output;
+
    }
 }