kvcache-ai-ktransformers/ktransformers/server/backend/interfaces/exllamav2.py
import os
import sys
from typing import AsyncIterator, Optional

import torch

from ktransformers.server.schemas.assistants.runs import RunObject

from ..args import ConfigArgs, default_args
from ..base import BackendInterfaceBase, ThreadContext

class ExllamaThreadContext(ThreadContext):
    """Thread context for the ExLlamaV2 backend (stub)."""

    def __init__(self, run: RunObject, args: ConfigArgs = default_args) -> None:
        super().__init__(run, args)

    def get_interface(self):
        # Stub: no backend interface is bound yet.
        return

    def get_local_messages(self):
        raise NotImplementedError


class ExllamaInterface(BackendInterfaceBase):
    """Backend interface for ExLlamaV2; all methods are unimplemented placeholders."""

    def __init__(self, args: ConfigArgs = ...):
        raise NotImplementedError

    def tokenize_prompt(self, prompt: str) -> torch.Tensor:
        raise NotImplementedError

    async def inference(self, local_messages, request_unique_id: Optional[str]) -> AsyncIterator:
        raise NotImplementedError
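

# --- Usage sketch (not part of the original file) ---------------------------
# A minimal illustration of how a concrete subclass of ExllamaInterface would
# be consumed once `inference` is implemented. `DummyExllamaInterface`, the
# hard-coded token stream, and the `_demo` coroutine below are hypothetical
# stand-ins; a real backend would yield generated text chunks from an
# ExLlamaV2 model instead.

import asyncio


class DummyExllamaInterface(ExllamaInterface):
    def __init__(self, args: ConfigArgs = default_args):
        # Skip the parent stub constructor, which raises NotImplementedError.
        self.args = args

    def tokenize_prompt(self, prompt: str) -> torch.Tensor:
        # Hypothetical tokenization: one integer id per whitespace-separated word.
        return torch.arange(len(prompt.split()), dtype=torch.long)

    async def inference(self, local_messages, request_unique_id: Optional[str]) -> AsyncIterator:
        # Stream a few placeholder chunks instead of real model output.
        for chunk in ("Hello", ", ", "world"):
            yield chunk


async def _demo() -> None:
    interface = DummyExllamaInterface()
    async for chunk in interface.inference([{"role": "user", "content": "hi"}], "req-1"):
        print(chunk, end="")


if __name__ == "__main__":
    asyncio.run(_demo())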