Custom Models

This guide shows how to extend local_llm_kit to support custom models.

Adding Support for Custom Models

local_llm_kit is designed to be extensible, allowing you to add support for custom models and backends.

Custom Tokenizer

You can implement a custom tokenizer by subclassing BaseTokenizer and implementing its encode, decode, and get_vocab_size methods:

from local_llm_kit import LLMClient
from local_llm_kit.backends.base import BaseTokenizer

class MyCustomTokenizer(BaseTokenizer):
    """Skeleton tokenizer: fill in the method bodies for your own model."""

    def __init__(self, model_path=None):
        """Set up the tokenizer.

        Initialize your tokenizer here; this could use a pretrained
        tokenizer or your own implementation. ``model_path`` is kept on the
        instance for that purpose.
        """
        super().__init__()
        self.model_path = model_path

    def encode(self, text):
        """Convert ``text`` to token IDs and return them as a list."""
        ...

    def decode(self, token_ids):
        """Convert ``token_ids`` back to text and return it as a string."""
        ...

    def get_vocab_size(self):
        """Return the vocabulary size of your tokenizer."""
        ...

# Use your custom tokenizer: pass an instance through the `tokenizer`
# argument. The tokenizer and the model weights are given separate paths.
client = LLMClient(
    model="custom-model",
    tokenizer=MyCustomTokenizer(model_path="/path/to/tokenizer"),
    model_path="/path/to/model/weights"
)

Custom Backend

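For more advanced customization, you can implement a custom backend by subclassing BaseBackend, then register it under a name and select it by that name when creating the client: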

from local_llm_kit.backends.base import BaseBackend

class MyCustomBackend(BaseBackend):
    """Skeleton backend: fill in the method bodies for your own model."""

    def __init__(self, model_path, **kwargs):
        """Initialize your model.

        Load your model or set up your inference engine here.
        ``model_path`` is kept on the instance for that purpose.
        """
        super().__init__()
        self.model_path = model_path

    def generate(self, prompt, max_tokens=100, temperature=0.7, **kwargs):
        """Implement the generation logic for your model.

        Return a string containing the generated text.
        """
        ...

    def get_prompt_tokens(self, prompt):
        """Return the number of tokens in ``prompt``."""
        ...

    def get_completion_tokens(self, completion):
        """Return the number of tokens in ``completion``."""
        ...

# Register your custom backend under a name of your choice.
# LLMClient is imported here as well so this snippet runs on its own.
from local_llm_kit import LLMClient
from local_llm_kit.llm import LLM

LLM.register_backend("my-custom-backend", MyCustomBackend)

# Use your custom backend by passing the registered name as `backend`
client = LLMClient(
    model="custom-model",
    backend="my-custom-backend",
    model_path="/path/to/model/weights"
)

Custom Prompt Formatting

You can also register a custom prompt formatter: a function that turns a list of chat messages into the prompt string your model expects.

from local_llm_kit.prompt_formatting import register_prompt_formatter

def my_custom_formatter(messages, add_generation_prompt=True):
    """
    Render chat messages as one prompt string for a custom model.

    Each message must provide "role" and "content". Messages whose role is
    "system", "user", "assistant" or "function" are emitted as a role tag
    line followed by the content line; messages with any other role are
    skipped. When ``add_generation_prompt`` is true, a trailing assistant
    tag is appended so the model continues with its reply.
    """
    role_headers = {
        "system": "<|system|>\n",
        "user": "<|user|>\n",
        "assistant": "<|assistant|>\n",
        "function": "<|function|>\n",
    }

    chunks = []
    for entry in messages:
        # Both keys are read before the role is checked, so a message
        # missing either one raises KeyError regardless of its role.
        speaker, text = entry["role"], entry["content"]
        header = role_headers.get(speaker)
        if header is not None:
            chunks.append(f"{header}{text}\n")

    if add_generation_prompt:
        chunks.append("<|assistant|>\n")

    return "".join(chunks)

# Register your custom formatter
register_prompt_formatter("my-custom-model", my_custom_formatter)

# Use your custom formatter: the model name passed here is the same name
# the formatter was registered under.
# LLMClient is imported here as well so this snippet runs on its own.
from local_llm_kit import LLMClient

client = LLMClient(
    model="my-custom-model",
    # Other parameters...
)

Example: Integrating with vLLM

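Here’s an example of integrating with the vLLM inference engine. vLLM is imported inside the backend's methods, so it only needs to be installed if you use this backend: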

from local_llm_kit.backends.base import BaseBackend

class VLLMBackend(BaseBackend):
    """Backend that delegates text generation to a vLLM engine."""

    def __init__(self, model_path, **kwargs):
        """Create the vLLM engine for ``model_path``.

        Keyword arguments read here: ``tensor_parallel_size`` (default 1)
        and ``gpu_memory_utilization`` (default 0.9). Any other keyword
        arguments are not used by this constructor.
        """
        super().__init__()

        # Import vLLM here to avoid making it a hard dependency
        from vllm import LLM

        # Initialize vLLM engine
        self.engine = LLM(
            model=model_path,
            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
            # Other vLLM parameters...
        )

    def generate(self, prompt, max_tokens=100, temperature=0.7, **kwargs):
        """Generate a completion for ``prompt`` and return it as a string.

        ``top_p`` may be passed as a keyword argument (default 1.0).
        """
        from vllm import SamplingParams

        # Set up sampling parameters
        sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=kwargs.get("top_p", 1.0),
            # Other sampling parameters...
        )

        # Generate text with vLLM
        outputs = self.engine.generate(prompt, sampling_params)

        # Extract generated text: first completion of the first request
        generated_text = outputs[0].outputs[0].text

        return generated_text

    def get_prompt_tokens(self, prompt):
        """Return the number of tokens in the prompt.

        Counted with the engine's own tokenizer using its default settings.
        """
        return len(self.engine.get_tokenizer().encode(prompt))

    def get_completion_tokens(self, completion):
        """Return the number of tokens in the completion.

        Special tokens are not added, so the count covers only the
        generated text itself.
        """
        tokenizer = self.engine.get_tokenizer()
        return len(tokenizer.encode(completion, add_special_tokens=False))

# Register vLLM backend.
# LLMClient is imported here as well so this snippet runs on its own.
from local_llm_kit import LLMClient
from local_llm_kit.llm import LLM

LLM.register_backend("vllm", VLLMBackend)

# Use vLLM backend
client = LLMClient(
    model="llama2",
    backend="vllm",
    model_path="meta-llama/Llama-2-70b-chat-hf",
    tensor_parallel_size=4  # For multi-GPU inference
)