GGUF version of model

#4
by sparkhonyuk - opened

Hi,
Are you planning to release a GGUF version of the model (for use with Ollama, for example)?

(.venv) python3 ~/Sources/llama.cpp/convert_hf_to_gguf.py ./Giga-Embeddings-instruct --outtype bf16 
INFO:hf-to-gguf:Loading model: Giga-Embeddings-instruct
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
Traceback (most recent call last):
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 5758, in <module>
    main()
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 5752, in main
    model_instance.write()
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 400, in write
    self.prepare_tensors()
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 1846, in prepare_tensors
    super().prepare_tensors()
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 276, in prepare_tensors
    for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 1814, in modify_tensors
    return [(self.map_tensor_name(name), data_torch)]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/danilabagroff/Sources/llama.cpp/convert_hf_to_gguf.py", line 235, in map_tensor_name
    raise ValueError(f"Can not map tensor {name!r}")
ValueError: Can not map tensor 'latent_attention_model.cross_attend_blocks.0.fn.to_kv.weight'

Seconded. Without GGUF (or llama.cpp compatibility) it is very hard to find a practical use for this model in any application.

Same here. The standard llama.cpp-based converter does not recognize this architecture. A request to the Sber engineers: please look into how it can be converted to GGUF.

Hi everyone :) Here is how to deploy it yourself! We use GPUStack.
------- custom backend 😀 ------
backend_name: giga-embeddings-sbert-custom
docker_image_url: giga-embeddings-sbert-backend:latest
health_check_path: /health

version_configs:
  Default:
    image_name: giga-embeddings-sbert-backend:latest
    custom_framework: cuda
    run_command: '-m {{model_path}} --host 0.0.0.0 --port {{port}} --alias {{model_name}} --batch-size 32 --max-seq-length 4096'
    environment_variables:
      CUDA_VISIBLE_DEVICES: "0"
      PYTHONUNBUFFERED: "1"

Build the backend on the node 😀: docker build -t giga-embeddings-sbert-backend:latest .
---- dockerfile -----
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime

WORKDIR /app

RUN apt-get update && apt-get install -y python3-pip && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir \
    torch==2.1.0 \
    transformers==4.48.2 \
    sentence-transformers==5.1.1 \
    accelerate==0.32.1 \
    einops==0.7.0 \
    flask==3.0.0 \
    flask-cors==4.0.0

COPY app.py /app/app.py

EXPOSE 8000

ENTRYPOINT ["python3", "/app/app.py"]
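For a quick smoke test outside GPUStack, the freshly built image can also be started by hand; the flags mirror the run_command above. The host model path and port here are only assumptions for illustration, adjust them to your node:

docker run --rm --gpus all -p 8000:8000 \
  -v /var/lib/gpustack/models/giga-embeddings-instruct:/models/giga-embeddings-instruct \
  giga-embeddings-sbert-backend:latest \
  -m /models/giga-embeddings-instruct --host 0.0.0.0 --port 8000 \
  --alias Giga-Embeddings-instruct --batch-size 32 --max-seq-length 4096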

--- app.py ---- 😀
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
import torch
import logging
import os
import argparse

# ----------------------------
# Logging
# ----------------------------

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ----------------------------
# Flask app
# ----------------------------

app = Flask(__name__)
CORS(app)

# ----------------------------
# Global model
# ----------------------------

model = None

# ----------------------------
# App config
# ----------------------------

APP_CONFIG = {
    "model_path": None,
    "host": "0.0.0.0",
    "port": 8000,
    "model_name": "Giga-Embeddings-instruct",
    "batch_size": 32,
    "max_seq_length": 4096,
}

# ----------------------------
# CLI args
# ----------------------------

def parse_args():
    parser = argparse.ArgumentParser(
        description="Giga-Embeddings-instruct OpenAI-compatible embedding server"
    )

    parser.add_argument(
        "-m",
        "--model",
        "--model-path",
        dest="model_path",
        help="Path to model directory or HF model id",
    )
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--alias",
        "--model-name",
        dest="model_name",
        default="Giga-Embeddings-instruct",
    )
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--max-seq-length", type=int, default=4096)

    return parser.parse_args()

def init_config():
    args = parse_args()

    APP_CONFIG["model_path"] = args.model_path
    APP_CONFIG["host"] = args.host
    APP_CONFIG["port"] = args.port
    APP_CONFIG["model_name"] = args.model_name
    APP_CONFIG["batch_size"] = args.batch_size
    APP_CONFIG["max_seq_length"] = args.max_seq_length

    # Env override (GPUStack)
    APP_CONFIG["model_path"] = os.getenv("MODEL_PATH", APP_CONFIG["model_path"])
    APP_CONFIG["host"] = os.getenv("HOST", APP_CONFIG["host"])
    APP_CONFIG["port"] = int(os.getenv("PORT", APP_CONFIG["port"]))
    APP_CONFIG["model_name"] = os.getenv("MODEL_NAME", APP_CONFIG["model_name"])
    APP_CONFIG["batch_size"] = int(os.getenv("BATCH_SIZE", APP_CONFIG["batch_size"]))
    APP_CONFIG["max_seq_length"] = int(
        os.getenv("MAX_SEQ_LENGTH", APP_CONFIG["max_seq_length"])
    )

    logger.info(f"APP CONFIG: {APP_CONFIG}")

# ----------------------------
# Model loading
# ----------------------------

def resolve_model_path() -> str:
    if APP_CONFIG["model_path"]:
        return APP_CONFIG["model_path"]

    candidates = [
        "/var/lib/gpustack/cache/huggingface/ai-sage/Giga-Embeddings-instruct",
        "/var/lib/gpustack/models/giga-embeddings-instruct",
        "/models/giga-embeddings-instruct",
    ]

    for p in candidates:
        if os.path.exists(p):
            logger.info(f"Found local model at {p}")
            return p

    logger.info("Using HuggingFace model id")
    return "ai-sage/Giga-Embeddings-instruct"

def load_model():
    global model

    if model is not None:
        return model

    logger.info(f"Loading model {APP_CONFIG['model_name']}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = resolve_model_path()

    # trust_remote_code is required: the model ships a custom latent-attention pooling head
    model = SentenceTransformer(
        model_path,
        device=device,
        trust_remote_code=True,
    )

    model.max_seq_length = APP_CONFIG["max_seq_length"]

    logger.info(f"Model loaded on device: {model.device}")
    logger.info(
        f"Embedding dimension: {model.get_sentence_embedding_dimension()}"
    )
    logger.info(f"Max seq length: {model.max_seq_length}")

    return model

# ----------------------------
# Routes
# ----------------------------

@app.route("/v1/embeddings", methods=["POST"])
def embeddings():
    data = request.get_json(force=True)
    inputs = data.get("input")

    if isinstance(inputs, str):
        inputs = [inputs]

    if not inputs:
        return jsonify({"error": "input is required"}), 400

    model = load_model()

    with torch.inference_mode():
        vectors = model.encode(
            inputs,
            batch_size=APP_CONFIG["batch_size"],
            convert_to_numpy=True,
            show_progress_bar=False,
        )

    # Token usage is approximated by whitespace word count
    return jsonify(
        {
            "object": "list",
            "data": [
                {
                    "object": "embedding",
                    "index": i,
                    "embedding": vec.tolist(),
                }
                for i, vec in enumerate(vectors)
            ],
            "model": APP_CONFIG["model_name"],
            "usage": {
                "prompt_tokens": sum(len(x.split()) for x in inputs),
                "total_tokens": sum(len(x.split()) for x in inputs),
            },
        }
    )

@app .route("/v1/models", methods=["GET"])
def models():
model = load_model()
return jsonify(
{
"object": "list",
"data": [
{
"id": APP_CONFIG["model_name"],
"object": "model",
"owned_by": "ai-sage",
"embedding_dim": model.get_sentence_embedding_dimension(),
"context_length": APP_CONFIG["max_seq_length"],
}
],
}
)

@app .route("/health", methods=["GET"])
def health():
try:
model = load_model()
return jsonify(
{
"status": "healthy",
"model": APP_CONFIG["model_name"],
"device": str(model.device),
"cuda_available": torch.cuda.is_available(),
}
)
except Exception as e:
logger.exception("Health check failed")
return jsonify({"status": "unhealthy", "error": str(e)}), 500

@app .route("/info", methods=["GET"])
def info():
model = load_model()
return jsonify(
{
"model": APP_CONFIG["model_name"],
"embedding_dimension": model.get_sentence_embedding_dimension(),
"max_seq_length": APP_CONFIG["max_seq_length"],
"pooling_method": "Latent-Attention",
"device": str(model.device),
"cuda_available": torch.cuda.is_available(),
"torch_version": torch.version,
}
)

# ----------------------------
# Main
# ----------------------------

if __name__ == "__main__":
    init_config()

    logger.info("Starting Giga-Embeddings-instruct server")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Listening on {APP_CONFIG['host']}:{APP_CONFIG['port']}")

    app.run(
        host=APP_CONFIG["host"],
        port=APP_CONFIG["port"],
        debug=False,
        threaded=True,
    )
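Once the server is up, the routes defined above can be checked with plain curl. localhost:8000 is just the default port from APP_CONFIG, adjust it to your deployment:

curl -s http://localhost:8000/health

curl -s http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "Giga-Embeddings-instruct", "input": ["hello world", "query: what is GGUF?"]}'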
