jellybit/internal/llm/openai.go

package llm

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/url"
	"strings"
	"time"
)

const (
	defaultTimeout  = 120 * time.Second
	maxAttempts     = 3 // первая попытка + 2 ретрая на транзиентных сбоях
	baseRetryWait   = 500 * time.Millisecond
	maxResponseBody = 8 << 20 // 8 MiB — потолок на тело ответа
)

// openAICompat — клиент OpenAI-совместимого Chat Completions API.
type openAICompat struct {
	endpoint  string // полный URL .../chat/completions
	hc        *http.Client
	apiKey    string
	model     string
	retryWait time.Duration // базовая пауза между ретраями (0 в тестах)
	log       *slog.Logger
}

// newOpenAICompat собирает клиент из конфига.
func newOpenAICompat(cfg Config, logger *slog.Logger) (*openAICompat, error) {
	if cfg.BaseURL == "" {
		return nil, fmt.Errorf("llm: empty base_url")
	}
	if cfg.Model == "" {
		return nil, fmt.Errorf("llm: empty model")
	}

	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = defaultTimeout
	}

	transport := http.DefaultTransport
	if cfg.Proxy != "" {
		proxyURL, err := url.Parse(cfg.Proxy)
		if err != nil {
			return nil, fmt.Errorf("llm: parse proxy %q: %w", cfg.Proxy, err)
		}
		transport = &http.Transport{Proxy: http.ProxyURL(proxyURL)}
	}

	if logger == nil {
		logger = slog.Default()
	}
	return &openAICompat{
		endpoint:  strings.TrimRight(cfg.BaseURL, "/") + "/chat/completions",
		hc:        &http.Client{Timeout: timeout, Transport: transport},
		apiKey:    cfg.APIKey,
		model:     cfg.Model,
		retryWait: baseRetryWait,
		log:       logger,
	}, nil
}

// chatRequest — тело запроса Chat Completions.
type chatRequest struct {
	Model          string        `json:"model"`
	Messages       []chatMessage `json:"messages"`
	Temperature    *float64      `json:"temperature,omitempty"`
	MaxTokens      int           `json:"max_tokens,omitempty"`
	ResponseFormat *respFormat   `json:"response_format,omitempty"`
}

type chatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type respFormat struct {
	Type string `json:"type"`
}

// chatResponse — нужное подмножество ответа Chat Completions.
type chatResponse struct {
	Model   string `json:"model"`
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
		FinishReason string `json:"finish_reason"`
	} `json:"choices"`
	Usage struct {
		PromptTokens     int     `json:"prompt_tokens"`
		CompletionTokens int     `json:"completion_tokens"`
		TotalTokens      int     `json:"total_tokens"`
		Cost             float64 `json:"cost"`
	} `json:"usage"`
	// Некоторые шлюзы кладут ошибку в тело при HTTP 200.
	Error *struct {
		Message string `json:"message"`
	} `json:"error"`
}

// Complete выполняет один вызов модели с транзиентными ретраями на сетевых
// сбоях, 429 и 5xx. 4xx (кроме 429) — постоянная ошибка, без ретраев.
func (c *openAICompat) Complete(ctx context.Context, req Request) (Response, error) {
	if len(req.Messages) == 0 {
		return Response{}, fmt.Errorf("llm: empty messages")
	}

	body, err := json.Marshal(c.buildRequest(req))
	if err != nil {
		return Response{}, fmt.Errorf("llm: marshal request: %w", err)
	}

	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if attempt > 1 {
			if err := c.wait(ctx, attempt); err != nil {
				return Response{}, err
			}
		}

		c.log.Debug("llm: request",
			"endpoint", c.endpoint, "model", c.model,
			"attempt", attempt, "max_attempts", maxAttempts)
		start := time.Now()
		resp, retryable, err := c.do(ctx, body)
		if err == nil {
			c.log.Debug("llm: response ok",
				"model", resp.Model, "attempt", attempt,
				"duration", time.Since(start),
				"total_tokens", resp.Usage.TotalTokens, "cost", resp.Usage.Cost)
			return resp, nil
		}
		lastErr = err
		if !retryable {
			c.log.Error("llm: request failed (non-retryable)",
				"model", c.model, "attempt", attempt, "duration", time.Since(start), "err", err)
			return Response{}, err
		}
		c.log.Warn("llm: request failed, will retry",
			"model", c.model, "attempt", attempt, "max_attempts", maxAttempts,
			"duration", time.Since(start), "err", err)
	}
	c.log.Error("llm: all attempts exhausted",
		"model", c.model, "max_attempts", maxAttempts, "err", lastErr)
	return Response{}, fmt.Errorf("llm: exhausted %d attempts: %w", maxAttempts, lastErr)
}

func (c *openAICompat) buildRequest(req Request) chatRequest {
	msgs := make([]chatMessage, len(req.Messages))
	for i, m := range req.Messages {
		msgs[i] = chatMessage{Role: string(m.Role), Content: m.Content}
	}
	cr := chatRequest{
		Model:       c.model,
		Messages:    msgs,
		Temperature: req.Temperature,
		MaxTokens:   req.MaxTokens,
	}
	if req.JSONMode {
		cr.ResponseFormat = &respFormat{Type: "json_object"}
	}
	return cr
}

// do делает один HTTP-запрос. Второй результат — можно ли ретраить ошибку.
func (c *openAICompat) do(ctx context.Context, body []byte) (Response, bool, error) {
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.endpoint, bytes.NewReader(body))
	if err != nil {
		return Response{}, false, fmt.Errorf("llm: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")
	if c.apiKey != "" {
		httpReq.Header.Set("Authorization", "Bearer "+c.apiKey)
	}

	resp, err := c.hc.Do(httpReq)
	if err != nil {
		// Сетевой сбой — ретраибелен (ctx.Err проверит wait на следующем заходе).
		return Response{}, true, fmt.Errorf("llm: request: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	raw, err := io.ReadAll(io.LimitReader(resp.Body, maxResponseBody))
	if err != nil {
		return Response{}, true, fmt.Errorf("llm: read body: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		retryable := resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500
		return Response{}, retryable, fmt.Errorf("llm: status %d: %s",
			resp.StatusCode, snippet(raw))
	}

	var cr chatResponse
	if err := json.Unmarshal(raw, &cr); err != nil {
		return Response{}, false, fmt.Errorf("llm: decode response: %w (body: %s)", err, snippet(raw))
	}
	if cr.Error != nil && cr.Error.Message != "" {
		return Response{}, false, fmt.Errorf("llm: provider error: %s", cr.Error.Message)
	}
	if len(cr.Choices) == 0 {
		return Response{}, false, fmt.Errorf("llm: empty choices (body: %s)", snippet(raw))
	}

	return Response{
		Content: cr.Choices[0].Message.Content,
		Model:   cr.Model,
		Usage: Usage{
			PromptTokens:     cr.Usage.PromptTokens,
			CompletionTokens: cr.Usage.CompletionTokens,
			TotalTokens:      cr.Usage.TotalTokens,
			Cost:             cr.Usage.Cost,
		},
	}, false, nil
}

// wait выдерживает паузу перед ретраем (линейный backoff), уважая ctx.
func (c *openAICompat) wait(ctx context.Context, attempt int) error {
	d := c.retryWait * time.Duration(attempt-1)
	if d <= 0 {
		return ctx.Err()
	}
	t := time.NewTimer(d)
	defer t.Stop()
	select {
	case <-ctx.Done():
		return fmt.Errorf("llm: %w", ctx.Err())
	case <-t.C:
		return nil
	}
}

// snippet обрезает тело для сообщения об ошибке.
func snippet(b []byte) string {
	const max = 300
	s := strings.TrimSpace(string(b))
	if len(s) > max {
		return s[:max] + "…"
	}
	return s
}