transcriber/internal/adapter/speechkit/speechkit.go

package speechkit

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
)

// Addresses (host:port) of the Yandex Cloud gRPC APIs this package dials.
const (
	// SpeechKitEndpoint is the address used for the speech-to-text client.
	SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"
	// OperationEndpoint is the address used for the Operations API client.
	OperationEndpoint = "operation.api.cloud.yandex.net:443"
)

// RecognitionModel is the model name placed into recognition requests.
const RecognitionModel = "deferred-general"

// SpeechKitService bundles the gRPC connections, the generated API clients
// built on top of them, and the credentials sent with every request.
type SpeechKitService struct {
	// Underlying connections; retained so that Close can release them.
	sttConn, opConn *grpc.ClientConn

	// Generated clients: sttClient for recognition calls, opClient for
	// operation status lookups.
	sttClient stt.AsyncRecognizerClient
	opClient  operation.OperationServiceClient

	// Values attached to outgoing calls as the "authorization" and
	// "x-folder-id" metadata entries.
	apiKey, folderID string
}

// NewSpeechKitService builds a SpeechKitService configured from the
// environment.
//
// The API key is read from YANDEX_CLOUD_API_KEY and the folder ID from
// YANDEX_CLOUD_FOLDER_ID. If either is unset or empty, an error listing the
// missing variable names is returned. Otherwise TLS-secured gRPC clients are
// created for SpeechKitEndpoint and OperationEndpoint.
//
// The caller should call Close on the returned service to release both
// connections.
func NewSpeechKitService() (*SpeechKitService, error) {
	apiKey := os.Getenv("YANDEX_CLOUD_API_KEY")
	folderID := os.Getenv("YANDEX_CLOUD_FOLDER_ID")

	// Name every absent variable so a misconfigured deployment can be fixed
	// without having to read the source.
	var missing []string
	if apiKey == "" {
		missing = append(missing, "YANDEX_CLOUD_API_KEY")
	}
	if folderID == "" {
		missing = append(missing, "YANDEX_CLOUD_FOLDER_ID")
	}
	if len(missing) > 0 {
		return nil, fmt.Errorf("missing required Yandex Cloud environment variables: %s", strings.Join(missing, ", "))
	}

	// Secure client for SpeechKit; no custom TLS configuration is supplied.
	creds := credentials.NewTLS(nil)
	sttConn, err := grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
	}

	// Secure client for the Operations API, sharing the same credentials.
	opConn, err := grpc.NewClient(OperationEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		// Best-effort cleanup of the first connection: the Operations API
		// failure is the error worth reporting, so a close error is dropped.
		_ = sttConn.Close()
		return nil, fmt.Errorf("failed to connect to Operations API: %w", err)
	}

	return &SpeechKitService{
		sttConn:   sttConn,
		opConn:    opConn,
		sttClient: stt.NewAsyncRecognizerClient(sttConn),
		opClient:  operation.NewOperationServiceClient(opConn),
		apiKey:    apiKey,
		folderID:  folderID,
	}, nil
}

// Close releases both gRPC connections held by the service.
//
// Each non-nil connection is closed regardless of whether closing the other
// one failed. The returned error combines every close failure (via
// errors.Join), so no failure is hidden when both connections report one; it
// is nil when nothing failed.
func (s *SpeechKitService) Close() error {
	var sttErr, opErr error
	if s.sttConn != nil {
		sttErr = s.sttConn.Close()
	}
	if s.opConn != nil {
		opErr = s.opConn.Close()
	}
	// errors.Join skips nil values and returns nil when all of them are nil.
	return errors.Join(sttErr, opErr)
}

// RecognizeFileFromS3 starts asynchronous recognition of the audio file
// referenced by s3URI and returns the ID of the operation that was started.
//
// The request uses the RecognitionModel model, declares the audio as an
// OGG/Opus container, enables text normalization with literature text (the
// profanity filter stays off), selects full-data processing and turns on
// speaker labeling. An error is returned if the RecognizeFile call fails.
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
	// The API key and folder ID travel as outgoing gRPC metadata.
	ctx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)

	// Assemble the request from its parts.
	audioFormat := &stt.AudioFormatOptions{
		AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
			ContainerAudio: &stt.ContainerAudio{
				ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
			},
		},
	}
	normalization := &stt.TextNormalizationOptions{
		TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
		ProfanityFilter:   false,
		LiteratureText:    true,
	}
	modelOptions := &stt.RecognitionModelOptions{
		Model:               RecognitionModel,
		AudioFormat:         audioFormat,
		TextNormalization:   normalization,
		AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
	}
	request := &stt.RecognizeFileRequest{
		AudioSource:      &stt.RecognizeFileRequest_Uri{Uri: s3URI},
		RecognitionModel: modelOptions,
		SpeakerLabeling: &stt.SpeakerLabelingOptions{
			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
		},
	}

	// Send the request; only the operation ID is handed back to the caller.
	started, err := s.sttClient.RecognizeFile(ctx, request)
	if err != nil {
		return "", fmt.Errorf("failed to start recognition: %w", err)
	}
	return started.Id, nil
}

// GetRecognitionText fetches the recognition result of the operation
// identified by operationID and returns its normalized text.
//
// The whole GetRecognition response stream is read. For every message that
// carries a final refinement with normalized text, the text of each
// alternative is appended to the result followed by a single space (so a
// non-empty result ends with a space). All other messages are ignored.
//
// An error is returned if the stream cannot be opened or if receiving fails
// for any reason other than the end of the stream.
func (s *SpeechKitService) GetRecognitionText(operationID string) (string, error) {
	ctx := context.Background()

	// Attach the API key and folder ID as outgoing gRPC metadata.
	ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
	ctx = metadata.AppendToOutgoingContext(ctx, "x-folder-id", s.folderID)

	req := &stt.GetRecognitionRequest{
		OperationId: operationID,
	}

	stream, err := s.sttClient.GetRecognition(ctx, req)
	if err != nil {
		return "", fmt.Errorf("failed to get recognition stream: %w", err)
	}

	var sb strings.Builder

	for {
		resp, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			// End of stream is detected by error identity rather than by
			// comparing the error message text.
			break
		}
		if err != nil {
			return "", fmt.Errorf("failed to receive recognition response: %w", err)
		}

		refinement := resp.GetFinalRefinement()
		if refinement == nil {
			continue
		}
		text := refinement.GetNormalizedText()
		if text == nil {
			continue
		}
		// NOTE(review): every alternative is concatenated, not just the
		// first one — confirm this is the intended output.
		for _, alt := range text.Alternatives {
			sb.WriteString(alt.Text)
			sb.WriteString(" ")
		}
	}

	return sb.String(), nil
}

// CheckOperationStatus looks up the operation identified by operationID
// through the Operations API client and returns it as received. An error is
// returned if the lookup call fails.
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
	// The API key and folder ID travel as outgoing gRPC metadata.
	ctx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)

	request := &operation.GetOperationRequest{OperationId: operationID}

	result, err := s.opClient.Get(ctx, request)
	if err != nil {
		return nil, fmt.Errorf("failed to get operation status: %w", err)
	}
	return result, nil
}