// transcriber/internal/service/speechkit/speechkit.go

package speechkit

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
)

// SpeechKitEndpoint is the host:port address that NewSpeechKitService dials
// to reach the SpeechKit speech-to-text gRPC API.
const SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"

// SpeechKitService groups a gRPC connection to SpeechKitEndpoint with the
// clients created on top of it and the credentials read from the environment
// by NewSpeechKitService.
type SpeechKitService struct {
	// conn is the connection both clients were created from; Close closes it.
	conn      *grpc.ClientConn
	// sttClient issues the RecognizeFile and GetRecognition calls.
	sttClient stt.AsyncRecognizerClient
	// opClient is used by CheckOperationStatus to look up an operation.
	opClient  operation.OperationServiceClient
	// apiKey is the value of YANDEX_CLOUD_API_KEY; it is sent as
	// "Api-Key <key>" in the "authorization" request metadata.
	apiKey    string
	// folderID is the value of YANDEX_CLOUD_FOLDER_ID. It is stored here but
	// not read by any method visible in this file.
	folderID  string
}

// NewSpeechKitService builds a SpeechKitService configured from the
// environment.
//
// It reads YANDEX_CLOUD_API_KEY and YANDEX_CLOUD_FOLDER_ID and returns an
// error if either one is empty. It then creates a TLS-secured gRPC client for
// SpeechKitEndpoint and derives the recognizer and operation clients from it.
// Call Close on the returned service to close the connection.
func NewSpeechKitService() (*SpeechKitService, error) {
	key, folder := os.Getenv("YANDEX_CLOUD_API_KEY"), os.Getenv("YANDEX_CLOUD_FOLDER_ID")
	if key == "" || folder == "" {
		return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
	}

	// Secure the channel with TLS; no custom tls.Config is supplied.
	transport := grpc.WithTransportCredentials(credentials.NewTLS(nil))
	grpcConn, err := grpc.NewClient(SpeechKitEndpoint, transport)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
	}

	// Both clients share the single connection created above.
	svc := &SpeechKitService{
		conn:      grpcConn,
		sttClient: stt.NewAsyncRecognizerClient(grpcConn),
		opClient:  operation.NewOperationServiceClient(grpcConn),
		apiKey:    key,
		folderID:  folder,
	}
	return svc, nil
}

// Close closes the underlying gRPC connection and returns whatever error the
// connection's own Close reports.
func (s *SpeechKitService) Close() error {
	closeErr := s.conn.Close()
	return closeErr
}

// RecognizeFileFromS3 starts asynchronous recognition of the audio file
// referenced by s3URI and returns the ID of the operation that was started.
//
// The request declares the audio as an OGG/Opus container, selects the
// "general" model with full-data processing, enables text normalization
// (profanity filter off, literature text on) and enables speaker labeling.
// The call is authorized with the service's API key. Any error from the
// RecognizeFile call is wrapped and returned with an empty ID.
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
	// Attach the API key to the outgoing request metadata.
	authCtx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
	)

	// Container format of the source audio.
	format := &stt.AudioFormatOptions{
		AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
			ContainerAudio: &stt.ContainerAudio{
				ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
			},
		},
	}

	normalization := &stt.TextNormalizationOptions{
		TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
		ProfanityFilter:   false,
		LiteratureText:    true,
	}

	model := &stt.RecognitionModelOptions{
		Model:               "general", // use the general-purpose model
		AudioFormat:         format,
		TextNormalization:   normalization,
		AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
	}

	// Assemble the recognition request.
	request := &stt.RecognizeFileRequest{
		AudioSource:      &stt.RecognizeFileRequest_Uri{Uri: s3URI},
		RecognitionModel: model,
		SpeakerLabeling: &stt.SpeakerLabelingOptions{
			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
		},
	}

	// Send the request; the response describes the started operation.
	started, err := s.sttClient.RecognizeFile(authCtx, request)
	if err != nil {
		return "", fmt.Errorf("failed to start recognition: %w", err)
	}

	return started.Id, nil
}

// GetRecognitionResult fetches the recognition output of the operation
// identified by operationID.
//
// It opens the GetRecognition server stream, authorized with the service's
// API key, and collects every message until the stream ends. It returns the
// collected messages in the order they were received, or a wrapped error if
// the stream cannot be opened or a receive fails before the stream ends.
func (s *SpeechKitService) GetRecognitionResult(operationID string) ([]*stt.StreamingResponse, error) {
	// Attach the API key to the outgoing request metadata.
	ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Api-Key "+s.apiKey)

	req := &stt.GetRecognitionRequest{
		OperationId: operationID,
	}

	stream, err := s.sttClient.GetRecognition(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("failed to get recognition stream: %w", err)
	}

	var responses []*stt.StreamingResponse
	for {
		resp, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			// io.EOF signals the normal end of the stream. Compare against
			// the sentinel value rather than the error's text so that other
			// errors whose message happens to be "EOF" are not mistaken for it.
			return responses, nil
		}
		if err != nil {
			return nil, fmt.Errorf("failed to receive recognition response: %w", err)
		}
		responses = append(responses, resp)
	}
}

// CheckOperationStatus looks up the recognition operation identified by
// operationID and returns it as reported by the operation service.
//
// The call is authorized with the service's API key. Any error from the Get
// call is wrapped and returned with a nil operation.
//
// NOTE(review): opClient shares the connection to SpeechKitEndpoint; confirm
// that the operation service is actually served at that address.
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
	// Attach the API key exactly as RecognizeFileFromS3 and
	// GetRecognitionResult do; without it the request carries no credentials.
	ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Api-Key "+s.apiKey)

	op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
		OperationId: operationID,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get operation status: %w", err)
	}

	return op, nil
}

// ExtractTranscriptionText concatenates the text of every alternative found
// in the final results among responses.
//
// Responses that do not carry a final result are skipped. Each alternative's
// text is followed by a single space, so a non-empty result ends with a
// trailing space. It returns the empty string when nothing is found.
func ExtractTranscriptionText(responses []*stt.StreamingResponse) string {
	// A builder avoids re-copying the accumulated text on every append.
	var text strings.Builder

	for _, resp := range responses {
		final := resp.GetFinal()
		if final == nil {
			continue
		}
		// The generated getters tolerate nil messages.
		for _, alt := range final.GetAlternatives() {
			text.WriteString(alt.GetText())
			text.WriteByte(' ')
		}
	}

	return text.String()
}