Add initial audio recognition requests

2025-08-11 15:26:55 +03:00
parent c1da998c02
commit 672d8573fc
9 changed files with 396 additions and 64 deletions
--- a/internal/service/speechkit/speechkit.go
+++ b/internal/service/speechkit/speechkit.go
@@ -0,0 +1,160 @@
+package speechkit
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials"
+	"google.golang.org/grpc/metadata"
+
+	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
+	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
+)
+
+// SpeechKitEndpoint is the host:port that NewSpeechKitService dials to reach
+// the speech-to-text gRPC API.
+const SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"
+
+// SpeechKitService bundles one gRPC connection with the clients created on it
+// and the credentials used to authorize calls.
+type SpeechKitService struct {
+	// apiKey is sent as "Api-Key <key>" in the authorization metadata.
+	apiKey string
+	// folderID is the Yandex Cloud folder ID supplied at construction.
+	folderID string
+
+	// conn is the connection shared by both clients; Close releases it.
+	conn *grpc.ClientConn
+	// sttClient starts recognitions and streams their results.
+	sttClient stt.AsyncRecognizerClient
+	// opClient looks up long-running operations by ID.
+	opClient operation.OperationServiceClient
+}
+
+// NewSpeechKitService builds a SpeechKitService from the process environment.
+//
+// It reads YANDEX_CLOUD_API_KEY and YANDEX_CLOUD_FOLDER_ID and returns an error
+// when either is empty. Otherwise it creates a TLS-secured gRPC client for
+// SpeechKitEndpoint and derives both the recognizer client and the operation
+// client from that single connection. The caller should release the returned
+// service with Close.
+//
+// NOTE(review): the operation client reuses the STT connection; confirm that
+// SpeechKitEndpoint actually serves OperationService, since Yandex Cloud
+// appears to document a dedicated operation endpoint.
+func NewSpeechKitService() (*SpeechKitService, error) {
+	key, folder := os.Getenv("YANDEX_CLOUD_API_KEY"), os.Getenv("YANDEX_CLOUD_FOLDER_ID")
+	if key == "" || folder == "" {
+		return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
+	}
+
+	// TLS transport credentials without a custom tls.Config (nil is passed).
+	tlsOption := grpc.WithTransportCredentials(credentials.NewTLS(nil))
+	clientConn, err := grpc.NewClient(SpeechKitEndpoint, tlsOption)
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
+	}
+
+	svc := &SpeechKitService{
+		conn:      clientConn,
+		sttClient: stt.NewAsyncRecognizerClient(clientConn),
+		opClient:  operation.NewOperationServiceClient(clientConn),
+		apiKey:    key,
+		folderID:  folder,
+	}
+	return svc, nil
+}
+
+// Close closes the underlying gRPC connection and returns whatever error the
+// connection's own Close reports.
+func (s *SpeechKitService) Close() error {
+	closeErr := s.conn.Close()
+	return closeErr
+}
+
+// RecognizeFileFromS3 starts asynchronous recognition of the audio referenced
+// by s3URI and returns the ID of the operation that tracks it.
+//
+// The request is fixed to: model "general", OGG/Opus container audio, text
+// normalization enabled with literature text on and the profanity filter off,
+// full-data audio processing, and speaker labeling enabled. The call runs on
+// context.Background(), so it carries no deadline. An RPC failure is wrapped
+// and returned together with an empty ID.
+func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
+	// Authorize the call with the API key through outgoing gRPC metadata.
+	authCtx := metadata.AppendToOutgoingContext(
+		context.Background(), "authorization", "Api-Key "+s.apiKey,
+	)
+
+	// Describe the audio container the service should expect.
+	audioFormat := &stt.AudioFormatOptions{
+		AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
+			ContainerAudio: &stt.ContainerAudio{
+				ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
+			},
+		},
+	}
+
+	// Post-processing applied to the recognized text.
+	normalization := &stt.TextNormalizationOptions{
+		TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
+		ProfanityFilter:   false,
+		LiteratureText:    true,
+	}
+
+	// Use the general-purpose model.
+	model := &stt.RecognitionModelOptions{
+		Model:               "general",
+		AudioFormat:         audioFormat,
+		TextNormalization:   normalization,
+		AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
+	}
+
+	request := &stt.RecognizeFileRequest{
+		AudioSource:      &stt.RecognizeFileRequest_Uri{Uri: s3URI},
+		RecognitionModel: model,
+		SpeakerLabeling: &stt.SpeakerLabelingOptions{
+			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
+		},
+	}
+
+	// Submit the request; the response is the operation handle.
+	started, err := s.sttClient.RecognizeFile(authCtx, request)
+	if err != nil {
+		return "", fmt.Errorf("failed to start recognition: %w", err)
+	}
+
+	return started.Id, nil
+}
+
+// GetRecognitionResult fetches every recognition response produced for the
+// operation identified by operationID.
+//
+// It opens the GetRecognition server stream with API-key authorization and
+// reads it until the stream ends, collecting the messages in arrival order.
+// End of stream is detected with errors.Is(err, io.EOF) instead of comparing
+// the error text, so the check does not depend on the wording of the error
+// message. Any other receive error discards the partial results and is
+// returned wrapped. The call runs on context.Background(), so it carries no
+// deadline.
+func (s *SpeechKitService) GetRecognitionResult(operationID string) ([]*stt.StreamingResponse, error) {
+	// Authorize the call with the API key through outgoing gRPC metadata.
+	ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Api-Key "+s.apiKey)
+
+	stream, err := s.sttClient.GetRecognition(ctx, &stt.GetRecognitionRequest{
+		OperationId: operationID,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to get recognition stream: %w", err)
+	}
+
+	var responses []*stt.StreamingResponse
+	for {
+		resp, err := stream.Recv()
+		if errors.Is(err, io.EOF) {
+			// The server finished the stream normally.
+			return responses, nil
+		}
+		if err != nil {
+			return nil, fmt.Errorf("failed to receive recognition response: %w", err)
+		}
+		responses = append(responses, resp)
+	}
+}
+
+// CheckOperationStatus returns the current state of the long-running operation
+// identified by operationID.
+//
+// The request carries the same "Api-Key" authorization metadata that the
+// recognition calls of this service attach; without it the call would be sent
+// with no credentials at all. The call runs on context.Background(), so it
+// carries no deadline. An RPC failure is wrapped and returned with a nil
+// operation.
+func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
+	// Authorize the call with the API key through outgoing gRPC metadata.
+	ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Api-Key "+s.apiKey)
+
+	op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
+		OperationId: operationID,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to get operation status: %w", err)
+	}
+
+	return op, nil
+}
+
+// ExtractTranscriptionText concatenates the text of the final recognition
+// results contained in responses.
+//
+// Responses without a final result are skipped. For each final result, the
+// text of every alternative is appended followed by a single space, so a
+// non-empty result always ends with a trailing space. An empty or nil slice
+// yields "".
+func ExtractTranscriptionText(responses []*stt.StreamingResponse) string {
+	// Accumulate into a byte buffer: repeated string concatenation would copy
+	// the whole transcript on every append.
+	var buf []byte
+
+	for _, resp := range responses {
+		final := resp.GetFinal()
+		if final == nil {
+			continue
+		}
+		// The generated getters tolerate nil receivers, so a nil entry
+		// contributes an empty text instead of panicking.
+		for _, alt := range final.GetAlternatives() {
+			buf = append(buf, alt.GetText()...)
+			buf = append(buf, ' ')
+		}
+	}
+
+	return string(buf)
+}