package yandex

import (
	"context"
	"errors"
	"fmt"
	"io"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
)

const (
	// SpeechKitEndpoint is the gRPC target used for the speech-to-text client.
	SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"
	// OperationEndpoint is the gRPC target used for the operations client.
	OperationEndpoint = "operation.api.cloud.yandex.net:443"
	// RecognitionModel is the model name sent with every recognition request.
	RecognitionModel = "deferred-general"
)

// speechKitConfig carries the credentials needed to build a speechKitService.
// Both fields are required; newSpeechKitService rejects empty values.
type speechKitConfig struct {
	ApiKey   string
	FolderID string
}

// speechKitService bundles the gRPC connections and generated clients for the
// speech-to-text and operations endpoints, together with the credentials that
// are attached to every outgoing call.
type speechKitService struct {
	sttConn   *grpc.ClientConn
	opConn    *grpc.ClientConn
	sttClient stt.AsyncRecognizerClient
	opClient  operation.OperationServiceClient
	apiKey    string
	folderID  string
}

// newSpeechKitService validates cfg and creates TLS-secured gRPC clients for
// SpeechKitEndpoint and OperationEndpoint.
//
// It returns an error when either cfg.ApiKey or cfg.FolderID is empty, or when
// one of the gRPC clients cannot be created. On failure no connection is left
// open. The caller owns the returned service and must call Close on it.
func newSpeechKitService(cfg speechKitConfig) (*speechKitService, error) {
	apiKey := cfg.ApiKey
	folderID := cfg.FolderID

	if apiKey == "" || folderID == "" {
		return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
	}

	// One set of TLS credentials (default configuration) is shared by both
	// connections.
	creds := credentials.NewTLS(nil)

	// Secure connection for SpeechKit.
	sttConn, err := grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
	}

	// Secure connection for the Operations API.
	opConn, err := grpc.NewClient(OperationEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		// Best-effort cleanup: the creation error is the one the caller needs,
		// so a secondary Close error is deliberately dropped.
		_ = sttConn.Close()
		return nil, fmt.Errorf("failed to connect to Operations API: %w", err)
	}

	return &speechKitService{
		sttConn:   sttConn,
		opConn:    opConn,
		sttClient: stt.NewAsyncRecognizerClient(sttConn),
		opClient:  operation.NewOperationServiceClient(opConn),
		apiKey:    apiKey,
		folderID:  folderID,
	}, nil
}

// Close closes both gRPC connections. Both are always attempted; if both fail,
// the SpeechKit connection's error is returned and the Operations one is
// dropped. Nil connections are skipped.
func (s *speechKitService) Close() error {
	var sttErr, opErr error

	if s.sttConn != nil {
		sttErr = s.sttConn.Close()
	}
	if s.opConn != nil {
		opErr = s.opConn.Close()
	}

	if sttErr != nil {
		return sttErr
	}
	return opErr
}

// authContext returns ctx extended with the outgoing gRPC metadata that every
// call made by this service carries: the "authorization" header in the
// "Api-Key <key>" form and the "x-folder-id" header.
func (s *speechKitService) authContext(ctx context.Context) context.Context {
	return metadata.AppendToOutgoingContext(ctx,
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)
}

// recognizeFileFromS3 starts asynchronous recognition of the audio file at
// s3URI and returns the ID of the operation created for it.
//
// The request is fixed to the RecognitionModel model, OGG/Opus container
// audio, enabled text normalization (literature text on, profanity filter
// off), full-data processing and enabled speaker labeling. The call uses a
// background context, so it carries no deadline.
func (s *speechKitService) recognizeFileFromS3(s3URI string) (string, error) {
	ctx := s.authContext(context.Background())

	// Build the recognition request.
	req := &stt.RecognizeFileRequest{
		AudioSource: &stt.RecognizeFileRequest_Uri{
			Uri: s3URI,
		},
		RecognitionModel: &stt.RecognitionModelOptions{
			Model: RecognitionModel,
			AudioFormat: &stt.AudioFormatOptions{
				AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
					ContainerAudio: &stt.ContainerAudio{
						ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
					},
				},
			},
			TextNormalization: &stt.TextNormalizationOptions{
				TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
				ProfanityFilter:   false,
				LiteratureText:    true,
			},
			AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
		},
		SpeakerLabeling: &stt.SpeakerLabelingOptions{
			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
		},
	}

	// Send the request.
	op, err := s.sttClient.RecognizeFile(ctx, req)
	if err != nil {
		return "", fmt.Errorf("failed to start recognition: %w", err)
	}

	return op.Id, nil
}

// getRecognitionText fetches the recognition result for operationID and
// returns its normalized text.
//
// It reads the response stream to the end and, for every message carrying a
// final refinement with normalized text, appends each alternative's text
// followed by a single space. The result therefore ends with a trailing space
// whenever any text was received. Any stream error other than a clean
// end-of-stream aborts the call and the partial text is discarded.
func (s *speechKitService) getRecognitionText(operationID string) (string, error) {
	ctx := s.authContext(context.Background())

	req := &stt.GetRecognitionRequest{
		OperationId: operationID,
	}

	stream, err := s.sttClient.GetRecognition(ctx, req)
	if err != nil {
		return "", fmt.Errorf("failed to get recognition stream: %w", err)
	}

	var sb strings.Builder
	for {
		resp, err := stream.Recv()
		// Compare against the io.EOF sentinel instead of the error text, so
		// wrapped EOFs are recognized and unrelated errors whose message
		// happens to be "EOF" are not mistaken for a clean end of stream.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return "", fmt.Errorf("failed to receive recognition response: %w", err)
		}

		refinement := resp.GetFinalRefinement()
		if refinement == nil {
			continue
		}
		text := refinement.GetNormalizedText()
		if text == nil {
			continue
		}
		for _, alt := range text.Alternatives {
			sb.WriteString(alt.Text)
			sb.WriteString(" ")
		}
	}

	return sb.String(), nil
}

// checkOperationStatus fetches the operation identified by operationID from
// the operations client and returns it unmodified. The call uses a background
// context, so it carries no deadline.
func (s *speechKitService) checkOperationStatus(operationID string) (*operation.Operation, error) {
	ctx := s.authContext(context.Background())

	op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
		OperationId: operationID,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get operation status: %w", err)
	}

	return op, nil
}