// Package speechkit wraps the Yandex SpeechKit STT v3 gRPC API for
// asynchronous recognition of audio files.
package speechkit

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"
)

const (
	// SpeechKitEndpoint is the host:port of the gRPC endpoint the service dials.
	SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"
)

// SpeechKitService holds a gRPC connection together with the recognizer and
// operation clients created on top of it, plus the credentials read from the
// environment.
type SpeechKitService struct {
	conn      *grpc.ClientConn
	sttClient stt.AsyncRecognizerClient
	opClient  operation.OperationServiceClient
	apiKey    string
	// folderID is validated and stored by NewSpeechKitService but is not read
	// by any method in this file.
	folderID string
}

// NewSpeechKitService builds a SpeechKitService from the environment.
//
// It reads YANDEX_CLOUD_API_KEY and YANDEX_CLOUD_FOLDER_ID and returns an
// error if either is empty, or if the gRPC client cannot be created. The
// caller owns the returned service and should call Close when done with it.
func NewSpeechKitService() (*SpeechKitService, error) {
	apiKey := os.Getenv("YANDEX_CLOUD_API_KEY")
	folderID := os.Getenv("YANDEX_CLOUD_FOLDER_ID")
	if apiKey == "" || folderID == "" {
		return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
	}

	// Create a TLS-secured connection; a nil config selects the TLS defaults.
	creds := credentials.NewTLS(nil)
	conn, err := grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
	}

	// Both clients share the single connection created above.
	return &SpeechKitService{
		conn:      conn,
		sttClient: stt.NewAsyncRecognizerClient(conn),
		opClient:  operation.NewOperationServiceClient(conn),
		apiKey:    apiKey,
		folderID:  folderID,
	}, nil
}

// Close closes the underlying gRPC connection. It is a no-op returning nil
// when the receiver or its connection is nil.
func (s *SpeechKitService) Close() error {
	if s == nil || s.conn == nil {
		return nil
	}
	return s.conn.Close()
}

// authContext returns ctx with the API-key authorization header appended to
// the outgoing gRPC metadata.
func (s *SpeechKitService) authContext(ctx context.Context) context.Context {
	return metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
}

// RecognizeFileFromS3 starts asynchronous recognition of the audio file at
// s3URI and returns the ID of the operation that was started.
//
// The request is fixed to the "general" model, OGG/Opus container audio, text
// normalization with literature text enabled and the profanity filter off,
// full-data audio processing, and speaker labeling enabled.
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
	// Attach the authorization header to the outgoing call.
	ctx := s.authContext(context.Background())

	// Build the recognition request.
	req := &stt.RecognizeFileRequest{
		AudioSource: &stt.RecognizeFileRequest_Uri{
			Uri: s3URI,
		},
		RecognitionModel: &stt.RecognitionModelOptions{
			Model: "general", // use the general-purpose model
			AudioFormat: &stt.AudioFormatOptions{
				AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
					ContainerAudio: &stt.ContainerAudio{
						ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
					},
				},
			},
			TextNormalization: &stt.TextNormalizationOptions{
				TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
				ProfanityFilter:   false,
				LiteratureText:    true,
			},
			AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
		},
		SpeakerLabeling: &stt.SpeakerLabelingOptions{
			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
		},
	}

	// Send the request.
	op, err := s.sttClient.RecognizeFile(ctx, req)
	if err != nil {
		return "", fmt.Errorf("failed to start recognition: %w", err)
	}
	return op.Id, nil
}

// GetRecognitionResult fetches the recognition result for operationID by
// draining the GetRecognition response stream.
//
// It returns every response received, in arrival order. If the stream fails
// before reaching its end, the responses collected so far are discarded and
// an error is returned.
func (s *SpeechKitService) GetRecognitionResult(operationID string) ([]*stt.StreamingResponse, error) {
	// Attach the authorization header to the outgoing call.
	ctx := s.authContext(context.Background())

	req := &stt.GetRecognitionRequest{
		OperationId: operationID,
	}

	stream, err := s.sttClient.GetRecognition(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("failed to get recognition stream: %w", err)
	}

	var responses []*stt.StreamingResponse
	for {
		resp, err := stream.Recv()
		// io.EOF marks a normal end of stream. It is matched by identity via
		// errors.Is rather than by message text, so a wrapped EOF still counts.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("failed to receive recognition response: %w", err)
		}
		responses = append(responses, resp)
	}
	return responses, nil
}

// CheckOperationStatus returns the current state of the recognition operation
// identified by operationID.
//
// The call carries the same authorization header as the other RPC methods.
// NOTE(review): the operation client shares the connection to
// SpeechKitEndpoint; confirm that endpoint serves OperationService.
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
	ctx := s.authContext(context.Background())

	op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
		OperationId: operationID,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get operation status: %w", err)
	}
	return op, nil
}

// ExtractTranscriptionText concatenates the text of every alternative of
// every final result in responses.
//
// Each alternative is followed by a single space, so a non-empty result ends
// with a trailing space. Responses that carry no final result are skipped.
// The result is empty when there is nothing to extract.
func ExtractTranscriptionText(responses []*stt.StreamingResponse) string {
	// A builder avoids re-copying the accumulated text on every append.
	var text strings.Builder
	for _, resp := range responses {
		final := resp.GetFinal()
		if final == nil {
			continue
		}
		for _, alt := range final.Alternatives {
			text.WriteString(alt.Text)
			text.WriteByte(' ')
		}
	}
	return text.String()
}