182 lines
5.1 KiB
Go
182 lines
5.1 KiB
Go
package speechkit
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
)
|
|
|
|
// SpeechKitEndpoint is the gRPC target (host:port) used for the
// speech-to-text connection.
const SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"

// OperationEndpoint is the gRPC target (host:port) used for the
// Operations API connection.
const OperationEndpoint = "operation.api.cloud.yandex.net:443"

// RecognitionModel is the model name placed into every recognition request.
const RecognitionModel = "deferred-general"
|
|
|
|
type SpeechKitService struct {
|
|
sttConn *grpc.ClientConn
|
|
opConn *grpc.ClientConn
|
|
sttClient stt.AsyncRecognizerClient
|
|
opClient operation.OperationServiceClient
|
|
apiKey string
|
|
folderID string
|
|
}
|
|
|
|
func NewSpeechKitService() (*SpeechKitService, error) {
|
|
apiKey := os.Getenv("YANDEX_CLOUD_API_KEY")
|
|
folderID := os.Getenv("YANDEX_CLOUD_FOLDER_ID")
|
|
|
|
if apiKey == "" || folderID == "" {
|
|
return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
|
|
}
|
|
|
|
// Создаем защищенное соединение для SpeechKit
|
|
creds := credentials.NewTLS(nil)
|
|
sttConn, err := grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
|
|
}
|
|
|
|
// Создаем защищенное соединение для Operations API
|
|
opConn, err := grpc.NewClient(OperationEndpoint, grpc.WithTransportCredentials(creds))
|
|
if err != nil {
|
|
sttConn.Close()
|
|
return nil, fmt.Errorf("failed to connect to Operations API: %w", err)
|
|
}
|
|
|
|
sttClient := stt.NewAsyncRecognizerClient(sttConn)
|
|
opClient := operation.NewOperationServiceClient(opConn)
|
|
|
|
return &SpeechKitService{
|
|
sttConn: sttConn,
|
|
opConn: opConn,
|
|
sttClient: sttClient,
|
|
opClient: opClient,
|
|
apiKey: apiKey,
|
|
folderID: folderID,
|
|
}, nil
|
|
}
|
|
|
|
func (s *SpeechKitService) Close() error {
|
|
var err1, err2 error
|
|
if s.sttConn != nil {
|
|
err1 = s.sttConn.Close()
|
|
}
|
|
if s.opConn != nil {
|
|
err2 = s.opConn.Close()
|
|
}
|
|
if err1 != nil {
|
|
return err1
|
|
}
|
|
return err2
|
|
}
|
|
|
|
// RecognizeFileFromS3 запускает асинхронное распознавание файла из S3
|
|
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
|
|
ctx := context.Background()
|
|
|
|
// Добавляем авторизацию и folder_id в контекст
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "x-folder-id", s.folderID)
|
|
|
|
// Создаем запрос на распознавание
|
|
req := &stt.RecognizeFileRequest{
|
|
AudioSource: &stt.RecognizeFileRequest_Uri{
|
|
Uri: s3URI,
|
|
},
|
|
RecognitionModel: &stt.RecognitionModelOptions{
|
|
Model: RecognitionModel,
|
|
AudioFormat: &stt.AudioFormatOptions{
|
|
AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
|
|
ContainerAudio: &stt.ContainerAudio{
|
|
ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
|
|
},
|
|
},
|
|
},
|
|
TextNormalization: &stt.TextNormalizationOptions{
|
|
TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
|
|
ProfanityFilter: false,
|
|
LiteratureText: true,
|
|
},
|
|
AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
|
|
},
|
|
SpeakerLabeling: &stt.SpeakerLabelingOptions{
|
|
SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
|
|
},
|
|
}
|
|
|
|
// Отправляем запрос
|
|
op, err := s.sttClient.RecognizeFile(ctx, req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to start recognition: %w", err)
|
|
}
|
|
|
|
return op.Id, nil
|
|
}
|
|
|
|
// GetRecognitionResult получает результат распознавания по ID операции
|
|
func (s *SpeechKitService) GetRecognitionText(operationID string) (string, error) {
|
|
ctx := context.Background()
|
|
|
|
// Добавляем авторизацию и folder_id в контекст
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "x-folder-id", s.folderID)
|
|
|
|
req := &stt.GetRecognitionRequest{
|
|
OperationId: operationID,
|
|
}
|
|
|
|
stream, err := s.sttClient.GetRecognition(ctx, req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to get recognition stream: %w", err)
|
|
}
|
|
|
|
var sb strings.Builder
|
|
|
|
for {
|
|
resp, err := stream.Recv()
|
|
if err != nil {
|
|
if err.Error() == "EOF" {
|
|
break
|
|
}
|
|
return "", fmt.Errorf("failed to receive recognition response: %w", err)
|
|
}
|
|
if refinement := resp.GetFinalRefinement(); refinement != nil {
|
|
if text := refinement.GetNormalizedText(); text != nil {
|
|
for _, alt := range text.Alternatives {
|
|
sb.WriteString(alt.Text)
|
|
sb.WriteString(" ")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return sb.String(), nil
|
|
}
|
|
|
|
// CheckOperationStatus проверяет статус операции распознавания
|
|
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
|
|
ctx := context.Background()
|
|
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
|
|
ctx = metadata.AppendToOutgoingContext(ctx, "x-folder-id", s.folderID)
|
|
|
|
op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
|
|
OperationId: operationID,
|
|
})
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get operation status: %w", err)
|
|
}
|
|
|
|
return op, nil
|
|
}
|