Files
transcriber/internal/adapter/speechkit/speechkit.go

182 lines
5.1 KiB
Go

package speechkit
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"
)
// Fixed settings used by SpeechKitService when talking to Yandex Cloud.
const (
	// SpeechKitEndpoint is the host:port the speech-to-text gRPC client is
	// created against.
	SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"

	// OperationEndpoint is the host:port the operation-status gRPC client is
	// created against.
	OperationEndpoint = "operation.api.cloud.yandex.net:443"

	// RecognitionModel is the model name placed into every recognition request.
	RecognitionModel = "deferred-general"
)
// SpeechKitService holds the gRPC connections, the generated clients built on
// top of them, and the credentials that are attached to every outgoing call.
type SpeechKitService struct {
	// Connections for the recognition and operation endpoints; kept so that
	// Close can release them.
	sttConn, opConn *grpc.ClientConn

	// Generated clients created from sttConn and opConn respectively.
	sttClient stt.AsyncRecognizerClient
	opClient  operation.OperationServiceClient

	// apiKey is sent as "Api-Key <apiKey>" in the authorization metadata and
	// folderID as the x-folder-id metadata of each request.
	apiKey, folderID string
}
// NewSpeechKitService builds a SpeechKitService from the YANDEX_CLOUD_API_KEY
// and YANDEX_CLOUD_FOLDER_ID environment variables.
//
// It creates one TLS-secured gRPC client for SpeechKitEndpoint and one for
// OperationEndpoint. An error is returned when either environment variable is
// empty or when a client cannot be created; if the second client fails, the
// first connection is closed before returning.
func NewSpeechKitService() (*SpeechKitService, error) {
	svc := &SpeechKitService{
		apiKey:   os.Getenv("YANDEX_CLOUD_API_KEY"),
		folderID: os.Getenv("YANDEX_CLOUD_FOLDER_ID"),
	}
	if svc.apiKey == "" || svc.folderID == "" {
		return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
	}

	// Both connections use the same TLS credentials (default TLS config).
	creds := credentials.NewTLS(nil)
	var err error

	// Secure connection for SpeechKit.
	svc.sttConn, err = grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
	}

	// Secure connection for the Operations API.
	svc.opConn, err = grpc.NewClient(OperationEndpoint, grpc.WithTransportCredentials(creds))
	if err != nil {
		// Do not leave the first connection open on failure.
		svc.sttConn.Close()
		return nil, fmt.Errorf("failed to connect to Operations API: %w", err)
	}

	svc.sttClient = stt.NewAsyncRecognizerClient(svc.sttConn)
	svc.opClient = operation.NewOperationServiceClient(svc.opConn)
	return svc, nil
}
// Close closes both gRPC connections held by the service, skipping any that
// are nil. Both closes are always attempted; the error from the recognition
// connection takes precedence, otherwise the error from the operation
// connection (possibly nil) is returned.
func (s *SpeechKitService) Close() error {
	var firstErr error
	for _, conn := range []*grpc.ClientConn{s.sttConn, s.opConn} {
		if conn == nil {
			continue
		}
		// Keep closing the remaining connection even after a failure, but
		// remember only the earliest error.
		if closeErr := conn.Close(); closeErr != nil && firstErr == nil {
			firstErr = closeErr
		}
	}
	return firstErr
}
// RecognizeFileFromS3 submits a file-recognition request for the audio located
// at s3URI and returns the ID of the operation reported by the service.
//
// The request declares the audio as an OGG/Opus container, uses the
// RecognitionModel model in full-data processing mode, enables text
// normalization with literature text (profanity filter off) and enables
// speaker labeling. The call does not wait for recognition to finish.
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
	// Attach the API key and folder ID to the outgoing request metadata.
	ctx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)

	// Assemble the recognition request from its parts.
	audioFormat := &stt.AudioFormatOptions{
		AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
			ContainerAudio: &stt.ContainerAudio{
				ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
			},
		},
	}
	normalization := &stt.TextNormalizationOptions{
		TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
		ProfanityFilter:   false,
		LiteratureText:    true,
	}
	model := &stt.RecognitionModelOptions{
		Model:               RecognitionModel,
		AudioFormat:         audioFormat,
		TextNormalization:   normalization,
		AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
	}
	request := &stt.RecognizeFileRequest{
		AudioSource:      &stt.RecognizeFileRequest_Uri{Uri: s3URI},
		RecognitionModel: model,
		SpeakerLabeling: &stt.SpeakerLabelingOptions{
			SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
		},
	}

	// Send the request.
	started, err := s.sttClient.RecognizeFile(ctx, request)
	if err != nil {
		return "", fmt.Errorf("failed to start recognition: %w", err)
	}
	return started.Id, nil
}
// GetRecognitionText fetches the recognition result for operationID and
// returns its normalized text.
//
// The result arrives as a stream of responses. Every response that carries a
// final refinement with normalized text contributes the text of each of its
// alternatives, each followed by a single space (so a non-empty result ends
// with a trailing space). All other responses are ignored.
//
// An error is returned if the stream cannot be opened or if receiving fails
// before the stream ends; no partial text is returned in that case.
func (s *SpeechKitService) GetRecognitionText(operationID string) (string, error) {
	// Attach the API key and folder ID to the outgoing request metadata.
	ctx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)

	stream, err := s.sttClient.GetRecognition(ctx, &stt.GetRecognitionRequest{
		OperationId: operationID,
	})
	if err != nil {
		return "", fmt.Errorf("failed to get recognition stream: %w", err)
	}

	var sb strings.Builder
	for {
		resp, err := stream.Recv()
		// A finished stream is signalled with the io.EOF sentinel. Compare
		// against the sentinel rather than the message text, so an unrelated
		// error that merely prints as "EOF" is not mistaken for a clean end
		// and a wrapped io.EOF is still recognized.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return "", fmt.Errorf("failed to receive recognition response: %w", err)
		}

		refinement := resp.GetFinalRefinement()
		if refinement == nil {
			continue
		}
		text := refinement.GetNormalizedText()
		if text == nil {
			continue
		}
		for _, alt := range text.Alternatives {
			sb.WriteString(alt.Text)
			sb.WriteString(" ")
		}
	}
	return sb.String(), nil
}
// CheckOperationStatus looks up the operation identified by operationID through
// the operation client and returns it as reported by the service. Any failure
// of the call is returned wrapped, with a nil operation.
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
	// Attach the API key and folder ID to the outgoing request metadata.
	ctx := metadata.AppendToOutgoingContext(
		context.Background(),
		"authorization", "Api-Key "+s.apiKey,
		"x-folder-id", s.folderID,
	)

	request := &operation.GetOperationRequest{OperationId: operationID}
	result, err := s.opClient.Get(ctx, request)
	if err != nil {
		return nil, fmt.Errorf("failed to get operation status: %w", err)
	}
	return result, nil
}