Add initial audio recognition requests
This commit is contained in:
160
internal/service/speechkit/speechkit.go
Normal file
160
internal/service/speechkit/speechkit.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package speechkit
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"

	stt "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3"
	"github.com/yandex-cloud/go-genproto/yandex/cloud/operation"
)
|
||||
|
||||
// SpeechKitEndpoint is the host:port address that NewSpeechKitService passes
// to grpc.NewClient when creating the SpeechKit connection.
const SpeechKitEndpoint = "stt.api.cloud.yandex.net:443"
|
||||
|
||||
type SpeechKitService struct {
|
||||
conn *grpc.ClientConn
|
||||
sttClient stt.AsyncRecognizerClient
|
||||
opClient operation.OperationServiceClient
|
||||
apiKey string
|
||||
folderID string
|
||||
}
|
||||
|
||||
func NewSpeechKitService() (*SpeechKitService, error) {
|
||||
apiKey := os.Getenv("YANDEX_CLOUD_API_KEY")
|
||||
folderID := os.Getenv("YANDEX_CLOUD_FOLDER_ID")
|
||||
|
||||
if apiKey == "" || folderID == "" {
|
||||
return nil, fmt.Errorf("missing required Yandex Cloud environment variables")
|
||||
}
|
||||
|
||||
// Создаем защищенное соединение
|
||||
creds := credentials.NewTLS(nil)
|
||||
conn, err := grpc.NewClient(SpeechKitEndpoint, grpc.WithTransportCredentials(creds))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to connect to SpeechKit: %w", err)
|
||||
}
|
||||
|
||||
sttClient := stt.NewAsyncRecognizerClient(conn)
|
||||
opClient := operation.NewOperationServiceClient(conn)
|
||||
|
||||
return &SpeechKitService{
|
||||
conn: conn,
|
||||
sttClient: sttClient,
|
||||
opClient: opClient,
|
||||
apiKey: apiKey,
|
||||
folderID: folderID,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *SpeechKitService) Close() error {
|
||||
return s.conn.Close()
|
||||
}
|
||||
|
||||
// RecognizeFileFromS3 запускает асинхронное распознавание файла из S3
|
||||
func (s *SpeechKitService) RecognizeFileFromS3(s3URI string) (string, error) {
|
||||
ctx := context.Background()
|
||||
|
||||
// Добавляем авторизацию в контекст
|
||||
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
|
||||
|
||||
// Создаем запрос на распознавание
|
||||
req := &stt.RecognizeFileRequest{
|
||||
AudioSource: &stt.RecognizeFileRequest_Uri{
|
||||
Uri: s3URI,
|
||||
},
|
||||
RecognitionModel: &stt.RecognitionModelOptions{
|
||||
Model: "general", // Используем общую модель
|
||||
AudioFormat: &stt.AudioFormatOptions{
|
||||
AudioFormat: &stt.AudioFormatOptions_ContainerAudio{
|
||||
ContainerAudio: &stt.ContainerAudio{
|
||||
ContainerAudioType: stt.ContainerAudio_OGG_OPUS,
|
||||
},
|
||||
},
|
||||
},
|
||||
TextNormalization: &stt.TextNormalizationOptions{
|
||||
TextNormalization: stt.TextNormalizationOptions_TEXT_NORMALIZATION_ENABLED,
|
||||
ProfanityFilter: false,
|
||||
LiteratureText: true,
|
||||
},
|
||||
AudioProcessingType: stt.RecognitionModelOptions_FULL_DATA,
|
||||
},
|
||||
SpeakerLabeling: &stt.SpeakerLabelingOptions{
|
||||
SpeakerLabeling: stt.SpeakerLabelingOptions_SPEAKER_LABELING_ENABLED,
|
||||
},
|
||||
}
|
||||
|
||||
// Отправляем запрос
|
||||
op, err := s.sttClient.RecognizeFile(ctx, req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to start recognition: %w", err)
|
||||
}
|
||||
|
||||
return op.Id, nil
|
||||
}
|
||||
|
||||
// GetRecognitionResult получает результат распознавания по ID операции
|
||||
func (s *SpeechKitService) GetRecognitionResult(operationID string) ([]*stt.StreamingResponse, error) {
|
||||
ctx := context.Background()
|
||||
|
||||
// Добавляем авторизацию в контекст
|
||||
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Api-Key "+s.apiKey)
|
||||
|
||||
req := &stt.GetRecognitionRequest{
|
||||
OperationId: operationID,
|
||||
}
|
||||
|
||||
stream, err := s.sttClient.GetRecognition(ctx, req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get recognition stream: %w", err)
|
||||
}
|
||||
|
||||
var responses []*stt.StreamingResponse
|
||||
for {
|
||||
resp, err := stream.Recv()
|
||||
if err != nil {
|
||||
if err.Error() == "EOF" {
|
||||
break
|
||||
}
|
||||
return nil, fmt.Errorf("failed to receive recognition response: %w", err)
|
||||
}
|
||||
responses = append(responses, resp)
|
||||
}
|
||||
|
||||
return responses, nil
|
||||
}
|
||||
|
||||
// CheckOperationStatus проверяет статус операции распознавания
|
||||
func (s *SpeechKitService) CheckOperationStatus(operationID string) (*operation.Operation, error) {
|
||||
ctx := context.Background()
|
||||
|
||||
op, err := s.opClient.Get(ctx, &operation.GetOperationRequest{
|
||||
OperationId: operationID,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operation status: %w", err)
|
||||
}
|
||||
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// ExtractTranscriptionText извлекает текст из результатов распознавания
|
||||
func ExtractTranscriptionText(responses []*stt.StreamingResponse) string {
|
||||
var fullText string
|
||||
|
||||
for _, resp := range responses {
|
||||
if final := resp.GetFinal(); final != nil {
|
||||
for _, alt := range final.Alternatives {
|
||||
fullText += alt.Text + " "
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return fullText
|
||||
}
|
Reference in New Issue
Block a user