2023-11-13 21:40:16 +00:00
|
|
|
package main
|
2023-05-09 09:43:50 +00:00
|
|
|
|
|
|
|
import (
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"

	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
	"github.com/go-audio/wav"
	"github.com/go-skynet/LocalAI/core/schema"
)
|
|
|
|
|
2024-02-29 18:32:29 +00:00
|
|
|
// runCommand executes command[0] with command[1:] as its arguments, passing
// the current process environment to the child. It returns the child's
// combined stdout and stderr together with any error from running it.
//
// An empty command is reported as an error rather than panicking on the
// command[0] index.
func runCommand(command []string) (string, error) {
	if len(command) == 0 {
		return "", fmt.Errorf("runCommand: empty command")
	}

	cmd := exec.Command(command[0], command[1:]...)
	cmd.Env = os.Environ()

	out, err := cmd.CombinedOutput()
	return string(out), err
}
|
|
|
|
|
2024-02-29 18:32:29 +00:00
|
|
|
// AudioToWav converts audio to wav for transcribe.
|
2023-05-09 09:43:50 +00:00
|
|
|
// TODO: use https://github.com/mccoyst/ogg?
|
|
|
|
func audioToWav(src, dst string) error {
|
2024-02-29 18:32:29 +00:00
|
|
|
command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
|
|
|
|
out, err := runCommand(command)
|
2023-05-09 09:43:50 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error: %w out: %s", err, out)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2024-01-05 17:04:46 +00:00
|
|
|
func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.Result, error) {
|
|
|
|
res := schema.Result{}
|
2023-05-09 09:43:50 +00:00
|
|
|
|
|
|
|
dir, err := os.MkdirTemp("", "whisper")
|
|
|
|
if err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
defer os.RemoveAll(dir)
|
|
|
|
|
|
|
|
convertedPath := filepath.Join(dir, "converted.wav")
|
|
|
|
|
|
|
|
if err := audioToWav(audiopath, convertedPath); err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Open samples
|
|
|
|
fh, err := os.Open(convertedPath)
|
|
|
|
if err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
defer fh.Close()
|
|
|
|
|
|
|
|
// Read samples
|
|
|
|
d := wav.NewDecoder(fh)
|
|
|
|
buf, err := d.FullPCMBuffer()
|
|
|
|
if err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
data := buf.AsFloat32Buffer().Data
|
|
|
|
|
|
|
|
// Process samples
|
|
|
|
context, err := model.NewContext()
|
|
|
|
if err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2023-05-12 08:04:20 +00:00
|
|
|
context.SetThreads(threads)
|
|
|
|
|
2023-05-09 09:43:50 +00:00
|
|
|
if language != "" {
|
|
|
|
context.SetLanguage(language)
|
2023-05-12 08:04:20 +00:00
|
|
|
} else {
|
|
|
|
context.SetLanguage("auto")
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
|
2023-06-29 09:26:07 +00:00
|
|
|
if err := context.Process(data, nil, nil); err != nil {
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, err
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for {
|
2023-07-04 12:31:31 +00:00
|
|
|
s, err := context.NextSegment()
|
2023-05-09 09:43:50 +00:00
|
|
|
if err != nil {
|
|
|
|
break
|
|
|
|
}
|
2023-07-04 12:31:31 +00:00
|
|
|
|
|
|
|
var tokens []int
|
2023-07-14 23:19:43 +00:00
|
|
|
for _, t := range s.Tokens {
|
2023-07-04 12:31:31 +00:00
|
|
|
tokens = append(tokens, t.Id)
|
|
|
|
}
|
|
|
|
|
2024-01-05 17:04:46 +00:00
|
|
|
segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
|
2023-07-04 12:31:31 +00:00
|
|
|
res.Segments = append(res.Segments, segment)
|
|
|
|
|
|
|
|
res.Text += s.Text
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|
|
|
|
|
2023-07-04 12:31:31 +00:00
|
|
|
return res, nil
|
2023-05-09 09:43:50 +00:00
|
|
|
}
|