163 lines
4.0 KiB
Go
163 lines
4.0 KiB
Go
package whisper
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os/exec"
|
|
)
|
|
|
|
// ModelSize is the string identifier of a Whisper model variant. The value
// is what buildPythonCommand passes to whisper.load_model.
type ModelSize string

// Model identifiers that can be used as a ModelSize.
const (
	ModelTiny   ModelSize = "tiny"
	ModelBase   ModelSize = "base"
	ModelSmall  ModelSize = "small"
	ModelMedium ModelSize = "medium"
	ModelLarge  ModelSize = "large"
	ModelTurbo  ModelSize = "turbo"
)
|
|
|
|
// TranscriptionResult contains the transcription output
|
|
type TranscriptionResult struct {
|
|
Text string `json:"text"`
|
|
Segments []Segment `json:"segments"`
|
|
Language string `json:"language"`
|
|
Duration float64 `json:"duration"`
|
|
}
|
|
|
|
// Segment represents a segment of transcription with timestamps
|
|
type Segment struct {
|
|
Start float64 `json:"start"`
|
|
End float64 `json:"end"`
|
|
Text string `json:"text"`
|
|
Words []Word `json:"words,omitempty"`
|
|
Speaker string `json:"speaker,omitempty"`
|
|
}
|
|
|
|
// Word is a single transcribed word together with its timing.
type Word struct {
	// Start and End delimit the word (presumably seconds from the start of
	// the audio; confirm against the Whisper output).
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	// Word is the text of the word.
	Word string `json:"word"`
}
|
|
|
|
// Client is the Whisper client that handles transcription
|
|
type Client struct {
|
|
ModelPath string
|
|
ModelSize ModelSize
|
|
}
|
|
|
|
// NewClient creates a new Whisper client
|
|
func NewClient(modelSize ModelSize) *Client {
|
|
return &Client{
|
|
ModelSize: modelSize,
|
|
}
|
|
}
|
|
|
|
// Transcribe processes an audio file and returns transcription
|
|
func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
|
|
if options == nil {
|
|
options = &TranscriptionOptions{}
|
|
}
|
|
|
|
// Build the Python command
|
|
cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
|
|
|
|
// Capture stdout and stderr
|
|
var out bytes.Buffer
|
|
var errBuf bytes.Buffer
|
|
cmd.Stdout = &out
|
|
cmd.Stderr = &errBuf
|
|
|
|
// Execute the command
|
|
err := cmd.Run()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("transcription failed: %v, stderr: %s", err, errBuf.String())
|
|
}
|
|
|
|
// Parse the JSON output
|
|
var result TranscriptionResult
|
|
err = json.Unmarshal(out.Bytes(), &result)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse transcription output: %v", err)
|
|
}
|
|
|
|
return &result, nil
|
|
}
|
|
|
|
// buildPythonCommand constructs the Python command for Whisper
|
|
func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
|
|
// Convert Go bool to Python bool string
|
|
verboseStr := "False"
|
|
if options.Verbose {
|
|
verboseStr = "True"
|
|
}
|
|
|
|
// Handle language option
|
|
langStr := "None"
|
|
if options.Language != "" && options.Language != "auto" {
|
|
langStr = fmt.Sprintf(`"%s"`, options.Language)
|
|
}
|
|
|
|
pythonCode := fmt.Sprintf(`
|
|
import whisper
|
|
import json
|
|
import sys
|
|
import os
|
|
import warnings
|
|
|
|
# Suppress warnings and stdout during transcription
|
|
warnings.filterwarnings("ignore")
|
|
old_stdout = sys.stdout
|
|
sys.stdout = open(os.devnull, 'w')
|
|
|
|
# Load model
|
|
model = whisper.load_model("%s")
|
|
|
|
# Transcribe
|
|
result = model.transcribe("%s",
|
|
language=%s,
|
|
verbose=%s,
|
|
temperature=%.1f,
|
|
best_of=%d)
|
|
|
|
# Restore stdout for JSON output
|
|
sys.stdout = old_stdout
|
|
|
|
# Output as JSON
|
|
print(json.dumps({
|
|
"text": result["text"],
|
|
"language": result.get("language", ""),
|
|
"duration": result.get("duration", 0.0),
|
|
"segments": [{
|
|
"start": seg["start"],
|
|
"end": seg["end"],
|
|
"text": seg["text"],
|
|
"words": seg.get("words", [])
|
|
} for seg in result.get("segments", [])]
|
|
}))
|
|
`, c.ModelSize, audioPath, langStr, verboseStr, options.Temperature, options.BestOf)
|
|
|
|
return pythonCode
|
|
}
|
|
|
|
// TranscriptionOptions tunes a single Transcribe call.
type TranscriptionOptions struct {
	// Language is a language code. An empty string or "auto" is rendered as
	// Python None in the generated script.
	Language string
	// Verbose is forwarded as Whisper's verbose argument (progress output).
	Verbose bool
	// Temperature is the sampling temperature (higher = more creative). It
	// is written into the script with one decimal place.
	Temperature float64
	// BestOf is the number of candidates when sampling with Temperature > 0.
	BestOf int
}
|
|
|
|
// DefaultTranscriptionOptions returns default transcription options
|
|
func DefaultTranscriptionOptions() *TranscriptionOptions {
|
|
return &TranscriptionOptions{
|
|
Language: "auto",
|
|
Verbose: false,
|
|
Temperature: 0.0,
|
|
BestOf: 5,
|
|
}
|
|
}
|