// transcribe/internal/whisper/client.go

package whisper

import (
	"bytes"
	"encoding/json"
	"fmt"
	"os/exec"
)

// ModelSize names a Whisper model variant. buildPythonCommand interpolates the
// value into the whisper.load_model call of the generated script, so it has to
// be a model name that call accepts.
type ModelSize string

// Model names predeclared by this package. Each string is handed to
// whisper.load_model as written.
const (
	ModelTiny   ModelSize = "tiny"
	ModelBase   ModelSize = "base"
	ModelSmall  ModelSize = "small"
	ModelMedium ModelSize = "medium"
	ModelLarge  ModelSize = "large"
	ModelTurbo  ModelSize = "turbo"
)

// TranscriptionResult is the decoded form of the JSON object that the
// generated Python script prints and that Transcribe unmarshals.
//
// Text, Language and Segments are taken from the keys of the same name in the
// Whisper result. Duration is read with result.get("duration", 0.0), so it is
// 0 whenever the Whisper result carries no "duration" key.
type TranscriptionResult struct {
	Text     string    `json:"text"`
	Segments []Segment `json:"segments"`
	Language string    `json:"language"`
	Duration float64   `json:"duration"`
}

// Segment is one timestamped span of the transcription.
//
// Start, End and Text are copied unchanged from the Whisper segment; Start and
// End are presumably seconds from the beginning of the audio (confirm against
// the Whisper documentation). Words holds the segment's "words" entry and is
// empty when Whisper did not supply one. Speaker is never emitted by the
// script built in buildPythonCommand, so it stays empty for results produced
// by Transcribe.
type Segment struct {
	Start   float64 `json:"start"`
	End     float64 `json:"end"`
	Text    string  `json:"text"`
	Words   []Word  `json:"words,omitempty"`
	Speaker string  `json:"speaker,omitempty"`
}

// Word is a single word together with its time span, decoded from an element
// of a segment's "words" list. Start and End are presumably expressed in
// seconds; confirm against the Whisper documentation.
type Word struct {
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	Word  string  `json:"word"`
}

// Client transcribes audio by running the Python whisper package in a python3
// subprocess (see Transcribe).
//
// ModelSize selects the model name passed to whisper.load_model. ModelPath is
// not read by any code visible in this file.
type Client struct {
	ModelPath string
	ModelSize ModelSize
}

// NewClient allocates a Client that will load the given model size.
// ModelPath is left at its zero value.
func NewClient(modelSize ModelSize) *Client {
	client := new(Client)
	client.ModelSize = modelSize
	return client
}

// Transcribe runs Whisper on the audio file at audioPath and returns the
// decoded result.
//
// The work is done by a python3 subprocess executing the script produced by
// buildPythonCommand, so python3 must be on PATH and able to import the
// whisper package. The call blocks until the subprocess exits.
//
// A nil options value is replaced by DefaultTranscriptionOptions.
//
// An error is returned when audioPath is empty, when the subprocess cannot be
// started or exits unsuccessfully (the message then includes its stderr), or
// when its stdout is not the expected JSON document. Subprocess and JSON
// errors are wrapped, so errors.Is and errors.As can reach the underlying
// cause (for example *exec.ExitError).
func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
	// Fail before paying for interpreter start-up and model loading.
	if audioPath == "" {
		return nil, fmt.Errorf("transcription failed: audio path is empty")
	}
	if options == nil {
		options = DefaultTranscriptionOptions()
	}

	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))

	// stdout carries the JSON payload; stderr is kept only for diagnostics.
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("transcription failed: %w, stderr: %s", err, stderr.String())
	}

	var result TranscriptionResult
	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
		return nil, fmt.Errorf("failed to parse transcription output: %w", err)
	}

	return &result, nil
}

// buildPythonCommand renders the Python script that is passed to python3 -c.
//
// The script loads the model named by c.ModelSize, transcribes audioPath with
// the given options and prints a single JSON object (text, language, duration,
// segments) on stdout. While Whisper runs, sys.stdout is pointed at os.devnull
// so that nothing but the final JSON reaches the real stdout.
//
// Option mapping:
//   - Language "" or "auto" becomes language=None; anything else is passed as
//     a string.
//   - Verbose becomes the Python literal True or False.
//   - Temperature is passed at full precision.
//   - BestOf is passed as an integer.
//
// Every Go string is embedded through pythonStringLiteral, so quotes,
// backslashes and newlines in a path or language code cannot terminate the
// literal or inject code. A nil options value is treated as the zero value.
func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
	if options == nil {
		options = &TranscriptionOptions{}
	}

	verbose := "False"
	if options.Verbose {
		verbose = "True"
	}

	language := "None"
	if options.Language != "" && options.Language != "auto" {
		language = pythonStringLiteral(options.Language)
	}

	// The temperature travels as float("<shortest decimal>"): %g keeps every
	// significant digit, and float() always yields a Python float, even for
	// values such as NaN that have no literal form.
	return fmt.Sprintf(`
import whisper
import json
import sys
import os
import warnings

# Suppress warnings and stdout during transcription
warnings.filterwarnings("ignore")
old_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')

# Load model
model = whisper.load_model(%s)

# Transcribe
result = model.transcribe(%s,
    language=%s,
    verbose=%s,
    temperature=float("%g"),
    best_of=%d)

# Restore stdout for JSON output
sys.stdout = old_stdout

# Output as JSON
print(json.dumps({
    "text": result["text"],
    "language": result.get("language", ""),
    "duration": result.get("duration", 0.0),
    "segments": [{
        "start": seg["start"],
        "end": seg["end"],
        "text": seg["text"],
        "words": seg.get("words", [])
    } for seg in result.get("segments", [])]
}))
`,
		pythonStringLiteral(string(c.ModelSize)),
		pythonStringLiteral(audioPath),
		language,
		verbose,
		options.Temperature,
		options.BestOf,
	)
}

// pythonStringLiteral renders s as a double-quoted, ASCII-only Python string
// literal. The escapes Go emits for %+q (\\, \", \n, \xhh, \uXXXX, \UXXXXXXXX
// and friends) are all understood by Python. A byte that is not valid UTF-8
// comes out as \xhh, which Python reads as the code point of that value rather
// than as the raw byte.
func pythonStringLiteral(s string) string {
	return fmt.Sprintf("%+q", s)
}

// TranscriptionOptions carries the per-call settings that buildPythonCommand
// forwards as keyword arguments to Whisper's model.transcribe.
type TranscriptionOptions struct {
	Language    string  // Language code; "" or "auto" is sent as language=None
	Verbose     bool    // Sent as verbose=True/False; NOTE(review): confirm what Whisper prints for each value
	Temperature float64 // Sampling temperature, sent as the temperature argument
	BestOf      int     // Number of candidates when sampling with temperature > 0, sent as best_of
}

// DefaultTranscriptionOptions returns a freshly allocated options value with
// the package defaults: automatic language selection, non-verbose output,
// temperature 0.0 and five candidates.
func DefaultTranscriptionOptions() *TranscriptionOptions {
	defaults := new(TranscriptionOptions)
	defaults.Language = "auto"
	defaults.BestOf = 5
	// Verbose and Temperature keep their zero values (false and 0.0).
	return defaults
}