// Package whisper transcribes audio files by running the Python "whisper"
// module in a python3 subprocess and decoding the JSON it prints.
package whisper

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"os/exec"
	"strconv"
)

// ModelSize represents the different Whisper model sizes.
type ModelSize string

// Model names that are handed to whisper.load_model.
const (
	ModelTiny   ModelSize = "tiny"
	ModelBase   ModelSize = "base"
	ModelSmall  ModelSize = "small"
	ModelMedium ModelSize = "medium"
	ModelLarge  ModelSize = "large"
	ModelTurbo  ModelSize = "turbo"
)

// TranscriptionResult contains the transcription output decoded from the
// JSON document printed by the Python helper script.
type TranscriptionResult struct {
	// Text is the value of the result's "text" entry.
	Text string `json:"text"`
	// Segments holds one entry per element of the result's "segments" list.
	Segments []Segment `json:"segments"`
	// Language is the result's "language" entry, or "" when it is absent.
	Language string `json:"language"`
	// Duration is the result's "duration" entry, or 0 when it is absent.
	Duration float64 `json:"duration"`
}

// Segment represents a segment of transcription with timestamps.
type Segment struct {
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	Text  string  `json:"text"`
	// Words is filled from the segment's "words" entry; the script emits an
	// empty list when the segment has none.
	Words []Word `json:"words,omitempty"`
	// Speaker is never emitted by the script built in this file, so it stays
	// empty unless it is set by other code.
	Speaker string `json:"speaker,omitempty"`
}

// Word represents a word with timestamp.
type Word struct {
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	Word  string  `json:"word"`
}

// Client is the Whisper client that handles transcription.
type Client struct {
	// ModelPath is not read by any code in this file.
	ModelPath string
	// ModelSize is the model name passed to whisper.load_model.
	ModelSize ModelSize
}

// NewClient creates a new Whisper client that uses the given model size.
func NewClient(modelSize ModelSize) *Client {
	return &Client{
		ModelSize: modelSize,
	}
}

// Transcribe processes an audio file and returns its transcription.
//
// It runs "python3 -c <script>" with the script produced by
// buildPythonCommand, captures stdout and stderr, and decodes stdout as JSON
// into a TranscriptionResult. A nil options value is replaced by
// DefaultTranscriptionOptions.
//
// An error is returned when audioPath is empty, when the subprocess cannot be
// run or exits unsuccessfully (the captured stderr is included in the
// message), or when stdout is not valid JSON for the result type. Underlying
// errors are wrapped so callers can inspect them with errors.Is / errors.As.
func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
	// Reject an empty path here instead of paying for an interpreter start
	// that can only fail.
	if audioPath == "" {
		return nil, errors.New("transcription failed: audio path is empty")
	}
	if options == nil {
		options = DefaultTranscriptionOptions()
	}

	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("transcription failed: %w, stderr: %s", err, stderr.String())
	}

	var result TranscriptionResult
	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
		return nil, fmt.Errorf("failed to parse transcription output: %w", err)
	}
	return &result, nil
}

// buildPythonCommand constructs the Python script that loads the model,
// transcribes audioPath and prints the result as a single JSON document.
//
// Every Go string that ends up inside the script (model name, audio path,
// language) is written as an escaped ASCII string literal, so quotes,
// backslashes, newlines and non-ASCII characters in those values cannot
// terminate the literal or alter the script. An empty or "auto" language is
// emitted as None. The temperature is written with full precision, and a
// non-positive BestOf is emitted as None instead of a number.
func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
	verbose := "False"
	if options.Verbose {
		verbose = "True"
	}

	language := "None"
	if options.Language != "" && options.Language != "auto" {
		language = pyString(options.Language)
	}

	bestOf := "None"
	if options.BestOf > 0 {
		bestOf = strconv.Itoa(options.BestOf)
	}

	// 'f' with precision -1 yields the shortest decimal that round-trips, so
	// a value such as 0.25 is no longer rounded to one decimal place.
	temperature := strconv.FormatFloat(options.Temperature, 'f', -1, 64)

	return fmt.Sprintf(`
import whisper
import json
import sys
import os
import warnings

# Suppress warnings and stdout during transcription
warnings.filterwarnings("ignore")
old_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')

# Load model
model = whisper.load_model(%s)

# Transcribe
result = model.transcribe(%s, language=%s, verbose=%s, temperature=%s, best_of=%s)

# Restore stdout for JSON output
sys.stdout = old_stdout

# Output as JSON
print(json.dumps({
    "text": result["text"],
    "language": result.get("language", ""),
    "duration": result.get("duration", 0.0),
    "segments": [{
        "start": seg["start"],
        "end": seg["end"],
        "text": seg["text"],
        "words": seg.get("words", [])
    } for seg in result.get("segments", [])]
}))
`, pyString(string(c.ModelSize)), pyString(audioPath), language, verbose, temperature, bestOf)
}

// pyString renders s as a double-quoted, ASCII-only string literal. The
// escapes strconv.QuoteToASCII produces (\", \\, \n, \xNN, \uNNNN,
// \UNNNNNNNN, ...) are also escape sequences of Python string literals.
func pyString(s string) string {
	return strconv.QuoteToASCII(s)
}

// TranscriptionOptions contains options for transcription.
type TranscriptionOptions struct {
	// Language is a language code; "" or "auto" passes language=None.
	Language string
	// Verbose is forwarded as the verbose argument of model.transcribe. The
	// script redirects Python's stdout to the null device while the model
	// runs.
	Verbose bool
	// Temperature is the sampling temperature forwarded to model.transcribe.
	Temperature float64
	// BestOf is the number of candidates when sampling with temperature > 0.
	// Values <= 0 are forwarded as None.
	BestOf int
}

// DefaultTranscriptionOptions returns default transcription options:
// automatic language, non-verbose, temperature 0 and best-of 5.
func DefaultTranscriptionOptions() *TranscriptionOptions {
	return &TranscriptionOptions{
		Language:    "auto",
		Verbose:     false,
		Temperature: 0.0,
		BestOf:      5,
	}
}