feat: git init

2026-01-17 19:18:58 -06:00
commit b73d5b8078
18 changed files with 1274 additions and 0 deletions
--- a/internal/whisper/client.go
+++ b/internal/whisper/client.go
@@ -0,0 +1,162 @@
+package whisper
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+)
+
+// ModelSize names a Whisper model variant. buildPythonCommand interpolates the
+// value into the whisper.load_model(...) call of the generated Python script,
+// so it must be a model name that the installed whisper package accepts.
+type ModelSize string
+
+// Model sizes predefined by this package.
+const (
+	ModelTiny   ModelSize = "tiny"
+	ModelBase   ModelSize = "base"
+	ModelSmall  ModelSize = "small"
+	ModelMedium ModelSize = "medium"
+	ModelLarge  ModelSize = "large"
+	ModelTurbo  ModelSize = "turbo"
+)
+
+// TranscriptionResult is the decoded form of the JSON document printed by the
+// Python script that buildPythonCommand generates.
+//
+// Text, Language and Segments are copied from the Whisper result. Duration is
+// read from the result's "duration" key and is 0 when Whisper does not
+// provide that key (the script uses result.get("duration", 0.0)).
+type TranscriptionResult struct {
+	Text     string    `json:"text"`
+	Segments []Segment `json:"segments"`
+	Language string    `json:"language"`
+	Duration float64   `json:"duration"`
+}
+
+// Segment is one entry of TranscriptionResult.Segments: a piece of transcribed
+// text together with its start and end timestamps (presumably in seconds, as
+// reported by Whisper; confirm against the whisper package).
+//
+// Words is filled from the segment's "words" entry when Whisper provides one
+// and is empty otherwise. Speaker is never emitted by the script generated in
+// this file, so it stays empty unless some other code sets it.
+type Segment struct {
+	Start   float64 `json:"start"`
+	End     float64 `json:"end"`
+	Text    string  `json:"text"`
+	Words   []Word  `json:"words,omitempty"`
+	Speaker string  `json:"speaker,omitempty"`
+}
+
+// Word is a single word of a Segment with its own start and end timestamps
+// (presumably in the same unit as Segment.Start and Segment.End; confirm
+// against the whisper package).
+type Word struct {
+	Start float64 `json:"start"`
+	End   float64 `json:"end"`
+	Word  string  `json:"word"`
+}
+
+// Client transcribes audio by running the Python whisper package in a
+// python3 subprocess (see Transcribe).
+//
+// ModelSize selects the model the subprocess loads. ModelPath is not set by
+// NewClient and is not read by any method visible in this file.
+// NOTE(review): confirm whether ModelPath is used elsewhere or is a
+// placeholder.
+type Client struct {
+	ModelPath string
+	ModelSize ModelSize
+}
+
+// NewClient returns a Client configured to load the given Whisper model size.
+// ModelPath is left at its zero value.
+func NewClient(modelSize ModelSize) *Client {
+	client := new(Client)
+	client.ModelSize = modelSize
+	return client
+}
+
+// Transcribe runs Whisper on the audio file at audioPath and returns the
+// parsed transcription.
+//
+// It executes "python3 -c <script>", where the script is produced by
+// buildPythonCommand, and decodes the JSON document the script prints on
+// stdout. When options is nil, DefaultTranscriptionOptions() is used so the
+// nil case matches the package's own defaults.
+//
+// An error is returned when:
+//   - audioPath is empty (no subprocess is started);
+//   - python3 cannot be started or exits with a failure; the underlying error
+//     is wrapped, so errors.As can recover an *exec.ExitError, and the
+//     subprocess's stderr is appended to the message;
+//   - stdout is not valid JSON for TranscriptionResult (the decode error is
+//     wrapped).
+func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
+	if audioPath == "" {
+		return nil, fmt.Errorf("transcription failed: empty audio path")
+	}
+	if options == nil {
+		options = DefaultTranscriptionOptions()
+	}
+
+	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
+
+	// Keep the two streams apart: stdout carries only the JSON result, while
+	// stderr carries Python tracebacks and library diagnostics.
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return nil, fmt.Errorf("transcription failed: %w, stderr: %s", err, stderr.String())
+	}
+
+	var result TranscriptionResult
+	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
+		return nil, fmt.Errorf("failed to parse transcription output: %w", err)
+	}
+
+	return &result, nil
+}
+
+// buildPythonCommand returns the Python source that Transcribe passes to
+// "python3 -c".
+//
+// The script loads the Whisper model named by c.ModelSize, transcribes
+// audioPath with the given options, and prints one JSON object with the keys
+// "text", "language", "duration" and "segments" (each segment carrying
+// "start", "end", "text" and "words"). While the model runs, sys.stdout is
+// pointed at os.devnull so that nothing but the final JSON reaches stdout.
+//
+// Option mapping:
+//   - Language "" or "auto" becomes Python None; any other value is passed
+//     as a string.
+//   - Verbose becomes Python True/False.
+//   - Temperature is passed at full precision via float("..."), which also
+//     accepts Go's spellings of NaN and infinities.
+//   - BestOf is passed unchanged as best_of.
+//
+// Every string value (model name, audio path, language) is emitted as an
+// escaped Python string literal, so quotes, backslashes or newlines in them
+// cannot terminate the literal or inject Python code. options must be
+// non-nil.
+func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
+	verbose := "False"
+	if options.Verbose {
+		verbose = "True"
+	}
+
+	language := "None"
+	if options.Language != "" && options.Language != "auto" {
+		language = pyStringLiteral(options.Language)
+	}
+
+	return fmt.Sprintf(`
+import whisper
+import json
+import sys
+import os
+import warnings
+
+# Suppress warnings and stdout during transcription
+warnings.filterwarnings("ignore")
+old_stdout = sys.stdout
+sys.stdout = open(os.devnull, 'w')
+
+# Load model
+model = whisper.load_model(%s)
+
+# Transcribe
+result = model.transcribe(%s,
+    language=%s,
+    verbose=%s,
+    temperature=float("%g"),
+    best_of=%d)
+
+# Restore stdout for JSON output
+sys.stdout = old_stdout
+
+# Output as JSON
+print(json.dumps({
+    "text": result["text"],
+    "language": result.get("language", ""),
+    "duration": result.get("duration", 0.0),
+    "segments": [{
+        "start": seg["start"],
+        "end": seg["end"],
+        "text": seg["text"],
+        "words": seg.get("words", [])
+    } for seg in result.get("segments", [])]
+}))
+`, pyStringLiteral(string(c.ModelSize)), pyStringLiteral(audioPath), language, verbose, options.Temperature, options.BestOf)
+}
+
+// pyStringLiteral renders s as a double-quoted Python string literal.
+//
+// It reuses JSON string encoding: the escapes encoding/json produces (\",
+// \\, \n, \r, \t, \b, \f and \uXXXX) are all valid escapes in a Python str
+// literal with the same meaning. Bytes that are not valid UTF-8 are replaced
+// with U+FFFD by the encoder.
+func pyStringLiteral(s string) string {
+	quoted, err := json.Marshal(s)
+	if err != nil {
+		// Not expected for a plain string; Go quoting is still a safely
+		// escaped double-quoted literal.
+		return fmt.Sprintf("%q", s)
+	}
+	return string(quoted)
+}
+
+// TranscriptionOptions holds the per-call settings that buildPythonCommand
+// forwards to Whisper's model.transcribe call. See
+// DefaultTranscriptionOptions for the package defaults.
+type TranscriptionOptions struct {
+	Language    string  // Language code; "" or "auto" is sent to Whisper as None
+	Verbose     bool    // Forwarded as Whisper's verbose=True/False
+	Temperature float64 // Sampling temperature forwarded to Whisper (higher = more creative)
+	BestOf      int     // Number of candidates when sampling with temperature > 0
+}
+
+// DefaultTranscriptionOptions returns a freshly allocated option set holding
+// the package defaults: Language "auto", BestOf 5, and the zero values for
+// Verbose (false) and Temperature (0.0). Callers may modify the result
+// freely.
+func DefaultTranscriptionOptions() *TranscriptionOptions {
+	defaults := TranscriptionOptions{
+		Language: "auto",
+		BestOf:   5,
+	}
+	return &defaults
+}