feat: git init
This commit is contained in:
162
internal/whisper/client.go
Normal file
162
internal/whisper/client.go
Normal file
@@ -0,0 +1,162 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// ModelSize names a Whisper model variant (for example "tiny" or "large").
// The client passes its string value to whisper.load_model in the Python
// script it generates.
type ModelSize string
|
||||
|
||||
const (
|
||||
ModelTiny ModelSize = "tiny"
|
||||
ModelBase ModelSize = "base"
|
||||
ModelSmall ModelSize = "small"
|
||||
ModelMedium ModelSize = "medium"
|
||||
ModelLarge ModelSize = "large"
|
||||
ModelTurbo ModelSize = "turbo"
|
||||
)
|
||||
|
||||
// TranscriptionResult contains the transcription output
|
||||
type TranscriptionResult struct {
|
||||
Text string `json:"text"`
|
||||
Segments []Segment `json:"segments"`
|
||||
Language string `json:"language"`
|
||||
Duration float64 `json:"duration"`
|
||||
}
|
||||
|
||||
// Segment represents a segment of transcription with timestamps
|
||||
type Segment struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Words []Word `json:"words,omitempty"`
|
||||
Speaker string `json:"speaker,omitempty"`
|
||||
}
|
||||
|
||||
// Word is a single recognised word together with its timing.
type Word struct {
	// Start and End are the word boundaries as reported by Whisper
	// (presumably seconds from the start of the audio; confirm against the
	// Whisper version in use).
	Start float64 `json:"start"`
	End   float64 `json:"end"`

	// Word is the recognised text of the word.
	Word string `json:"word"`
}
|
||||
|
||||
// Client is the Whisper client that handles transcription
|
||||
type Client struct {
|
||||
ModelPath string
|
||||
ModelSize ModelSize
|
||||
}
|
||||
|
||||
// NewClient creates a new Whisper client
|
||||
func NewClient(modelSize ModelSize) *Client {
|
||||
return &Client{
|
||||
ModelSize: modelSize,
|
||||
}
|
||||
}
|
||||
|
||||
// Transcribe processes an audio file and returns transcription
|
||||
func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
|
||||
if options == nil {
|
||||
options = &TranscriptionOptions{}
|
||||
}
|
||||
|
||||
// Build the Python command
|
||||
cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
|
||||
|
||||
// Capture stdout and stderr
|
||||
var out bytes.Buffer
|
||||
var errBuf bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
cmd.Stderr = &errBuf
|
||||
|
||||
// Execute the command
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("transcription failed: %v, stderr: %s", err, errBuf.String())
|
||||
}
|
||||
|
||||
// Parse the JSON output
|
||||
var result TranscriptionResult
|
||||
err = json.Unmarshal(out.Bytes(), &result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse transcription output: %v", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// buildPythonCommand constructs the Python command for Whisper
|
||||
func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
|
||||
// Convert Go bool to Python bool string
|
||||
verboseStr := "False"
|
||||
if options.Verbose {
|
||||
verboseStr = "True"
|
||||
}
|
||||
|
||||
// Handle language option
|
||||
langStr := "None"
|
||||
if options.Language != "" && options.Language != "auto" {
|
||||
langStr = fmt.Sprintf(`"%s"`, options.Language)
|
||||
}
|
||||
|
||||
pythonCode := fmt.Sprintf(`
|
||||
import whisper
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import warnings
|
||||
|
||||
# Suppress warnings and stdout during transcription
|
||||
warnings.filterwarnings("ignore")
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = open(os.devnull, 'w')
|
||||
|
||||
# Load model
|
||||
model = whisper.load_model("%s")
|
||||
|
||||
# Transcribe
|
||||
result = model.transcribe("%s",
|
||||
language=%s,
|
||||
verbose=%s,
|
||||
temperature=%.1f,
|
||||
best_of=%d)
|
||||
|
||||
# Restore stdout for JSON output
|
||||
sys.stdout = old_stdout
|
||||
|
||||
# Output as JSON
|
||||
print(json.dumps({
|
||||
"text": result["text"],
|
||||
"language": result.get("language", ""),
|
||||
"duration": result.get("duration", 0.0),
|
||||
"segments": [{
|
||||
"start": seg["start"],
|
||||
"end": seg["end"],
|
||||
"text": seg["text"],
|
||||
"words": seg.get("words", [])
|
||||
} for seg in result.get("segments", [])]
|
||||
}))
|
||||
`, c.ModelSize, audioPath, langStr, verboseStr, options.Temperature, options.BestOf)
|
||||
|
||||
return pythonCode
|
||||
}
|
||||
|
||||
// TranscriptionOptions tunes a single Transcribe call. Each field is forwarded
// to model.transcribe in the generated Python script.
type TranscriptionOptions struct {
	// Language is a language code. An empty string or "auto" is passed to
	// Whisper as None.
	Language string

	// Verbose is forwarded as Whisper's verbose argument (True or False).
	Verbose bool

	// Temperature is the sampling temperature. It is written into the
	// script with one decimal place.
	Temperature float64

	// BestOf is forwarded as Whisper's best_of argument: the number of
	// candidates when sampling with a temperature above zero.
	BestOf int
}
|
||||
|
||||
// DefaultTranscriptionOptions returns default transcription options
|
||||
func DefaultTranscriptionOptions() *TranscriptionOptions {
|
||||
return &TranscriptionOptions{
|
||||
Language: "auto",
|
||||
Verbose: false,
|
||||
Temperature: 0.0,
|
||||
BestOf: 5,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user