173 lines
5.2 KiB
Go
173 lines
5.2 KiB
Go
package cmd
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
|
|
"transcribe/internal/diarization"
|
|
"transcribe/internal/whisper"
|
|
"transcribe/pkg/audio"
|
|
"transcribe/pkg/output"
|
|
"transcribe/pkg/progress"
|
|
|
|
"github.com/spf13/cobra"
|
|
)
|
|
|
|
// Version is the version string reported by the root command; init assigns it
// to rootCmd.Version. It defaults to "dev".
// NOTE(review): presumably overridden at build time (e.g. via -ldflags -X) — confirm.
var Version = "dev"
|
|
|
|
// Command-line flag values. They are bound to rootCmd's flags in init and read
// by rootCmd's Run function.
var (
	// outputFile is the path given with --output/-o; empty when not set.
	outputFile string
	// outputFormat is the format name given with --format/-f.
	outputFormat string
	// diarize reports whether --diarize was passed.
	diarize bool
	// numSpeakers is the value of --speakers/-s; 0 requests auto-detection.
	numSpeakers int
	// modelSize is the Whisper model name given with --model/-m.
	modelSize string
	// noWrite reports whether --no-write was passed.
	noWrite bool
)
|
|
|
|
// rootCmd represents the base command when called without any subcommands
|
|
var rootCmd = &cobra.Command{
|
|
Use: "transcribe",
|
|
Short: "A CLI tool for transcribing audio files with speaker diarization",
|
|
Long: `Transcribe is a command-line tool that uses OpenAI's Whisper model to
|
|
transcribe audio files. It supports multiple output formats (text, SRT, JSON)
|
|
and speaker diarization using voice embeddings.
|
|
|
|
Output file (-o) is required unless --no-write is specified.
|
|
|
|
Output Formats:
|
|
srt SRT subtitle format (default)
|
|
text Plain text with timestamps
|
|
json JSON with full metadata
|
|
|
|
Whisper Models (--model, -m):
|
|
tiny Fastest, least accurate (default)
|
|
base Fast, basic accuracy
|
|
small Balanced speed/accuracy
|
|
medium Good accuracy, slower
|
|
large Best accuracy, slowest
|
|
turbo Optimized for speed
|
|
|
|
Examples:
|
|
# Basic transcription to SRT
|
|
transcribe audio.mp3 -o output.srt
|
|
|
|
# Use a larger model
|
|
transcribe audio.mp3 --model small -o output.srt
|
|
|
|
# Output as plain text
|
|
transcribe audio.mp3 --format text -o output.txt
|
|
|
|
# Enable speaker diarization
|
|
transcribe audio.mp3 --diarize -o output.srt
|
|
|
|
# Print to stdout instead of file
|
|
transcribe audio.mp3 --no-write
|
|
|
|
# Full example: diarization + specific model
|
|
transcribe audio.mp3 --model small --diarize -s 2 -o output.srt`,
|
|
Run: func(cmd *cobra.Command, args []string) {
|
|
if len(args) == 0 {
|
|
fmt.Println("Please provide audio files to transcribe")
|
|
_ = cmd.Help()
|
|
os.Exit(1)
|
|
}
|
|
|
|
// Require output file unless --no-write is set
|
|
if outputFile == "" && !noWrite {
|
|
fmt.Println("✗ Error: Output file required. Use -o <file> to specify output, or --no-write to print to stdout.")
|
|
os.Exit(1)
|
|
}
|
|
|
|
// Validate all provided files
|
|
for _, file := range args {
|
|
if _, err := os.Stat(file); os.IsNotExist(err) {
|
|
fmt.Printf("✗ Error: File '%s' does not exist\n", file)
|
|
os.Exit(1)
|
|
}
|
|
|
|
_, err := audio.NewAudioFile(file)
|
|
if err != nil {
|
|
fmt.Printf("✗ Error: File '%s' has unsupported format or error: %v\n", file, err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
// Create whisper client and transcribe
|
|
whisperClient := whisper.NewClient(whisper.ModelSize(modelSize))
|
|
whisperOptions := whisper.DefaultTranscriptionOptions()
|
|
|
|
// Create diarization client if needed
|
|
var diarizationClient *diarization.Client
|
|
var diarizationOptions *diarization.DiarizationOptions
|
|
if diarize {
|
|
diarizationClient = diarization.NewClient()
|
|
diarizationOptions = &diarization.DiarizationOptions{
|
|
NumSpeakers: numSpeakers,
|
|
}
|
|
}
|
|
|
|
// Create output formatter
|
|
formatter := output.NewFormatter(output.FormatType(outputFormat))
|
|
|
|
for _, file := range args {
|
|
// Transcription with spinner
|
|
spinner := progress.NewSpinner(fmt.Sprintf("Transcribing %s (model: %s)...", file, modelSize))
|
|
spinner.Start()
|
|
result, err := whisperClient.Transcribe(file, whisperOptions)
|
|
if err != nil {
|
|
spinner.StopWithMessage(fmt.Sprintf("✗ Error transcribing %s: %v", file, err))
|
|
continue
|
|
}
|
|
spinner.StopWithMessage(fmt.Sprintf("✓ Transcribed %s (%.1fs audio)", file, result.Duration))
|
|
|
|
// Run diarization if enabled
|
|
if diarize {
|
|
spinner := progress.NewSpinner("Detecting speakers...")
|
|
spinner.Start()
|
|
diarizationResult, err := diarizationClient.Diarize(file, diarizationOptions)
|
|
if err != nil {
|
|
spinner.StopWithMessage(fmt.Sprintf("✗ Diarization failed: %v", err))
|
|
} else {
|
|
spinner.StopWithMessage(fmt.Sprintf("✓ Detected %d speaker(s)", diarizationResult.NumSpeakers))
|
|
diarization.AlignSpeakers(result, diarizationResult)
|
|
}
|
|
}
|
|
|
|
// Format output
|
|
formattedOutput, err := formatter.Format(result)
|
|
if err != nil {
|
|
fmt.Printf("Error formatting output: %v\n", err)
|
|
continue
|
|
}
|
|
|
|
// Write to file or stdout
|
|
if outputFile != "" {
|
|
err := os.WriteFile(outputFile, []byte(formattedOutput), 0644)
|
|
if err != nil {
|
|
fmt.Printf("✗ Error writing output file: %v\n", err)
|
|
} else {
|
|
fmt.Printf("✓ Saved to %s\n", outputFile)
|
|
}
|
|
} else {
|
|
fmt.Printf("\n%s\n", formattedOutput)
|
|
}
|
|
}
|
|
},
|
|
}
|
|
|
|
func init() {
|
|
rootCmd.Version = Version
|
|
rootCmd.PersistentFlags().StringVarP(&outputFile, "output", "o", "", "Output file path (required)")
|
|
rootCmd.PersistentFlags().StringVarP(&outputFormat, "format", "f", "srt", "Output format: text, srt, json")
|
|
rootCmd.PersistentFlags().BoolVar(&diarize, "diarize", false, "Enable speaker diarization")
|
|
rootCmd.PersistentFlags().IntVarP(&numSpeakers, "speakers", "s", 0, "Number of speakers (0 = auto-detect)")
|
|
rootCmd.PersistentFlags().StringVarP(&modelSize, "model", "m", "tiny", "Whisper model: tiny, base, small, medium, large, turbo")
|
|
rootCmd.PersistentFlags().BoolVar(&noWrite, "no-write", false, "Print output to stdout instead of file")
|
|
}
|
|
|
|
// Execute adds all child commands to the root command and sets flags appropriately.
|
|
func Execute() {
|
|
if err := rootCmd.Execute(); err != nil {
|
|
fmt.Println(err)
|
|
os.Exit(1)
|
|
}
|
|
}
|