package cmd

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"transcribe/internal/diarization"
	"transcribe/internal/whisper"
	"transcribe/pkg/audio"
	"transcribe/pkg/output"
	"transcribe/pkg/progress"

	"github.com/spf13/cobra"
)

// Version is the version string assigned to rootCmd.Version in init.
// It defaults to "dev"; presumably release builds override it at link
// time — TODO confirm against the build scripts.
var Version = "dev"

// Flag values, bound to rootCmd's persistent flags in init.
var (
	outputFile   string // --output / -o: destination path for the transcript
	outputFormat string // --format / -f: passed to output.NewFormatter
	diarize      bool   // --diarize: run speaker diarization after transcription
	numSpeakers  int    // --speakers / -s: 0 means auto-detect (per the flag help)
	modelSize    string // --model / -m: passed to whisper.NewClient
	noWrite      bool   // --no-write: print to stdout instead of writing a file
)

// rootCmd represents the base command when called without any subcommands.
//
// Its Run function validates the arguments, transcribes every input file in
// order, optionally aligns speaker labels, and writes each formatted
// transcript either to a file or to stdout. Diagnostics go to stderr so that
// stdout carries only transcripts and status lines. The process exits with
// status 1 if validation fails or if any input could not be transcribed,
// formatted, or written.
var rootCmd = &cobra.Command{
	Use:   "transcribe",
	Short: "A CLI tool for transcribing audio files with speaker diarization",
	Long: `Transcribe is a command-line tool that uses OpenAI's Whisper model
to transcribe audio files. It supports multiple output formats (text, SRT, JSON)
and speaker diarization using voice embeddings.

Output file (-o) is required unless --no-write is specified.
When several input files are given, each transcript is written to its own
numbered file (output_1.srt, output_2.srt, ...).

Output Formats:
  srt     SRT subtitle format (default)
  text    Plain text with timestamps
  json    JSON with full metadata

Whisper Models (--model, -m):
  tiny    Fastest, least accurate (default)
  base    Fast, basic accuracy
  small   Balanced speed/accuracy
  medium  Good accuracy, slower
  large   Best accuracy, slowest
  turbo   Optimized for speed

Examples:
  # Basic transcription to SRT
  transcribe audio.mp3 -o output.srt

  # Use a larger model
  transcribe audio.mp3 --model small -o output.srt

  # Output as plain text
  transcribe audio.mp3 --format text -o output.txt

  # Enable speaker diarization
  transcribe audio.mp3 --diarize -o output.srt

  # Print to stdout instead of file
  transcribe audio.mp3 --no-write

  # Full example: diarization + specific model
  transcribe audio.mp3 --model small --diarize -s 2 -o output.srt`,
	Run: func(cmd *cobra.Command, args []string) {
		if len(args) == 0 {
			fmt.Fprintln(os.Stderr, "Please provide audio files to transcribe")
			// Best effort: a failure to render help must not mask the exit status.
			_ = cmd.Help()
			os.Exit(1)
		}

		// Require output file unless --no-write is set.
		if outputFile == "" && !noWrite {
			fail("Output file required. Use -o to specify output, or --no-write to print to stdout.")
		}

		// Validate every input up front so no work starts on a bad argument list.
		for _, file := range args {
			if _, err := os.Stat(file); err != nil {
				if errors.Is(err, fs.ErrNotExist) {
					fail("File '%s' does not exist", file)
				}
				// Any other stat failure (e.g. permission denied) is reported
				// instead of being silently passed on to the audio check.
				fail("File '%s' cannot be accessed: %v", file, err)
			}
			if _, err := audio.NewAudioFile(file); err != nil {
				fail("File '%s' has unsupported format or error: %v", file, err)
			}
		}

		whisperClient := whisper.NewClient(whisper.ModelSize(modelSize))
		whisperOptions := whisper.DefaultTranscriptionOptions()

		// The diarization client is only created when --diarize is set.
		var diarizationClient *diarization.Client
		var diarizationOptions *diarization.DiarizationOptions
		if diarize {
			diarizationClient = diarization.NewClient()
			diarizationOptions = &diarization.DiarizationOptions{
				NumSpeakers: numSpeakers,
			}
		}

		formatter := output.NewFormatter(output.FormatType(outputFormat))

		// --no-write wins over -o when both are given.
		writeToFile := outputFile != "" && !noWrite

		// failed records whether any input could not be fully processed, so the
		// exit status reflects partial failures even though the loop continues.
		failed := false

		for i, file := range args {
			spinner := progress.NewSpinner(fmt.Sprintf("Transcribing %s (model: %s)...", file, modelSize))
			spinner.Start()

			result, err := whisperClient.Transcribe(file, whisperOptions)
			if err != nil {
				spinner.StopWithMessage(fmt.Sprintf("✗ Error transcribing %s: %v", file, err))
				failed = true
				continue
			}
			spinner.StopWithMessage(fmt.Sprintf("✓ Transcribed %s (%.1fs audio)", file, result.Duration))

			if diarize {
				speakerSpinner := progress.NewSpinner("Detecting speakers...")
				speakerSpinner.Start()

				diarizationResult, err := diarizationClient.Diarize(file, diarizationOptions)
				if err != nil {
					// Best effort: the transcript is still emitted, just without
					// speaker labels, and this is not counted as a failure.
					speakerSpinner.StopWithMessage(fmt.Sprintf("✗ Diarization failed: %v", err))
				} else {
					speakerSpinner.StopWithMessage(fmt.Sprintf("✓ Detected %d speaker(s)", diarizationResult.NumSpeakers))
					diarization.AlignSpeakers(result, diarizationResult)
				}
			}

			formattedOutput, err := formatter.Format(result)
			if err != nil {
				fmt.Fprintf(os.Stderr, "Error formatting output: %v\n", err)
				failed = true
				continue
			}

			if !writeToFile {
				fmt.Printf("\n%s\n", formattedOutput)
				continue
			}

			// Each input gets its own path so later transcripts do not
			// overwrite earlier ones.
			target := outputPathFor(outputFile, i, len(args))
			if err := os.WriteFile(target, []byte(formattedOutput), 0644); err != nil {
				fmt.Fprintf(os.Stderr, "✗ Error writing output file: %v\n", err)
				failed = true
				continue
			}
			fmt.Printf("✓ Saved to %s\n", target)
		}

		if failed {
			os.Exit(1)
		}
	},
}

// fail prints a "✗ Error: ..." line built from format and a to stderr and
// terminates the process with exit status 1.
func fail(format string, a ...interface{}) {
	fmt.Fprintf(os.Stderr, "✗ Error: "+format+"\n", a...)
	os.Exit(1)
}

// outputPathFor returns the path that the transcript of the index-th input
// (0-based) out of total inputs is written to.
//
// With at most one input, base is returned unchanged. With several inputs a
// 1-based counter is inserted before the extension, e.g. "out.srt" becomes
// "out_1.srt", "out_2.srt", and so on.
func outputPathFor(base string, index, total int) string {
	if total <= 1 {
		return base
	}
	ext := filepath.Ext(base)
	return fmt.Sprintf("%s_%d%s", strings.TrimSuffix(base, ext), index+1, ext)
}

// init sets the command version and registers the persistent flags that
// populate the package-level flag variables.
func init() {
	rootCmd.Version = Version

	flags := rootCmd.PersistentFlags()
	flags.StringVarP(&outputFile, "output", "o", "", "Output file path (required unless --no-write)")
	flags.StringVarP(&outputFormat, "format", "f", "srt", "Output format: text, srt, json")
	flags.BoolVar(&diarize, "diarize", false, "Enable speaker diarization")
	flags.IntVarP(&numSpeakers, "speakers", "s", 0, "Number of speakers (0 = auto-detect)")
	flags.StringVarP(&modelSize, "model", "m", "tiny", "Whisper model: tiny, base, small, medium, large, turbo")
	flags.BoolVar(&noWrite, "no-write", false, "Print output to stdout instead of file")
}

// Execute runs the root command and exits the process with status 1 if it
// returns an error.
func Execute() {
	if err := rootCmd.Execute(); err != nil {
		// cobra prints the error itself unless SilenceErrors is set, so it is
		// only printed here in that case to avoid reporting it twice.
		if rootCmd.SilenceErrors {
			fmt.Fprintln(os.Stderr, err)
		}
		os.Exit(1)
	}
}