feat: git init

2026-01-17 19:18:58 -06:00
commit b73d5b8078
18 changed files with 1274 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 transcribe
 test/
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1,93 @@
 # Transcribe Tool
 Audio transcription CLI using OpenAI Whisper with speaker diarization.
 ## Quick Reference
 ```bash
 # Basic transcription (SRT output)
 ./transcribe audio.mp3 -o output.srt
 # With speaker diarization
 ./transcribe audio.mp3 --diarize -o output.srt
 # Specify model and speakers
 ./transcribe audio.mp3 --model small --diarize -s 2 -o output.srt
 # Print to stdout
 ./transcribe audio.mp3 --no-write
 ```
 ## Flags
 | Flag | Short | Description | Default |
 |------|-------|-------------|---------|
 | `--output` | `-o` | Output file path | **required** |
 | `--format` | `-f` | `srt`, `text`, `json` | `srt` |
 | `--model` | `-m` | `tiny`, `base`, `small`, `medium`, `large`, `turbo` | `tiny` |
 | `--diarize` | | Enable speaker detection | off |
 | `--speakers` | `-s` | Number of speakers (0=auto) | `0` |
 | `--no-write` | | Print to stdout instead of file | off |
 ## Common Tasks
 **Transcribe a meeting recording:**
 ```bash
 ./transcribe meeting.wav --model small -o meeting.srt
 ```
 **Transcribe interview with 2 speakers:**
 ```bash
 ./transcribe interview.mp3 --model small --diarize -s 2 -o interview.srt
 ```
 **Get JSON output for processing:**
 ```bash
 ./transcribe audio.mp3 --format json -o output.json
 ```
 **Quick preview (stdout):**
 ```bash
 ./transcribe audio.mp3 --no-write
 ```
 ## Output Formats
 **SRT (default):** Subtitle format with timestamps
 ```
 1
 00:00:00,000 --> 00:00:05,200
 [Speaker 1] Hello, how are you?
 ```
 **Text:** Plain text with timestamps
 ```
 [00:00.0 - 00:05.2] [Speaker 1] Hello, how are you?
 ```
 **JSON:** Full metadata including segments, words, duration
 ## Models
 - `tiny` - Fastest, use for quick drafts (default)
 - `base` - Fast, basic accuracy
 - `small` - Good balance of speed/accuracy
 - `medium` - Better accuracy, slower
 - `large` - Best accuracy, slowest
 - `turbo` - Optimized for speed
 ## Supported Formats
 MP3, WAV, FLAC, M4A, OGG, OPUS
 ## Build
 ```bash
 cd /home/yeho/Documents/tools/transcribe
 go build -o transcribe
 ```
 ## Dependencies
 ```bash
 pip install openai-whisper                      # Required
 pip install resemblyzer scikit-learn librosa    # For diarization
 ```
--- a/README.md
+++ b/README.md
@@ -0,0 +1,166 @@
 # Transcribe - Audio Transcription Tool
 A CLI tool for transcribing audio files using OpenAI's Whisper model with speaker diarization and multiple output formats.
 ## Features
 - Multiple Whisper model sizes (tiny, base, small, medium, large, turbo)
 - Speaker diarization using voice embeddings (resemblyzer + clustering)
 - Multiple output formats: SRT subtitles, plain text, JSON
 - Batch processing of multiple audio files (with `--no-write`; a single `-o` path cannot hold more than one transcript)
 - Automatic language detection
 - Progress indicators with spinners
 ## Installation
 ### Prerequisites
 - Go 1.25.4+ (the version declared in `go.mod`)
 - Python 3.8+
 - FFmpeg
 ### Python Dependencies
 ```bash
 # Required for transcription
 pip install openai-whisper
 # Required for speaker diarization
 pip install resemblyzer scikit-learn librosa
 ```
 Note: If `resemblyzer` fails to install due to `webrtcvad`, install Python development headers first:
 ```bash
 # Fedora/RHEL
 sudo dnf install python3-devel
 # Ubuntu/Debian
 sudo apt install python3-dev
 ```
 ### Build from Source
 ```bash
 go build -o transcribe
 ```
 ## Usage
 Output file (`-o`) is required unless `--no-write` is specified.
 ### Basic Transcription
 ```bash
 ./transcribe audio.mp3 -o output.srt
 ```
 ### Choose Whisper Model
 ```bash
 ./transcribe audio.mp3 --model small -o output.srt
 ```
 Available models: `tiny` (default), `base`, `small`, `medium`, `large`, `turbo`
 ### Output Formats
 **SRT subtitles (default):**
 ```bash
 ./transcribe audio.mp3 -o subtitles.srt
 ```
 **Plain text with timestamps:**
 ```bash
 ./transcribe audio.mp3 --format text -o output.txt
 ```
 **JSON:**
 ```bash
 ./transcribe audio.mp3 --format json -o output.json
 ```
 ### Speaker Diarization
 Enable automatic speaker detection:
 ```bash
 ./transcribe audio.mp3 --diarize -o output.srt
 ```
 Specify number of speakers for better accuracy:
 ```bash
 ./transcribe audio.mp3 --diarize --speakers 2 -o output.srt
 ```
 ### Print to stdout
 ```bash
 ./transcribe audio.mp3 --no-write
 ```
 ### Full Example
 Transcribe with speaker diarization:
 ```bash
 ./transcribe interview.wav --model small --diarize -s 2 -o interview.srt
 ```
 Output:
 ```
 1
 00:00:00,000 --> 00:00:05,200
 [Speaker 1] Hello, how are you?
 2
 00:00:05,200 --> 00:00:12,300
 [Speaker 2] I'm doing well, thanks!
 ```
 ## CLI Reference
 ```
 Usage:
  transcribe <audio files...> [flags]
 Flags:
      --diarize           Enable speaker diarization
  -f, --format string     Output format: srt, text, json (default "srt")
  -h, --help              help for transcribe
  -m, --model string      Whisper model: tiny, base, small, medium, large, turbo (default "tiny")
      --no-write          Print output to stdout instead of file
  -o, --output string     Output file path (required)
  -s, --speakers int      Number of speakers (0 = auto-detect)
 ```
 ## Supported Audio Formats
 MP3, WAV, FLAC, M4A, OGG, OPUS
 ## Architecture
 ```
 transcribe/
 ├── cmd/
 │   └── root.go              # CLI commands and flags
 ├── internal/
 │   ├── whisper/
 │   │   └── client.go        # Whisper Python bridge
 │   └── diarization/
 │       ├── client.go        # Diarization Python bridge
 │       └── align.go         # Speaker-segment alignment
 ├── pkg/
 │   ├── audio/
 │   │   └── audio.go         # Audio file validation
 │   ├── output/
 │   │   ├── formatter.go     # Output formatter interface
 │   │   ├── srt.go           # SRT format
 │   │   ├── text.go          # Text format
 │   │   └── json.go          # JSON format
 │   └── progress/
 │       └── spinner.go       # Progress spinner
 └── README.md
 ```
 ## How It Works
 1. **Transcription**: Audio is processed by Whisper (via Python subprocess) to generate timestamped text segments
 2. **Diarization** (optional): Voice embeddings are extracted using resemblyzer and clustered to identify speakers
 3. **Alignment**: Speaker segments are mapped to transcription segments by timestamp overlap
 4. **Formatting**: Results are formatted according to the selected output format (SRT by default)
 ## License
 MIT License - see LICENSE file for details.
--- a/1
+++ b/1
@@ -0,0 +1 @@
 0.1.0
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -0,0 +1,172 @@
 package cmd
 import (
 	"fmt"
 	"os"
 	"transcribe/internal/diarization"
 	"transcribe/internal/whisper"
 	"transcribe/pkg/audio"
 	"transcribe/pkg/output"
 	"transcribe/pkg/progress"
 	"github.com/spf13/cobra"
 )
 // Version is the version string assigned to rootCmd.Version in init. It
 // defaults to "dev" and stays a plain package-level string so install.sh can
 // override it at link time with -ldflags "-X transcribe/cmd.Version=...".
 var Version = "dev"
 // Values of the command-line flags; init binds each one to its flag.
 var (
 	outputFile   string // --output / -o
 	outputFormat string // --format / -f
 	diarize      bool   // --diarize
 	numSpeakers  int    // --speakers / -s
 	modelSize    string // --model / -m
 	noWrite      bool   // --no-write
 )
 // rootCmd is the only command of the CLI. Its Run function validates the
 // arguments and flags, transcribes every input file with Whisper, optionally
 // labels the segments with speakers, formats the result and either writes it
 // to the -o file or prints it to stdout.
 //
 // Validation failures exit with status 1 before any work starts. A failure
 // while processing one input is reported and the remaining inputs are still
 // processed; the process then exits with status 1. A diarization failure is
 // reported but is not fatal: the transcript is emitted without speaker labels.
 var rootCmd = &cobra.Command{
 	Use:   "transcribe",
 	Short: "A CLI tool for transcribing audio files with speaker diarization",
 	Long: `Transcribe is a command-line tool that uses OpenAI's Whisper model to
 transcribe audio files. It supports multiple output formats (text, SRT, JSON)
 and speaker diarization using voice embeddings.
 Output file (-o) is required unless --no-write is specified.
 Output Formats:
  srt     SRT subtitle format (default)
  text    Plain text with timestamps
  json    JSON with full metadata
 Whisper Models (--model, -m):
  tiny    Fastest, least accurate (default)
  base    Fast, basic accuracy
  small   Balanced speed/accuracy
  medium  Good accuracy, slower
  large   Best accuracy, slowest
  turbo   Optimized for speed
 Examples:
  # Basic transcription to SRT
  transcribe audio.mp3 -o output.srt
  # Use a larger model
  transcribe audio.mp3 --model small -o output.srt
  # Output as plain text
  transcribe audio.mp3 --format text -o output.txt
  # Enable speaker diarization
  transcribe audio.mp3 --diarize -o output.srt
  # Print to stdout instead of file
  transcribe audio.mp3 --no-write
  # Full example: diarization + specific model
  transcribe audio.mp3 --model small --diarize -s 2 -o output.srt`,
 	Run: func(cmd *cobra.Command, args []string) {
 		if len(args) == 0 {
 			fmt.Println("Please provide audio files to transcribe")
 			_ = cmd.Help()
 			os.Exit(1)
 		}
 		// Require output file unless --no-write is set
 		if outputFile == "" && !noWrite {
 			fmt.Println("✗ Error: Output file required. Use -o <file> to specify output, or --no-write to print to stdout.")
 			os.Exit(1)
 		}
 		// --no-write wins when it is combined with -o, as its help text
 		// ("instead of file") promises.
 		writeToFile := outputFile != "" && !noWrite
 		// Every input is written to the same path, so a second input would
 		// silently overwrite the first transcript.
 		if writeToFile && len(args) > 1 {
 			fmt.Println("✗ Error: Multiple input files cannot share one output file. Transcribe one file per -o, or use --no-write.")
 			os.Exit(1)
 		}
 		// output.NewFormatter falls back to plain text for unknown names;
 		// reject them here so a typo cannot silently change the format.
 		switch output.FormatType(outputFormat) {
 		case output.FormatSRT, output.FormatText, output.FormatJSON:
 		default:
 			fmt.Printf("✗ Error: Unknown output format '%s'. Use srt, text, or json.\n", outputFormat)
 			os.Exit(1)
 		}
 		// Validate all provided files
 		for _, file := range args {
 			if _, err := os.Stat(file); os.IsNotExist(err) {
 				fmt.Printf("✗ Error: File '%s' does not exist\n", file)
 				os.Exit(1)
 			}
 			_, err := audio.NewAudioFile(file)
 			if err != nil {
 				fmt.Printf("✗ Error: File '%s' has unsupported format or error: %v\n", file, err)
 				os.Exit(1)
 			}
 		}
 		// Create whisper client and transcribe
 		whisperClient := whisper.NewClient(whisper.ModelSize(modelSize))
 		whisperOptions := whisper.DefaultTranscriptionOptions()
 		// Create diarization client if needed
 		var diarizationClient *diarization.Client
 		var diarizationOptions *diarization.DiarizationOptions
 		if diarize {
 			diarizationClient = diarization.NewClient()
 			diarizationOptions = &diarization.DiarizationOptions{
 				NumSpeakers: numSpeakers,
 			}
 		}
 		// Create output formatter
 		formatter := output.NewFormatter(output.FormatType(outputFormat))
 		// failed records whether any input could not be fully processed so the
 		// exit status can reflect it after all inputs have been attempted.
 		failed := false
 		for _, file := range args {
 			// Transcription with spinner
 			spinner := progress.NewSpinner(fmt.Sprintf("Transcribing %s (model: %s)...", file, modelSize))
 			spinner.Start()
 			result, err := whisperClient.Transcribe(file, whisperOptions)
 			if err != nil {
 				spinner.StopWithMessage(fmt.Sprintf("✗ Error transcribing %s: %v", file, err))
 				failed = true
 				continue
 			}
 			spinner.StopWithMessage(fmt.Sprintf("✓ Transcribed %s (%.1fs audio)", file, result.Duration))
 			// Run diarization if enabled
 			if diarize {
 				spinner := progress.NewSpinner("Detecting speakers...")
 				spinner.Start()
 				diarizationResult, err := diarizationClient.Diarize(file, diarizationOptions)
 				if err != nil {
 					// Best effort: keep the transcript, just without speakers.
 					spinner.StopWithMessage(fmt.Sprintf("✗ Diarization failed: %v", err))
 				} else {
 					spinner.StopWithMessage(fmt.Sprintf("✓ Detected %d speaker(s)", diarizationResult.NumSpeakers))
 					diarization.AlignSpeakers(result, diarizationResult)
 				}
 			}
 			// Format output
 			formattedOutput, err := formatter.Format(result)
 			if err != nil {
 				fmt.Printf("Error formatting output: %v\n", err)
 				failed = true
 				continue
 			}
 			// Write to file or stdout
 			if writeToFile {
 				err := os.WriteFile(outputFile, []byte(formattedOutput), 0644)
 				if err != nil {
 					fmt.Printf("✗ Error writing output file: %v\n", err)
 					failed = true
 				} else {
 					fmt.Printf("✓ Saved to %s\n", outputFile)
 				}
 			} else {
 				fmt.Printf("\n%s\n", formattedOutput)
 			}
 		}
 		if failed {
 			os.Exit(1)
 		}
 	},
 }
 // init copies the build version onto rootCmd and registers every flag on
 // rootCmd's persistent flag set, binding each to its package-level variable.
 func init() {
 	rootCmd.Version = Version
 	flags := rootCmd.PersistentFlags()
 	// Flags that carry a value.
 	flags.StringVarP(&outputFile, "output", "o", "", "Output file path (required)")
 	flags.StringVarP(&outputFormat, "format", "f", "srt", "Output format: text, srt, json")
 	flags.StringVarP(&modelSize, "model", "m", "tiny", "Whisper model: tiny, base, small, medium, large, turbo")
 	flags.IntVarP(&numSpeakers, "speakers", "s", 0, "Number of speakers (0 = auto-detect)")
 	// Boolean switches; these have no short form.
 	flags.BoolVar(&diarize, "diarize", false, "Enable speaker diarization")
 	flags.BoolVar(&noWrite, "no-write", false, "Print output to stdout instead of file")
 }
 // Execute runs rootCmd and exits the process with status 1 if it returns an
 // error. The error is not printed here: rootCmd does not set SilenceErrors, so
 // Cobra has already reported it, and printing it again would show every
 // flag-parsing error twice.
 func Execute() {
 	if err := rootCmd.Execute(); err != nil {
 		os.Exit(1)
 	}
 }
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,28 @@
 module transcribe
 go 1.25.4
 require (
 	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
 	github.com/charmbracelet/bubbletea v1.3.10 // indirect
 	github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect
 	github.com/charmbracelet/lipgloss v1.1.0 // indirect
 	github.com/charmbracelet/x/ansi v0.10.1 // indirect
 	github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd // indirect
 	github.com/charmbracelet/x/term v0.2.1 // indirect
 	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/mattn/go-localereader v0.0.1 // indirect
 	github.com/mattn/go-runewidth v0.0.16 // indirect
 	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
 	github.com/muesli/cancelreader v0.2.2 // indirect
 	github.com/muesli/termenv v0.16.0 // indirect
 	github.com/rivo/uniseg v0.4.7 // indirect
 	github.com/spf13/cobra v1.10.2
 	github.com/spf13/pflag v1.0.9 // indirect
 	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	golang.org/x/sys v0.36.0 // indirect
 	golang.org/x/text v0.3.8 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -0,0 +1,51 @@
 github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
 github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
 github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc h1:4pZI35227imm7yK2bGPcfpFEmuY1gc2YSTShr4iJBfs=
 github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc/go.mod h1:X4/0JoqgTIPSFcRA/P6INZzIuyqdFY5rm8tb41s9okk=
 github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
 github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
 github.com/charmbracelet/x/ansi v0.10.1 h1:rL3Koar5XvX0pHGfovN03f5cxLbCF2YvLeyz7D2jVDQ=
 github.com/charmbracelet/x/ansi v0.10.1/go.mod h1:3RQDQ6lDnROptfpWuUVIUG64bD2g2BgntdxH0Ya5TeE=
 github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd h1:vy0GVL4jeHEwG5YOXDmi86oYw2yuYUGqz6a8sLwg0X8=
 github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd/go.mod h1:xe0nKWGd3eJgtqZRaN9RjMtK7xUYchjzPr7q6kcvCCs=
 github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
 github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
 github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
 github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
 github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
 github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
 github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
 github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
 github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
 github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
 github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
 github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU=
 github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4=
 github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
 github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
 golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
 golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
 golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
--- a/install.sh
+++ b/install.sh
@@ -0,0 +1,27 @@
 #!/bin/bash
 # Builds the transcribe binary from the directory containing this script and
 # installs it into ~/.local/bin, stamping the version into the binary.
 set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 INSTALL_DIR="$HOME/.local/bin"
 cd "$SCRIPT_DIR"
 # Read the version from the VERSION file when it exists. Without this guard a
 # missing file makes `cat` fail and `set -e` aborts the whole install; fall
 # back to "dev", the same default the Go code uses.
 if [[ -r "$SCRIPT_DIR/VERSION" ]]; then
    VERSION="$(cat "$SCRIPT_DIR/VERSION")"
 else
    echo "Warning: $SCRIPT_DIR/VERSION not found; building with version 'dev'" >&2
    VERSION="dev"
 fi
 echo "Building transcribe (version: $VERSION)..."
 go build -ldflags "-X transcribe/cmd.Version=$VERSION" -o transcribe .
 echo "Installing to $INSTALL_DIR..."
 mkdir -p "$INSTALL_DIR"
 cp transcribe "$INSTALL_DIR/"
 chmod +x "$INSTALL_DIR/transcribe"
 # Warn when the install directory is not on PATH (compare against INSTALL_DIR
 # so the check follows the directory actually used above).
 if [[ ":$PATH:" != *":$INSTALL_DIR:"* ]]; then
    echo ""
    echo "Warning: ~/.local/bin is not in your PATH"
    echo "Add this to your shell rc file (e.g., ~/.bashrc or ~/.zshrc):"
    echo '  export PATH="$HOME/.local/bin:$PATH"'
 fi
 echo ""
 echo "Installed successfully!"
--- a/internal/diarization/align.go
+++ b/internal/diarization/align.go
@@ -0,0 +1,59 @@
 package diarization
 import (
 	"transcribe/internal/whisper"
 )
 // AlignSpeakers labels each transcription segment with the speaker whose
 // diarization segment overlaps it the longest, writing the label into the
 // segment's Speaker field in place.
 //
 // It does nothing when transcription or diarization is nil, or when the
 // diarization holds no speaker segments. A transcription segment that overlaps
 // no speaker segment gets an empty Speaker.
 func AlignSpeakers(transcription *whisper.TranscriptionResult, diarization *DiarizationResult) {
 	if transcription == nil || diarization == nil || len(diarization.Speakers) == 0 {
 		return
 	}
 	for i := range transcription.Segments {
 		// Index into the slice so the caller's segment is updated, not a copy.
 		seg := &transcription.Segments[i]
 		seg.Speaker = findSpeakerForSegment(seg.Start, seg.End, diarization.Speakers)
 	}
 }
 // findSpeakerForSegment returns the label of the speaker segment that shares
 // the most time with the range [start, end]. On a tie the earliest entry in
 // speakers wins, and the empty string is returned when nothing overlaps.
 func findSpeakerForSegment(start, end float64, speakers []SpeakerSegment) string {
 	winner := ""
 	longest := 0.0
 	for i := range speakers {
 		candidate := &speakers[i]
 		shared := calculateOverlap(start, end, candidate.Start, candidate.End)
 		// Strictly greater, so zero overlap never selects a speaker.
 		if shared > longest {
 			longest, winner = shared, candidate.Speaker
 		}
 	}
 	return winner
 }
 // calculateOverlap returns the length of the intersection of the ranges
 // [start1, end1] and [start2, end2], in the same unit as its arguments. It
 // returns 0 when the ranges are disjoint or only touch at an endpoint.
 //
 // It relies on the min and max builtins (Go 1.21+; go.mod declares a newer
 // version) instead of package-local helpers that shadowed them.
 func calculateOverlap(start1, end1, start2, end2 float64) float64 {
 	overlapStart := max(start1, start2)
 	overlapEnd := min(end1, end2)
 	if overlapEnd > overlapStart {
 		return overlapEnd - overlapStart
 	}
 	return 0
 }
--- a/internal/diarization/client.go
+++ b/internal/diarization/client.go
@@ -0,0 +1,222 @@
 package diarization
 import (
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"os/exec"
 )
 // SpeakerSegment is one contiguous time range attributed to a single speaker.
 // It is decoded from the "speakers" array printed by the diarization script,
 // which has already merged consecutive windows of the same speaker.
 type SpeakerSegment struct {
 	Speaker string  `json:"speaker"` // "Speaker 1", "Speaker 2", etc.
 	Start   float64 `json:"start"`   // start of the range, in seconds from the beginning of the audio
 	End     float64 `json:"end"`     // end of the range, in seconds from the beginning of the audio
 }
 // DiarizationResult contains the speaker diarization output, decoded from the
 // JSON object the diarization script prints on stdout.
 type DiarizationResult struct {
 	Speakers    []SpeakerSegment `json:"speakers"`     // speaker segments in the order the script emitted them; empty when no usable audio window was found
 	NumSpeakers int              `json:"num_speakers"` // number of speaker clusters the script settled on (0 when Speakers is empty)
 }
 // Client handles speaker diarization using resemblyzer. It holds no state:
 // every Diarize call starts a fresh python3 process.
 type Client struct{}
 // NewClient returns a ready-to-use diarization client. The client carries no
 // configuration; options are supplied per call to Diarize.
 func NewClient() *Client {
 	return new(Client)
 }
 // DiarizationOptions contains options for diarization.
 type DiarizationOptions struct {
 	// NumSpeakers is the number of speakers to cluster into. Any value <= 0
 	// is passed to the script as None, which makes it pick the count itself.
 	NumSpeakers int // Number of speakers (0 = auto-detect)
 }
 // DefaultDiarizationOptions returns a freshly allocated option set with
 // speaker-count auto-detection enabled (NumSpeakers == 0).
 func DefaultDiarizationOptions() *DiarizationOptions {
 	defaults := DiarizationOptions{NumSpeakers: 0} // 0 means auto-detect
 	return &defaults
 }
 // Diarize runs the diarization script on audioPath in a python3 subprocess
 // and decodes the JSON it prints on stdout.
 //
 // A nil options value is replaced by DefaultDiarizationOptions. An error is
 // returned when the subprocess cannot be started or exits non-zero (the
 // message includes its stderr), or when its stdout is not valid JSON (the
 // message includes the raw output). The underlying error is wrapped with %w,
 // so callers can inspect it with errors.Is / errors.As.
 func (c *Client) Diarize(audioPath string, options *DiarizationOptions) (*DiarizationResult, error) {
 	if options == nil {
 		options = DefaultDiarizationOptions()
 	}
 	// Build the Python command
 	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
 	// Capture stdout and stderr separately: stdout carries the JSON result,
 	// stderr is only used for the error message.
 	var stdout, stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	if err := cmd.Run(); err != nil {
 		return nil, fmt.Errorf("diarization failed: %w, stderr: %s", err, stderr.String())
 	}
 	// Parse the JSON output
 	var result DiarizationResult
 	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
 		return nil, fmt.Errorf("failed to parse diarization output: %w, output: %s", err, stdout.String())
 	}
 	return &result, nil
 }
 // buildPythonCommand returns the source of the Python script that Diarize
 // passes to "python3 -c". The script loads the audio, embeds overlapping
 // 1.5 s windows with resemblyzer, clusters the embeddings and prints one JSON
 // object with "speakers" and "num_speakers" on stdout.
 //
 // audioPath is embedded with %q so that quotes, backslashes and control
 // characters in the file name are escaped instead of terminating the Python
 // string literal (which would break the script or let a file name inject
 // code). NOTE(review): Go's \x escapes for invalid UTF-8 bytes do not mean the
 // same thing in a Python str literal, so such file names may not resolve.
 //
 // options.NumSpeakers <= 0 is rendered as None, which makes the script choose
 // the speaker count itself.
 func (c *Client) buildPythonCommand(audioPath string, options *DiarizationOptions) string {
 	numSpeakersStr := "None"
 	if options.NumSpeakers > 0 {
 		numSpeakersStr = fmt.Sprintf("%d", options.NumSpeakers)
 	}
 	pythonCode := fmt.Sprintf(`
 import json
 import sys
 import os
 import warnings
 import numpy as np
 # Suppress warnings
 warnings.filterwarnings("ignore")
 # Redirect both stdout and stderr during imports to suppress library noise
 old_stdout = sys.stdout
 old_stderr = sys.stderr
 sys.stdout = open(os.devnull, 'w')
 sys.stderr = open(os.devnull, 'w')
 from resemblyzer import VoiceEncoder, preprocess_wav
 from sklearn.cluster import SpectralClustering, AgglomerativeClustering
 import librosa
 # Initialize voice encoder while stdout is suppressed (it prints loading message)
 encoder = VoiceEncoder()
 # Restore stdout/stderr
 sys.stdout = old_stdout
 sys.stderr = old_stderr
 # Configuration
 AUDIO_PATH = %q
 NUM_SPEAKERS = %s
 SEGMENT_DURATION = 1.5  # seconds per segment for embedding extraction
 HOP_DURATION = 0.75     # hop between segments
 # Load audio
 audio, sr = librosa.load(AUDIO_PATH, sr=16000)
 duration = len(audio) / sr
 # Extract embeddings for overlapping segments
 embeddings = []
 timestamps = []
 current_time = 0.0
 while current_time + SEGMENT_DURATION <= duration:
    start_sample = int(current_time * sr)
    end_sample = int((current_time + SEGMENT_DURATION) * sr)
    segment = audio[start_sample:end_sample]
    # Skip silent segments
    if np.abs(segment).mean() > 0.01:
        try:
            wav = preprocess_wav(segment, source_sr=sr)
            if len(wav) > 0:
                embedding = encoder.embed_utterance(wav)
                embeddings.append(embedding)
                timestamps.append((current_time, current_time + SEGMENT_DURATION))
        except Exception:
            pass
    current_time += HOP_DURATION
 # Handle edge cases
 if len(embeddings) == 0:
    print(json.dumps({"speakers": [], "num_speakers": 0}))
    sys.exit(0)
 embeddings = np.array(embeddings)
 # Determine number of speakers
 if NUM_SPEAKERS is None or NUM_SPEAKERS <= 0:
    # Auto-detect using silhouette score
    from sklearn.metrics import silhouette_score
    best_n = 2
    best_score = -1
    for n in range(2, min(6, len(embeddings))):
        try:
            clustering = AgglomerativeClustering(n_clusters=n)
            labels = clustering.fit_predict(embeddings)
            score = silhouette_score(embeddings, labels)
            if score > best_score:
                best_score = score
                best_n = n
        except Exception:
            pass
    num_speakers = best_n
 else:
    num_speakers = NUM_SPEAKERS
 # Cluster embeddings
 try:
    if len(embeddings) >= num_speakers:
        clustering = AgglomerativeClustering(n_clusters=num_speakers)
        labels = clustering.fit_predict(embeddings)
    else:
        labels = list(range(len(embeddings)))
        num_speakers = len(embeddings)
 except Exception as e:
    labels = [0] * len(embeddings)
    num_speakers = 1
 # Build speaker segments with merging of consecutive same-speaker segments
 speaker_segments = []
 prev_speaker = None
 prev_start = None
 prev_end = None
 for i, (start, end) in enumerate(timestamps):
    speaker = f"Speaker {labels[i] + 1}"
    if speaker == prev_speaker and prev_end is not None:
        # Extend previous segment if same speaker and close in time
        if start - prev_end < 0.5:
            prev_end = end
            continue
    # Save previous segment
    if prev_speaker is not None:
        speaker_segments.append({
            "speaker": prev_speaker,
            "start": prev_start,
            "end": prev_end
        })
    prev_speaker = speaker
    prev_start = start
    prev_end = end
 # Don't forget the last segment
 if prev_speaker is not None:
    speaker_segments.append({
        "speaker": prev_speaker,
        "start": prev_start,
        "end": prev_end
    })
 print(json.dumps({
    "speakers": speaker_segments,
    "num_speakers": num_speakers
 }))
 `, audioPath, numSpeakersStr)
 	return pythonCode
 }
--- a/internal/whisper/client.go
+++ b/internal/whisper/client.go
@@ -0,0 +1,162 @@
 package whisper
 import (
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"os/exec"
 )
 // ModelSize represents the different Whisper model sizes. The value is the
 // model name handed to whisper.load_model in the generated Python script.
 type ModelSize string
 // Model names offered by the CLI's --model flag.
 const (
 	ModelTiny   ModelSize = "tiny"
 	ModelBase   ModelSize = "base"
 	ModelSmall  ModelSize = "small"
 	ModelMedium ModelSize = "medium"
 	ModelLarge  ModelSize = "large"
 	ModelTurbo  ModelSize = "turbo"
 )
 // TranscriptionResult contains the transcription output, decoded from the
 // JSON object the Whisper script prints on stdout.
 type TranscriptionResult struct {
 	Text     string    `json:"text"`     // full transcript text as returned by Whisper
 	Segments []Segment `json:"segments"` // timestamped pieces of the transcript, in the order Whisper returned them
 	Language string    `json:"language"` // language reported by Whisper; empty when the result has none
 	Duration float64   `json:"duration"` // value of the script's "duration" field; the CLI prints it as seconds of audio
 }
 // Segment represents a segment of transcription with timestamps. Speaker is
 // not produced by Whisper; it stays empty unless speaker alignment fills it.
 type Segment struct {
 	Start   float64 `json:"start"`             // start timestamp as reported by Whisper (presumably seconds — TODO confirm)
 	End     float64 `json:"end"`               // end timestamp, same unit as Start
 	Text    string  `json:"text"`              // text spoken in this segment
 	Words   []Word  `json:"words,omitempty"`   // per-word timings when Whisper returned them; omitted from JSON when empty
 	Speaker string  `json:"speaker,omitempty"` // speaker label such as "Speaker 1"; omitted from JSON when empty
 }
 // Word represents a word with timestamp, decoded from an entry of a segment's
 // "words" list in the Whisper script output.
 type Word struct {
 	Start float64 `json:"start"` // start timestamp, same unit as Segment.Start
 	End   float64 `json:"end"`   // end timestamp, same unit as Segment.End
 	Word  string  `json:"word"`  // the word text as returned by Whisper
 }
 // Client is the Whisper client that handles transcription. Each Transcribe
 // call starts a python3 process that loads the model named by ModelSize.
 type Client struct {
 	ModelPath string    // NOTE(review): not set by NewClient and not read by the methods in this file
 	ModelSize ModelSize // model name passed to whisper.load_model
 }
 // NewClient returns a Whisper client that transcribes with the given model
 // size. ModelPath is left at its zero value.
 func NewClient(modelSize ModelSize) *Client {
 	client := new(Client)
 	client.ModelSize = modelSize
 	return client
 }
 // Transcribe runs Whisper on audioPath in a python3 subprocess and decodes
 // the JSON it prints on stdout.
 //
 // A nil options value is replaced by DefaultTranscriptionOptions, matching how
 // the diarization client treats nil options (previously a zero-valued struct
 // was used, which silently differed from the documented defaults). An error is
 // returned when the subprocess cannot be started or exits non-zero (the
 // message includes its stderr), or when its stdout is not valid JSON. The
 // underlying error is wrapped with %w so callers can inspect it with
 // errors.Is / errors.As.
 func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
 	if options == nil {
 		options = DefaultTranscriptionOptions()
 	}
 	// Build the Python command
 	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
 	// Capture stdout and stderr separately: stdout carries the JSON result,
 	// stderr is only used for the error message.
 	var stdout, stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	if err := cmd.Run(); err != nil {
 		return nil, fmt.Errorf("transcription failed: %w, stderr: %s", err, stderr.String())
 	}
 	// Parse the JSON output
 	var result TranscriptionResult
 	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
 		return nil, fmt.Errorf("failed to parse transcription output: %w", err)
 	}
 	return &result, nil
 }
 // buildPythonCommand returns the source of the Python script that Transcribe
 // passes to "python3 -c". The script loads the model named by c.ModelSize,
 // transcribes audioPath and prints one JSON object with "text", "language",
 // "duration" and "segments" on stdout.
 //
 // The model name, audio path and language are embedded with %q so that
 // quotes, backslashes and control characters are escaped instead of
 // terminating the Python string literal (which would break the script or let a
 // file name inject code). NOTE(review): Go's \x escapes for invalid UTF-8
 // bytes do not mean the same thing in a Python str literal.
 //
 // options.Language "" or "auto" is rendered as None. The temperature is
 // written with %g so values such as 0.25 are not rounded to one decimal.
 // "duration" falls back to the end of the last segment (0.0 without segments)
 // when the Whisper result carries no "duration" key.
 func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
 	// Convert Go bool to Python bool string
 	verboseStr := "False"
 	if options.Verbose {
 		verboseStr = "True"
 	}
 	// Handle language option
 	langStr := "None"
 	if options.Language != "" && options.Language != "auto" {
 		langStr = fmt.Sprintf("%q", options.Language)
 	}
 	pythonCode := fmt.Sprintf(`
 import whisper
 import json
 import sys
 import os
 import warnings
 # Suppress warnings and stdout during transcription
 warnings.filterwarnings("ignore")
 old_stdout = sys.stdout
 sys.stdout = open(os.devnull, 'w')
 # Load model
 model = whisper.load_model(%q)
 # Transcribe
 result = model.transcribe(%q,
    language=%s,
    verbose=%s,
    temperature=%g,
    best_of=%d)
 # Restore stdout for JSON output
 sys.stdout = old_stdout
 # Use the end of the last segment when the result has no duration of its own
 segments = result.get("segments", [])
 duration = result.get("duration") or (segments[-1]["end"] if segments else 0.0)
 # Output as JSON
 print(json.dumps({
    "text": result["text"],
    "language": result.get("language", ""),
    "duration": duration,
    "segments": [{
        "start": seg["start"],
        "end": seg["end"],
        "text": seg["text"],
        "words": seg.get("words", [])
    } for seg in segments]
 }))
 `, c.ModelSize, audioPath, langStr, verboseStr, options.Temperature, options.BestOf)
 	return pythonCode
 }
 // TranscriptionOptions contains options for transcription. Each field is
 // rendered into the keyword arguments of model.transcribe in the generated
 // Python script.
 type TranscriptionOptions struct {
 	Language    string  // Language code or "auto"; "" and "auto" are both passed to Whisper as None
 	Verbose     bool    // Show progress bar
 	Temperature float64 // Temperature for sampling (higher = more creative)
 	BestOf      int     // Number of candidates when sampling with temperature > 0
 }
 // DefaultTranscriptionOptions returns a freshly allocated option set:
 // automatic language detection, no progress output, temperature 0 and five
 // sampling candidates.
 func DefaultTranscriptionOptions() *TranscriptionOptions {
 	defaults := new(TranscriptionOptions)
 	defaults.Language = "auto"
 	defaults.Verbose = false
 	defaults.Temperature = 0.0
 	defaults.BestOf = 5
 	return defaults
 }
--- a/main.go
+++ b/main.go
@@ -0,0 +1,9 @@
 package main
 import (
 	"transcribe/cmd"
 )
 // main is the program entry point; it hands control to cmd.Execute, which
 // runs the root command.
 func main() {
 	cmd.Execute()
 }
--- a/pkg/audio/audio.go
+++ b/pkg/audio/audio.go
@@ -0,0 +1,56 @@
 package audio
 import (
 	"errors"
 	"os"
 	"path/filepath"
 	"strings"
 )
 // SupportedAudioFormats lists the audio formats that can be processed, as
 // file extensions including the leading dot.
 type SupportedAudioFormats []string
 // DefaultSupportedFormats is the extension list IsSupported checks against.
 // Entries are lower-case because IsSupported lower-cases its argument before
 // comparing.
 var DefaultSupportedFormats = SupportedAudioFormats{
 	".mp3",
 	".wav",
 	".flac",
 	".m4a",
 	".ogg",
 	".opus",
 }
 // AudioFile describes an audio file on disk that passed the extension check
 // in NewAudioFile.
 type AudioFile struct {
 	Path   string // path exactly as given to NewAudioFile
 	Format string // file extension including the leading dot, in its original case
 	Size   int64  // file size in bytes as reported by os.Stat
 }
 // NewAudioFile stats path and returns its description when it is a file with
 // a supported audio extension.
 //
 // It returns the os.Stat error when path cannot be inspected, and an error
 // when path is a directory (a directory named like "album.mp3" would
 // otherwise pass the extension check) or when its extension is not accepted
 // by IsSupported.
 func NewAudioFile(path string) (*AudioFile, error) {
 	fileInfo, err := os.Stat(path)
 	if err != nil {
 		return nil, err
 	}
 	if fileInfo.IsDir() {
 		return nil, errors.New("not an audio file (path is a directory): " + path)
 	}
 	ext := filepath.Ext(path)
 	if !IsSupported(ext) {
 		return nil, errors.New("unsupported audio format: " + ext)
 	}
 	return &AudioFile{
 		Path:   path,
 		Format: ext,
 		Size:   fileInfo.Size(),
 	}, nil
 }
 // IsSupported reports whether ext, compared case-insensitively, is one of the
 // extensions in DefaultSupportedFormats. ext must include the leading dot.
 func IsSupported(ext string) bool {
 	wanted := strings.ToLower(ext)
 	for i := 0; i < len(DefaultSupportedFormats); i++ {
 		if DefaultSupportedFormats[i] == wanted {
 			return true
 		}
 	}
 	return false
 }
--- a/pkg/output/formatter.go
+++ b/pkg/output/formatter.go
@@ -0,0 +1,33 @@
 package output
 import (
 	"transcribe/internal/whisper"
 )
 // Formatter converts a transcription result into one textual output format.
 // NewFormatter returns the implementation matching a FormatType.
 type Formatter interface {
 	// Format renders result as a string, or returns an error if rendering fails.
 	Format(result *whisper.TranscriptionResult) (string, error)
 }
 // FormatType names an output format that NewFormatter can produce.
 type FormatType string
 // Output format identifiers understood by NewFormatter. Any value other than
 // FormatSRT or FormatJSON is treated like FormatText there.
 const (
 	FormatText FormatType = "text" // handled by TextFormatter
 	FormatSRT  FormatType = "srt"  // handled by SRTFormatter
 	FormatJSON FormatType = "json" // handled by JSONFormatter
 )
 // NewFormatter returns the Formatter for the requested format type.
 // FormatSRT and FormatJSON select their dedicated formatters; FormatText and
 // every unrecognized value yield a TextFormatter.
 func NewFormatter(format FormatType) Formatter {
 	switch format {
 	case FormatSRT:
 		return &SRTFormatter{}
 	case FormatJSON:
 		return &JSONFormatter{}
 	}
 	// Plain text is both the explicit FormatText choice and the fallback.
 	return &TextFormatter{}
 }
--- a/pkg/output/json.go
+++ b/pkg/output/json.go
@@ -0,0 +1,19 @@
 package output
 import (
 	"encoding/json"
 	"transcribe/internal/whisper"
 )
 // JSONFormatter renders transcription results as indented JSON.
 type JSONFormatter struct{}
 // Format marshals result with json.MarshalIndent, using no line prefix and a
 // two-space indent, and returns the encoded text. A marshalling error is
 // returned unchanged together with an empty string.
 func (f *JSONFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
 	const (
 		linePrefix = ""
 		indentUnit = "  "
 	)
 	encoded, err := json.MarshalIndent(result, linePrefix, indentUnit)
 	if err != nil {
 		return "", err
 	}
 	return string(encoded), nil
 }
--- a/pkg/output/srt.go
+++ b/pkg/output/srt.go
@@ -0,0 +1,49 @@
 package output
 import (
 	"fmt"
 	"strings"
 	"transcribe/internal/whisper"
 )
 // SRTFormatter renders transcription results as SRT subtitles.
 type SRTFormatter struct{}
 // Format emits one SRT cue per segment of result. Each cue consists of a
 // 1-based sequence number, a "start --> end" timing line built with
 // formatSRTTimestamp, and the trimmed segment text, which is prefixed with
 // "[speaker] " when the segment has a speaker. Cues are separated by a blank
 // line. The returned error is always nil.
 func (f *SRTFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
 	var out strings.Builder
 	for idx, segment := range result.Segments {
 		line := strings.TrimSpace(segment.Text)
 		if segment.Speaker != "" {
 			line = fmt.Sprintf("[%s] %s", segment.Speaker, line)
 		}
 		// Cue layout: number, timing line, text, then a blank separator line.
 		fmt.Fprintf(&out, "%d\n%s --> %s\n%s\n\n",
 			idx+1,
 			formatSRTTimestamp(segment.Start),
 			formatSRTTimestamp(segment.End),
 			line,
 		)
 	}
 	// Drop one of the two trailing newlines so the output ends with a single one.
 	return strings.TrimSuffix(out.String(), "\n"), nil
 }
 // formatSRTTimestamp converts an offset in seconds to the SRT timestamp
 // format HH:MM:SS,mmm.
 //
 // The offset is rounded to the nearest millisecond rather than truncated, so
 // values whose float64 product lands just below a whole millisecond (for
 // example 1.001 * 1000) are not reported one millisecond short. Negative and
 // NaN offsets are treated as zero because SRT has no way to express them.
 func formatSRTTimestamp(seconds float64) string {
 	// Written as a negated comparison so that NaN is clamped as well.
 	if !(seconds > 0) {
 		seconds = 0
 	}
 	totalMs := int64(seconds*1000 + 0.5)
 	ms := totalMs % 1000
 	totalSeconds := totalMs / 1000
 	s := totalSeconds % 60
 	totalMinutes := totalSeconds / 60
 	m := totalMinutes % 60
 	h := totalMinutes / 60
 	return fmt.Sprintf("%02d:%02d:%02d,%03d", h, m, s, ms)
 }
--- a/pkg/output/text.go
+++ b/pkg/output/text.go
@@ -0,0 +1,41 @@
 package output
 import (
 	"fmt"
 	"strings"
 	"transcribe/internal/whisper"
 )
 // TextFormatter renders transcription results as plain text, one line per
 // segment, each prefixed with its time range.
 type TextFormatter struct{}
 // Format writes one line per segment of result in the form
 // "[start - end] [speaker] text", omitting the speaker part when the segment
 // has no speaker. Timestamps come from formatTextTimestamp and the segment
 // text is trimmed of surrounding whitespace. The final line has no trailing
 // newline, and the returned error is always nil.
 func (f *TextFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
 	var out strings.Builder
 	for _, segment := range result.Segments {
 		fmt.Fprintf(&out, "[%s - %s]",
 			formatTextTimestamp(segment.Start),
 			formatTextTimestamp(segment.End),
 		)
 		if segment.Speaker != "" {
 			fmt.Fprintf(&out, " [%s]", segment.Speaker)
 		}
 		out.WriteString(" " + strings.TrimSpace(segment.Text) + "\n")
 	}
 	return strings.TrimSuffix(out.String(), "\n"), nil
 }
 // formatTextTimestamp converts an offset in seconds to MM:SS.s, where the
 // minutes field is not wrapped into hours and .s is the tenths digit.
 //
 // The offset is first rounded to whole milliseconds and the tenths digit is
 // taken from that integer. Deriving it from a float subtraction instead loses
 // a tenth for inputs such as 2.3, where 2.3 - 2 is slightly below 0.3.
 // Negative and NaN offsets are treated as zero.
 func formatTextTimestamp(seconds float64) string {
 	// Written as a negated comparison so that NaN is clamped as well.
 	if !(seconds > 0) {
 		seconds = 0
 	}
 	totalMs := int64(seconds*1000 + 0.5)
 	totalSeconds := totalMs / 1000
 	m := totalSeconds / 60
 	s := totalSeconds % 60
 	tenths := (totalMs % 1000) / 100
 	return fmt.Sprintf("%02d:%02d.%d", m, s, tenths)
 }
--- a/pkg/progress/spinner.go
+++ b/pkg/progress/spinner.go
@@ -0,0 +1,84 @@
 package progress
 import (
 	"fmt"
 	"sync"
 	"time"
 )
 // Spinner displays an animated spinner followed by a message on the current
 // terminal line. A Spinner may be started and stopped repeatedly; its methods
 // are safe to call from multiple goroutines.
 type Spinner struct {
 	message  string        // text shown after the frame; guarded by mu
 	frames   []string      // animation frames, cycled in order
 	interval time.Duration // delay between frames
 	stop     chan struct{} // closed by Stop to end the current run; guarded by mu
 	done     chan struct{} // closed by the animation goroutine on exit; guarded by mu
 	mu       sync.Mutex
 	running  bool // whether an animation goroutine is active; guarded by mu
 }
 // NewSpinner creates a stopped spinner that will show message next to a
 // ten-frame braille animation advancing every 80ms.
 func NewSpinner(message string) *Spinner {
 	return &Spinner{
 		message:  message,
 		frames:   []string{"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"},
 		interval: 80 * time.Millisecond,
 		stop:     make(chan struct{}),
 		done:     make(chan struct{}),
 	}
 }
 // Start begins the spinner animation in a background goroutine.
 // Calling Start while the spinner is already running is a no-op.
 func (s *Spinner) Start() {
 	s.mu.Lock()
 	if s.running {
 		s.mu.Unlock()
 		return
 	}
 	s.running = true
 	// Each run gets fresh channels: the previous pair was closed by the last
 	// Stop, and closing a channel twice panics.
 	stop := make(chan struct{})
 	done := make(chan struct{})
 	s.stop, s.done = stop, done
 	s.mu.Unlock()
 	go s.animate(stop, done)
 }
 // animate draws frames until stop is closed, then clears the line and closes
 // done. It receives its channels as arguments so that a later Start, which
 // replaces the struct fields, cannot affect a run that is still shutting down.
 func (s *Spinner) animate(stop <-chan struct{}, done chan<- struct{}) {
 	defer close(done)
 	ticker := time.NewTicker(s.interval)
 	defer ticker.Stop()
 	for i := 0; ; i++ {
 		// Read the message under the lock; UpdateMessage may change it at any time.
 		s.mu.Lock()
 		message := s.message
 		s.mu.Unlock()
 		// Erase the line first so a shorter message leaves no stale characters.
 		fmt.Printf("\r\033[K%s %s", s.frames[i%len(s.frames)], message)
 		select {
 		case <-stop:
 			// Clear the line before signalling completion.
 			fmt.Print("\r\033[K")
 			return
 		case <-ticker.C:
 		}
 	}
 }
 // Stop stops the spinner, waits for the animation goroutine to exit, and
 // leaves the terminal line cleared. Calling Stop on a spinner that is not
 // running is a no-op.
 func (s *Spinner) Stop() {
 	s.mu.Lock()
 	if !s.running {
 		s.mu.Unlock()
 		return
 	}
 	s.running = false
 	// Snapshot this run's channels while holding the lock.
 	stop, done := s.stop, s.done
 	s.mu.Unlock()
 	close(stop)
 	<-done
 }
 // StopWithMessage stops the spinner and then prints message on its own line.
 func (s *Spinner) StopWithMessage(message string) {
 	s.Stop()
 	fmt.Println(message)
 }
 // UpdateMessage replaces the text shown next to the spinner. A running
 // spinner picks up the new text on its next frame.
 func (s *Spinner) UpdateMessage(message string) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.message = message
 }