feat: git init

2026-01-17 19:18:58 -06:00
commit b73d5b8078
18 changed files with 1274 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+transcribe
+test/
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1,93 @@
+# Transcribe Tool
+
+Audio transcription CLI using OpenAI Whisper with speaker diarization.
+
+## Quick Reference
+
+```bash
+# Basic transcription (SRT output)
+./transcribe audio.mp3 -o output.srt
+
+# With speaker diarization
+./transcribe audio.mp3 --diarize -o output.srt
+
+# Specify model and speakers
+./transcribe audio.mp3 --model small --diarize -s 2 -o output.srt
+
+# Print to stdout
+./transcribe audio.mp3 --no-write
+```
+
+## Flags
+
+| Flag | Short | Description | Default |
+|------|-------|-------------|---------|
+| `--output` | `-o` | Output file path | **required** |
+| `--format` | `-f` | `srt`, `text`, `json` | `srt` |
+| `--model` | `-m` | `tiny`, `base`, `small`, `medium`, `large`, `turbo` | `tiny` |
+| `--diarize` | | Enable speaker detection | off |
+| `--speakers` | `-s` | Number of speakers (0=auto) | `0` |
+| `--no-write` | | Print to stdout instead of file | off |
+
+## Common Tasks
+
+**Transcribe a meeting recording:**
+```bash
+./transcribe meeting.wav --model small -o meeting.srt
+```
+
+**Transcribe interview with 2 speakers:**
+```bash
+./transcribe interview.mp3 --model small --diarize -s 2 -o interview.srt
+```
+
+**Get JSON output for processing:**
+```bash
+./transcribe audio.mp3 --format json -o output.json
+```
+
+**Quick preview (stdout):**
+```bash
+./transcribe audio.mp3 --no-write
+```
+
+## Output Formats
+
+**SRT (default):** Subtitle format with timestamps
+```
+1
+00:00:00,000 --> 00:00:05,200
+[Speaker 1] Hello, how are you?
+```
+
+**Text:** Plain text with timestamps
+```
+[00:00.0 - 00:05.2] [Speaker 1] Hello, how are you?
+```
+
+**JSON:** Full metadata including segments, words, duration
+
+## Models
+
+- `tiny` - Fastest, use for quick drafts (default)
+- `base` - Fast, basic accuracy
+- `small` - Good balance of speed/accuracy
+- `medium` - Better accuracy, slower
+- `large` - Best accuracy, slowest
+- `turbo` - Optimized for speed
+
+## Supported Formats
+
+MP3, WAV, FLAC, M4A, OGG, OPUS
+
+## Build
+
+```bash
+cd /home/yeho/Documents/tools/transcribe
+go build -o transcribe
+```
+
+## Dependencies
+
+```bash
+pip install openai-whisper                      # Required
+pip install resemblyzer scikit-learn librosa    # For diarization
+```
--- a/README.md
+++ b/README.md
@@ -0,0 +1,166 @@
+# Transcribe - Audio Transcription Tool
+
+A CLI tool for transcribing audio files using OpenAI's Whisper model with speaker diarization and multiple output formats.
+
+## Features
+
+- Multiple Whisper model sizes (tiny, base, small, medium, large, turbo)
+- Speaker diarization using voice embeddings (resemblyzer + clustering)
+- Multiple output formats: SRT subtitles, plain text, JSON
+- Batch processing of multiple audio files
+- Automatic language detection
+- Progress indicators with spinners
+
+## Installation
+
+### Prerequisites
+- Go 1.25+ (`go.mod` declares `go 1.25.4`)
+- Python 3.8+
+- FFmpeg
+
+### Python Dependencies
+```bash
+# Required for transcription
+pip install openai-whisper
+
+# Required for speaker diarization
+pip install resemblyzer scikit-learn librosa
+```
+
+Note: If `resemblyzer` fails to install due to `webrtcvad`, install Python development headers first:
+```bash
+# Fedora/RHEL
+sudo dnf install python3-devel
+
+# Ubuntu/Debian
+sudo apt install python3-dev
+```
+
+### Build from Source
+```bash
+go build -o transcribe
+```
+
+## Usage
+
+Output file (`-o`) is required unless `--no-write` is specified.
+
+### Basic Transcription
+```bash
+./transcribe audio.mp3 -o output.srt
+```
+
+### Choose Whisper Model
+```bash
+./transcribe audio.mp3 --model small -o output.srt
+```
+
+Available models: `tiny` (default), `base`, `small`, `medium`, `large`, `turbo`
+
+### Output Formats
+
+**SRT subtitles (default):**
+```bash
+./transcribe audio.mp3 -o subtitles.srt
+```
+
+**Plain text with timestamps:**
+```bash
+./transcribe audio.mp3 --format text -o output.txt
+```
+
+**JSON:**
+```bash
+./transcribe audio.mp3 --format json -o output.json
+```
+
+### Speaker Diarization
+
+Enable automatic speaker detection:
+```bash
+./transcribe audio.mp3 --diarize -o output.srt
+```
+
+Specify number of speakers for better accuracy:
+```bash
+./transcribe audio.mp3 --diarize --speakers 2 -o output.srt
+```
+
+### Print to stdout
+```bash
+./transcribe audio.mp3 --no-write
+```
+
+### Full Example
+
+Transcribe with speaker diarization:
+```bash
+./transcribe interview.wav --model small --diarize -s 2 -o interview.srt
+```
+
+Output:
+```
+1
+00:00:00,000 --> 00:00:05,200
+[Speaker 1] Hello, how are you?
+
+2
+00:00:05,200 --> 00:00:12,300
+[Speaker 2] I'm doing well, thanks!
+```
+
+## CLI Reference
+
+```
+Usage:
+  transcribe <audio files...> [flags]
+
+Flags:
+      --diarize           Enable speaker diarization
+  -f, --format string     Output format: srt, text, json (default "srt")
+  -h, --help              help for transcribe
+  -m, --model string      Whisper model: tiny, base, small, medium, large, turbo (default "tiny")
+      --no-write          Print output to stdout instead of file
+  -o, --output string     Output file path (required)
+  -s, --speakers int      Number of speakers (0 = auto-detect)
+```
+
+## Supported Audio Formats
+
+MP3, WAV, FLAC, M4A, OGG, OPUS
+
+## Architecture
+
+```
+transcribe/
+├── cmd/
+│   └── root.go              # CLI commands and flags
+├── internal/
+│   ├── whisper/
+│   │   └── client.go        # Whisper Python bridge
+│   └── diarization/
+│       ├── client.go        # Diarization Python bridge
+│       └── align.go         # Speaker-segment alignment
+├── pkg/
+│   ├── audio/
+│   │   └── audio.go         # Audio file validation
+│   ├── output/
+│   │   ├── formatter.go     # Output formatter interface
+│   │   ├── srt.go           # SRT format
+│   │   ├── text.go          # Text format
+│   │   └── json.go          # JSON format
+│   └── progress/
+│       └── spinner.go       # Progress spinner
+└── README.md
+```
+
+## How It Works
+
+1. **Transcription**: Audio is processed by Whisper (via Python subprocess) to generate timestamped text segments
+2. **Diarization** (optional): Voice embeddings are extracted using resemblyzer and clustered to identify speakers
+3. **Alignment**: Speaker segments are mapped to transcription segments by timestamp overlap
+4. **Formatting**: Results are formatted according to the selected output format (SRT by default)
+
+## License
+
+MIT License - see LICENSE file for details.
--- a/1
+++ b/1
@@ -0,0 +1 @@
+0.1.0
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -0,0 +1,172 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+
+	"transcribe/internal/diarization"
+	"transcribe/internal/whisper"
+	"transcribe/pkg/audio"
+	"transcribe/pkg/output"
+	"transcribe/pkg/progress"
+
+	"github.com/spf13/cobra"
+)
+
+// Version is the build version exposed through the root command. It stays
+// "dev" unless overridden at link time, as install.sh does with
+// -ldflags "-X transcribe/cmd.Version=...".
+var Version = "dev"
+
+// Flag-backed settings; init binds each one to a flag on rootCmd.
+var (
+	outputFile   string // --output / -o
+	outputFormat string // --format / -f
+	diarize      bool   // --diarize
+	numSpeakers  int    // --speakers / -s
+	modelSize    string // --model / -m
+	noWrite      bool   // --no-write
+)
+
+// rootCmd is the only command of the CLI (there are no subcommands). Its Run
+// function validates the arguments, then for every input file transcribes
+// it, optionally diarizes it, formats the result and writes it to the -o
+// file or to stdout.
+//
+// Failures on one input do not stop the remaining inputs, but the process
+// exits with status 1 if any input could not be transcribed, formatted or
+// written. A failed diarization is best-effort: the transcript is still
+// emitted, just without speaker labels.
+var rootCmd = &cobra.Command{
+	Use:   "transcribe",
+	Short: "A CLI tool for transcribing audio files with speaker diarization",
+	Long: `Transcribe is a command-line tool that uses OpenAI's Whisper model to
+transcribe audio files. It supports multiple output formats (text, SRT, JSON)
+and speaker diarization using voice embeddings.
+
+Output file (-o) is required unless --no-write is specified.
+
+Output Formats:
+  srt     SRT subtitle format (default)
+  text    Plain text with timestamps
+  json    JSON with full metadata
+
+Whisper Models (--model, -m):
+  tiny    Fastest, least accurate (default)
+  base    Fast, basic accuracy
+  small   Balanced speed/accuracy
+  medium  Good accuracy, slower
+  large   Best accuracy, slowest
+  turbo   Optimized for speed
+
+Examples:
+  # Basic transcription to SRT
+  transcribe audio.mp3 -o output.srt
+
+  # Use a larger model
+  transcribe audio.mp3 --model small -o output.srt
+
+  # Output as plain text
+  transcribe audio.mp3 --format text -o output.txt
+
+  # Enable speaker diarization
+  transcribe audio.mp3 --diarize -o output.srt
+
+  # Print to stdout instead of file
+  transcribe audio.mp3 --no-write
+
+  # Full example: diarization + specific model
+  transcribe audio.mp3 --model small --diarize -s 2 -o output.srt`,
+	Run: func(cmd *cobra.Command, args []string) {
+		if len(args) == 0 {
+			fmt.Println("Please provide audio files to transcribe")
+			_ = cmd.Help()
+			os.Exit(1)
+		}
+
+		// Require output file unless --no-write is set
+		if outputFile == "" && !noWrite {
+			fmt.Println("✗ Error: Output file required. Use -o <file> to specify output, or --no-write to print to stdout.")
+			os.Exit(1)
+		}
+
+		// --no-write means "stdout instead of file", so it wins over -o.
+		writeToFile := outputFile != "" && !noWrite
+
+		// A single -o path can hold only one transcript. Refuse up front
+		// rather than spending minutes transcribing files whose output
+		// would be overwritten by the next one.
+		if writeToFile && len(args) > 1 {
+			fmt.Printf("✗ Error: -o names a single output file but %d input files were given; each transcript would overwrite the previous one. Transcribe one file at a time or use --no-write.\n", len(args))
+			os.Exit(1)
+		}
+
+		// Validate all provided files before starting any slow work
+		for _, file := range args {
+			if _, err := os.Stat(file); os.IsNotExist(err) {
+				fmt.Printf("✗ Error: File '%s' does not exist\n", file)
+				os.Exit(1)
+			}
+
+			_, err := audio.NewAudioFile(file)
+			if err != nil {
+				fmt.Printf("✗ Error: File '%s' has unsupported format or error: %v\n", file, err)
+				os.Exit(1)
+			}
+		}
+
+		// Create whisper client and transcribe
+		whisperClient := whisper.NewClient(whisper.ModelSize(modelSize))
+		whisperOptions := whisper.DefaultTranscriptionOptions()
+
+		// Create diarization client if needed
+		var diarizationClient *diarization.Client
+		var diarizationOptions *diarization.DiarizationOptions
+		if diarize {
+			diarizationClient = diarization.NewClient()
+			diarizationOptions = &diarization.DiarizationOptions{
+				NumSpeakers: numSpeakers,
+			}
+		}
+
+		// Create output formatter
+		formatter := output.NewFormatter(output.FormatType(outputFormat))
+
+		// failed records whether any input could not be fully processed so
+		// the exit status reflects it once every input has been attempted.
+		failed := false
+
+		for _, file := range args {
+			// Transcription with spinner
+			spinner := progress.NewSpinner(fmt.Sprintf("Transcribing %s (model: %s)...", file, modelSize))
+			spinner.Start()
+			result, err := whisperClient.Transcribe(file, whisperOptions)
+			if err != nil {
+				spinner.StopWithMessage(fmt.Sprintf("✗ Error transcribing %s: %v", file, err))
+				failed = true
+				continue
+			}
+			spinner.StopWithMessage(fmt.Sprintf("✓ Transcribed %s (%.1fs audio)", file, result.Duration))
+
+			// Run diarization if enabled; on failure fall back to the
+			// plain transcript instead of dropping the file.
+			if diarize {
+				diarizeSpinner := progress.NewSpinner("Detecting speakers...")
+				diarizeSpinner.Start()
+				diarizationResult, err := diarizationClient.Diarize(file, diarizationOptions)
+				if err != nil {
+					diarizeSpinner.StopWithMessage(fmt.Sprintf("✗ Diarization failed: %v", err))
+				} else {
+					diarizeSpinner.StopWithMessage(fmt.Sprintf("✓ Detected %d speaker(s)", diarizationResult.NumSpeakers))
+					diarization.AlignSpeakers(result, diarizationResult)
+				}
+			}
+
+			// Format output
+			formattedOutput, err := formatter.Format(result)
+			if err != nil {
+				fmt.Printf("Error formatting output: %v\n", err)
+				failed = true
+				continue
+			}
+
+			// Write to file or stdout
+			if !writeToFile {
+				fmt.Printf("\n%s\n", formattedOutput)
+				continue
+			}
+			if err := os.WriteFile(outputFile, []byte(formattedOutput), 0644); err != nil {
+				fmt.Printf("✗ Error writing output file: %v\n", err)
+				failed = true
+				continue
+			}
+			fmt.Printf("✓ Saved to %s\n", outputFile)
+		}
+
+		if failed {
+			os.Exit(1)
+		}
+	},
+}
+
+// init copies the link-time Version onto the root command and registers the
+// command-line flags, binding each to its package-level variable.
+func init() {
+	rootCmd.Version = Version
+
+	flags := rootCmd.PersistentFlags()
+	// -o is only mandatory when --no-write is absent, so the help text says so.
+	flags.StringVarP(&outputFile, "output", "o", "", "Output file path (required unless --no-write is set)")
+	flags.StringVarP(&outputFormat, "format", "f", "srt", "Output format: srt, text, json")
+	flags.BoolVar(&diarize, "diarize", false, "Enable speaker diarization")
+	flags.IntVarP(&numSpeakers, "speakers", "s", 0, "Number of speakers (0 = auto-detect)")
+	flags.StringVarP(&modelSize, "model", "m", "tiny", "Whisper model: tiny, base, small, medium, large, turbo")
+	flags.BoolVar(&noWrite, "no-write", false, "Print output to stdout instead of file")
+}
+
+// Execute runs the root command against the process arguments and exits
+// with status 1 if cobra reports an error (for example an unknown flag).
+func Execute() {
+	// Cobra prints the error itself because rootCmd does not set
+	// SilenceErrors; printing it here as well would show it twice.
+	if err := rootCmd.Execute(); err != nil {
+		os.Exit(1)
+	}
+}
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,28 @@
+module transcribe
+
+go 1.25.4
+
+require (
+	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
+	github.com/charmbracelet/bubbletea v1.3.10 // indirect
+	github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect
+	github.com/charmbracelet/lipgloss v1.1.0 // indirect
+	github.com/charmbracelet/x/ansi v0.10.1 // indirect
+	github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd // indirect
+	github.com/charmbracelet/x/term v0.2.1 // indirect
+	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
+	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-localereader v0.0.1 // indirect
+	github.com/mattn/go-runewidth v0.0.16 // indirect
+	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
+	github.com/muesli/cancelreader v0.2.2 // indirect
+	github.com/muesli/termenv v0.16.0 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/spf13/cobra v1.10.2
+	github.com/spf13/pflag v1.0.9 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	golang.org/x/sys v0.36.0 // indirect
+	golang.org/x/text v0.3.8 // indirect
+)
--- a/go.sum
+++ b/go.sum
@@ -0,0 +1,51 @@
+github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
+github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
+github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
+github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
+github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc h1:4pZI35227imm7yK2bGPcfpFEmuY1gc2YSTShr4iJBfs=
+github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc/go.mod h1:X4/0JoqgTIPSFcRA/P6INZzIuyqdFY5rm8tb41s9okk=
+github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
+github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
+github.com/charmbracelet/x/ansi v0.10.1 h1:rL3Koar5XvX0pHGfovN03f5cxLbCF2YvLeyz7D2jVDQ=
+github.com/charmbracelet/x/ansi v0.10.1/go.mod h1:3RQDQ6lDnROptfpWuUVIUG64bD2g2BgntdxH0Ya5TeE=
+github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd h1:vy0GVL4jeHEwG5YOXDmi86oYw2yuYUGqz6a8sLwg0X8=
+github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd/go.mod h1:xe0nKWGd3eJgtqZRaN9RjMtK7xUYchjzPr7q6kcvCCs=
+github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
+github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
+github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
+github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
+github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
+github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
+github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
+github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
+github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
+github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
+github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
+github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
+github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU=
+github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4=
+github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
+github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
+go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
+golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
+golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
--- a/install.sh
+++ b/install.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Builds the transcribe binary from this checkout and installs it into
+# ~/.local/bin, stamping the version read from the VERSION file.
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_DIR="$HOME/.local/bin"
+
+cd "$SCRIPT_DIR"
+
+# Fall back to the binary's built-in default ("dev") when no VERSION file is
+# present, instead of aborting under `set -e` before anything is built.
+if [[ -r "$SCRIPT_DIR/VERSION" ]]; then
+    VERSION="$(cat "$SCRIPT_DIR/VERSION")"
+else
+    VERSION="dev"
+fi
+
+echo "Building transcribe (version: $VERSION)..."
+go build -ldflags "-X transcribe/cmd.Version=$VERSION" -o transcribe .
+
+echo "Installing to $INSTALL_DIR..."
+mkdir -p "$INSTALL_DIR"
+cp transcribe "$INSTALL_DIR/"
+chmod +x "$INSTALL_DIR/transcribe"
+
+# Check the directory we actually installed into, not a second hard-coded copy.
+if [[ ":$PATH:" != *":$INSTALL_DIR:"* ]]; then
+    echo ""
+    echo "Warning: ~/.local/bin is not in your PATH"
+    echo "Add this to your shell rc file (e.g., ~/.bashrc or ~/.zshrc):"
+    echo '  export PATH="$HOME/.local/bin:$PATH"'
+fi
+
+echo ""
+echo "Installed successfully!"
--- a/internal/diarization/align.go
+++ b/internal/diarization/align.go
@@ -0,0 +1,59 @@
+package diarization
+
+import (
+	"transcribe/internal/whisper"
+)
+
+// AlignSpeakers labels every transcription segment, in place, with the
+// speaker whose diarization turns overlap it the most. It does nothing when
+// diarization is nil or holds no speaker turns. A segment that overlaps no
+// turn ends up with an empty Speaker.
+func AlignSpeakers(transcription *whisper.TranscriptionResult, diarization *DiarizationResult) {
+	if diarization == nil {
+		return
+	}
+	turns := diarization.Speakers
+	if len(turns) == 0 {
+		return
+	}
+
+	segments := transcription.Segments
+	for idx := range segments {
+		segments[idx].Speaker = findSpeakerForSegment(segments[idx].Start, segments[idx].End, turns)
+	}
+}
+
+// findSpeakerForSegment returns the Speaker of the single turn in speakers
+// that overlaps [start, end] the longest. Ties keep the earliest turn, and
+// the result is "" when no turn overlaps at all.
+func findSpeakerForSegment(start, end float64, speakers []SpeakerSegment) string {
+	winner, longest := "", 0.0
+
+	for i := range speakers {
+		shared := calculateOverlap(start, end, speakers[i].Start, speakers[i].End)
+		if shared > longest {
+			winner, longest = speakers[i].Speaker, shared
+		}
+	}
+
+	return winner
+}
+
+// calculateOverlap returns how long the ranges [start1, end1] and
+// [start2, end2] overlap, or 0 when they are disjoint or merely touch.
+//
+// It relies on the built-in min and max (Go 1.21+), so the package no longer
+// declares its own float64 helpers that shadowed them.
+func calculateOverlap(start1, end1, start2, end2 float64) float64 {
+	overlap := min(end1, end2) - max(start1, start2)
+	if overlap > 0 {
+		return overlap
+	}
+	return 0
+}
--- a/internal/diarization/client.go
+++ b/internal/diarization/client.go
@@ -0,0 +1,222 @@
+package diarization
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+)
+
+// SpeakerSegment is one contiguous stretch of audio attributed to a single
+// speaker, as emitted in the "speakers" array of the diarization script's
+// JSON output. Start and End come from the script's window timestamps, which
+// it tracks in seconds from the beginning of the audio.
+type SpeakerSegment struct {
+	Speaker string  `json:"speaker"` // "Speaker 1", "Speaker 2", etc.
+	Start   float64 `json:"start"`
+	End     float64 `json:"end"`
+}
+
+// DiarizationResult is the decoded JSON output of the diarization script:
+// the merged speaker turns plus the number of speakers the script settled
+// on (either the requested count or its auto-detected one).
+type DiarizationResult struct {
+	Speakers    []SpeakerSegment `json:"speakers"`
+	NumSpeakers int              `json:"num_speakers"`
+}
+
+// Client performs speaker diarization by running a resemblyzer-based Python
+// script. It carries no state of its own.
+type Client struct{}
+
+// NewClient returns a ready-to-use diarization client.
+func NewClient() *Client {
+	return new(Client)
+}
+
+// DiarizationOptions tunes a single Diarize call.
+type DiarizationOptions struct {
+	NumSpeakers int // Number of speakers (0 = auto-detect)
+}
+
+// DefaultDiarizationOptions returns a fresh options value that asks for the
+// speaker count to be auto-detected.
+func DefaultDiarizationOptions() *DiarizationOptions {
+	// The zero value of NumSpeakers already means "auto-detect".
+	var defaults DiarizationOptions
+	return &defaults
+}
+
+// Diarize runs the embedded diarization script on audioPath with python3 and
+// decodes the JSON document the script prints on stdout.
+//
+// A nil options is replaced by DefaultDiarizationOptions. On failure the
+// result is nil and the error wraps the underlying exec or JSON error (so
+// errors.Is / errors.As still work) together with the captured stderr or
+// stdout to aid debugging.
+func (c *Client) Diarize(audioPath string, options *DiarizationOptions) (*DiarizationResult, error) {
+	if options == nil {
+		options = DefaultDiarizationOptions()
+	}
+
+	// Build the Python command
+	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
+
+	// Capture stdout (the JSON result) and stderr (diagnostics) separately
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return nil, fmt.Errorf("diarization failed: %w, stderr: %s", err, stderr.String())
+	}
+
+	// Parse the JSON output
+	var result DiarizationResult
+	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
+		return nil, fmt.Errorf("failed to parse diarization output: %w, output: %s", err, stdout.String())
+	}
+
+	return &result, nil
+}
+
+// buildPythonCommand returns the Python source passed to "python3 -c" for
+// diarizing audioPath.
+//
+// The script loads the audio at 16 kHz, embeds overlapping 1.5 s windows with
+// resemblyzer (skipping near-silent ones), clusters the embeddings with
+// agglomerative clustering (choosing 2-5 clusters by silhouette score when
+// options.NumSpeakers is not positive), merges adjacent windows of the same
+// speaker, and prints {"speakers": [...], "num_speakers": N} as JSON.
+//
+// audioPath is embedded as a JSON-encoded string literal rather than pasted
+// between quotes, so quotes, backslashes or newlines in a file name can
+// neither break the script nor inject code into it. Bytes that are not valid
+// UTF-8 are replaced by the encoder.
+func (c *Client) buildPythonCommand(audioPath string, options *DiarizationOptions) string {
+	numSpeakersStr := "None"
+	if options.NumSpeakers > 0 {
+		numSpeakersStr = fmt.Sprintf("%d", options.NumSpeakers)
+	}
+
+	// Marshalling a plain string cannot fail, so the error is safely ignored.
+	// The escapes produced for a string are also valid in a Python literal.
+	audioPathLiteral, _ := json.Marshal(audioPath)
+
+	pythonCode := fmt.Sprintf(`
+import json
+import sys
+import os
+import warnings
+import numpy as np
+
+# Suppress warnings
+warnings.filterwarnings("ignore")
+
+# Redirect both stdout and stderr during imports to suppress library noise
+old_stdout = sys.stdout
+old_stderr = sys.stderr
+sys.stdout = open(os.devnull, 'w')
+sys.stderr = open(os.devnull, 'w')
+
+from resemblyzer import VoiceEncoder, preprocess_wav
+from sklearn.cluster import SpectralClustering, AgglomerativeClustering
+import librosa
+
+# Initialize voice encoder while stdout is suppressed (it prints loading message)
+encoder = VoiceEncoder()
+
+# Restore stdout/stderr
+sys.stdout = old_stdout
+sys.stderr = old_stderr
+
+# Configuration
+AUDIO_PATH = %s
+NUM_SPEAKERS = %s
+SEGMENT_DURATION = 1.5  # seconds per segment for embedding extraction
+HOP_DURATION = 0.75     # hop between segments
+
+# Load audio
+audio, sr = librosa.load(AUDIO_PATH, sr=16000)
+duration = len(audio) / sr
+
+# Extract embeddings for overlapping segments
+embeddings = []
+timestamps = []
+current_time = 0.0
+
+while current_time + SEGMENT_DURATION <= duration:
+    start_sample = int(current_time * sr)
+    end_sample = int((current_time + SEGMENT_DURATION) * sr)
+    segment = audio[start_sample:end_sample]
+
+    # Skip silent segments
+    if np.abs(segment).mean() > 0.01:
+        try:
+            wav = preprocess_wav(segment, source_sr=sr)
+            if len(wav) > 0:
+                embedding = encoder.embed_utterance(wav)
+                embeddings.append(embedding)
+                timestamps.append((current_time, current_time + SEGMENT_DURATION))
+        except:
+            pass
+
+    current_time += HOP_DURATION
+
+# Handle edge cases
+if len(embeddings) == 0:
+    print(json.dumps({"speakers": [], "num_speakers": 0}))
+    sys.exit(0)
+
+embeddings = np.array(embeddings)
+
+# Determine number of speakers
+if NUM_SPEAKERS is None or NUM_SPEAKERS <= 0:
+    # Auto-detect using silhouette score
+    from sklearn.metrics import silhouette_score
+    best_n = 2
+    best_score = -1
+    for n in range(2, min(6, len(embeddings))):
+        try:
+            clustering = AgglomerativeClustering(n_clusters=n)
+            labels = clustering.fit_predict(embeddings)
+            score = silhouette_score(embeddings, labels)
+            if score > best_score:
+                best_score = score
+                best_n = n
+        except:
+            pass
+    num_speakers = best_n
+else:
+    num_speakers = NUM_SPEAKERS
+
+# Cluster embeddings
+try:
+    if len(embeddings) >= num_speakers:
+        clustering = AgglomerativeClustering(n_clusters=num_speakers)
+        labels = clustering.fit_predict(embeddings)
+    else:
+        labels = list(range(len(embeddings)))
+        num_speakers = len(embeddings)
+except Exception as e:
+    labels = [0] * len(embeddings)
+    num_speakers = 1
+
+# Build speaker segments with merging of consecutive same-speaker segments
+speaker_segments = []
+prev_speaker = None
+prev_start = None
+prev_end = None
+
+for i, (start, end) in enumerate(timestamps):
+    speaker = f"Speaker {labels[i] + 1}"
+
+    if speaker == prev_speaker and prev_end is not None:
+        # Extend previous segment if same speaker and close in time
+        if start - prev_end < 0.5:
+            prev_end = end
+            continue
+
+    # Save previous segment
+    if prev_speaker is not None:
+        speaker_segments.append({
+            "speaker": prev_speaker,
+            "start": prev_start,
+            "end": prev_end
+        })
+
+    prev_speaker = speaker
+    prev_start = start
+    prev_end = end
+
+# Don't forget the last segment
+if prev_speaker is not None:
+    speaker_segments.append({
+        "speaker": prev_speaker,
+        "start": prev_start,
+        "end": prev_end
+    })
+
+print(json.dumps({
+    "speakers": speaker_segments,
+    "num_speakers": num_speakers
+}))
+`, audioPathLiteral, numSpeakersStr)
+
+	return pythonCode
+}
--- a/internal/whisper/client.go
+++ b/internal/whisper/client.go
@@ -0,0 +1,162 @@
+package whisper
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+)
+
+// ModelSize names a Whisper model. Its string value is the name handed to
+// whisper.load_model in the generated Python script.
+type ModelSize string
+
+// Model names accepted by the CLI's --model flag.
+const (
+	ModelTiny   ModelSize = "tiny"
+	ModelBase   ModelSize = "base"
+	ModelSmall  ModelSize = "small"
+	ModelMedium ModelSize = "medium"
+	ModelLarge  ModelSize = "large"
+	ModelTurbo  ModelSize = "turbo"
+)
+
+// TranscriptionResult is the decoded JSON document printed by the Whisper
+// script: the full text, the timestamped segments, the language Whisper
+// reported, and a duration value.
+type TranscriptionResult struct {
+	Text     string    `json:"text"`
+	Segments []Segment `json:"segments"`
+	Language string    `json:"language"`
+	Duration float64   `json:"duration"`
+}
+
+// Segment is one timestamped piece of the transcript. Words is omitted from
+// JSON when empty. Speaker is empty until diarization.AlignSpeakers fills it
+// in, and is likewise omitted from JSON while empty.
+type Segment struct {
+	Start   float64 `json:"start"`
+	End     float64 `json:"end"`
+	Text    string  `json:"text"`
+	Words   []Word  `json:"words,omitempty"`
+	Speaker string  `json:"speaker,omitempty"`
+}
+
+// Word is a single word of a Segment together with its own start and end
+// timestamps, decoded from the "words" array of a segment.
+type Word struct {
+	Start float64 `json:"start"`
+	End   float64 `json:"end"`
+	Word  string  `json:"word"`
+}
+
+// Client transcribes audio by running Whisper in a Python subprocess.
+// ModelSize selects the model to load; ModelPath is not set by NewClient.
+type Client struct {
+	ModelPath string
+	ModelSize ModelSize
+}
+
+// NewClient returns a Whisper client that will load the given model size.
+func NewClient(modelSize ModelSize) *Client {
+	client := new(Client)
+	client.ModelSize = modelSize
+	return client
+}
+
+// Transcribe runs Whisper on audioPath through python3 and decodes the JSON
+// document the script prints on stdout.
+//
+// A nil options is replaced by DefaultTranscriptionOptions, matching how the
+// diarization client treats nil options. On failure the result is nil and
+// the error wraps the underlying exec or JSON error, so errors.Is and
+// errors.As keep working for callers.
+func (c *Client) Transcribe(audioPath string, options *TranscriptionOptions) (*TranscriptionResult, error) {
+	if options == nil {
+		options = DefaultTranscriptionOptions()
+	}
+
+	// Build the Python command
+	cmd := exec.Command("python3", "-c", c.buildPythonCommand(audioPath, options))
+
+	// Capture stdout (the JSON result) and stderr (diagnostics) separately
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return nil, fmt.Errorf("transcription failed: %w, stderr: %s", err, stderr.String())
+	}
+
+	// Parse the JSON output
+	var result TranscriptionResult
+	if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
+		return nil, fmt.Errorf("failed to parse transcription output: %w", err)
+	}
+
+	return &result, nil
+}
+
+// buildPythonCommand returns the Python source passed to "python3 -c" for
+// transcribing audioPath with the client's model.
+//
+// The script loads the model, transcribes with stdout silenced, then prints
+// one JSON document with the keys text, language, duration and segments.
+//
+// The model name, audio path and language are embedded as JSON-encoded
+// string literals rather than pasted between quotes, so quotes, backslashes
+// or newlines in them can neither break the script nor inject code. Bytes
+// that are not valid UTF-8 are replaced by the encoder.
+func (c *Client) buildPythonCommand(audioPath string, options *TranscriptionOptions) string {
+	// pyString renders s as a double-quoted literal. The escapes that
+	// encoding/json produces for a string are also valid in Python.
+	pyString := func(s string) string {
+		// Marshalling a plain string cannot fail, so the error is safely ignored.
+		quoted, _ := json.Marshal(s)
+		return string(quoted)
+	}
+
+	// Convert Go bool to Python bool string
+	verboseStr := "False"
+	if options.Verbose {
+		verboseStr = "True"
+	}
+
+	// An empty or "auto" language becomes None so Whisper detects it
+	langStr := "None"
+	if options.Language != "" && options.Language != "auto" {
+		langStr = pyString(options.Language)
+	}
+
+	pythonCode := fmt.Sprintf(`
+import whisper
+import json
+import sys
+import os
+import warnings
+
+# Suppress warnings and stdout during transcription
+warnings.filterwarnings("ignore")
+old_stdout = sys.stdout
+sys.stdout = open(os.devnull, 'w')
+
+# Load model
+model = whisper.load_model(%s)
+
+# Transcribe
+result = model.transcribe(%s,
+    language=%s,
+    verbose=%s,
+    temperature=%.1f,
+    best_of=%d)
+
+# Restore stdout for JSON output
+sys.stdout = old_stdout
+
+segments = result.get("segments", [])
+
+# Whisper's result carries no "duration" key, so fall back to the end of the
+# last segment (the end of transcribed speech) instead of always reporting 0.
+duration = result.get("duration") or (segments[-1]["end"] if segments else 0.0)
+
+# Output as JSON
+print(json.dumps({
+    "text": result["text"],
+    "language": result.get("language", ""),
+    "duration": duration,
+    "segments": [{
+        "start": seg["start"],
+        "end": seg["end"],
+        "text": seg["text"],
+        "words": seg.get("words", [])
+    } for seg in segments]
+}))
+`, pyString(string(c.ModelSize)), pyString(audioPath), langStr, verboseStr, options.Temperature, options.BestOf)
+
+	return pythonCode
+}
+
+// TranscriptionOptions holds the per-call settings forwarded to Whisper's
+// transcribe function.
+type TranscriptionOptions struct {
+	Language    string  // Language code or "auto"
+	Verbose     bool    // Show progress bar
+	Temperature float64 // Temperature for sampling (higher = more creative)
+	BestOf      int     // Number of candidates when sampling with temperature > 0
+}
+
+// DefaultTranscriptionOptions returns a fresh options value: automatic
+// language detection, no verbose output, temperature 0 and five candidates.
+func DefaultTranscriptionOptions() *TranscriptionOptions {
+	// Verbose and Temperature keep their zero values (false and 0.0).
+	opts := new(TranscriptionOptions)
+	opts.Language = "auto"
+	opts.BestOf = 5
+	return opts
+}
--- a/main.go
+++ b/main.go
@@ -0,0 +1,9 @@
+package main
+
+import (
+	"transcribe/cmd"
+)
+
+// main is the program entry point; all CLI behaviour lives in cmd.Execute.
+func main() {
+	cmd.Execute()
+}
--- a/pkg/audio/audio.go
+++ b/pkg/audio/audio.go
@@ -0,0 +1,56 @@
+package audio
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// SupportedAudioFormats is a list of accepted audio file extensions. Each
+// entry is lower-case and includes the leading dot.
+type SupportedAudioFormats []string
+
+// DefaultSupportedFormats is the extension list IsSupported checks against.
+var DefaultSupportedFormats = SupportedAudioFormats{
+	".mp3",
+	".wav",
+	".flac",
+	".m4a",
+	".ogg",
+	".opus",
+}
+
+// AudioFile describes an audio file accepted by NewAudioFile: the path it
+// was given, the file extension (including the dot, in its original case)
+// and the file size in bytes as reported by os.Stat.
+type AudioFile struct {
+	Path   string
+	Format string
+	Size   int64
+}
+
+// NewAudioFile validates path and describes it as an AudioFile.
+//
+// It returns the os.Stat error when the path cannot be inspected, and a
+// descriptive error when the path is a directory or its extension is not
+// one of the supported formats. The returned AudioFile is nil on error.
+func NewAudioFile(path string) (*AudioFile, error) {
+	fileInfo, err := os.Stat(path)
+	if err != nil {
+		return nil, err
+	}
+
+	// A directory can carry an audio-looking name (e.g. "album.mp3/"), so
+	// the extension check alone would let it through.
+	if fileInfo.IsDir() {
+		return nil, errors.New("path is a directory, not an audio file: " + path)
+	}
+
+	ext := filepath.Ext(path)
+	if !IsSupported(ext) {
+		return nil, errors.New("unsupported audio format: " + ext)
+	}
+
+	return &AudioFile{
+		Path:   path,
+		Format: ext,
+		Size:   fileInfo.Size(),
+	}, nil
+}
+
+// IsSupported reports whether ext (a file extension including its leading
+// dot) appears in DefaultSupportedFormats, ignoring letter case.
+func IsSupported(ext string) bool {
+	wanted := strings.ToLower(ext)
+	for i := 0; i < len(DefaultSupportedFormats); i++ {
+		if DefaultSupportedFormats[i] == wanted {
+			return true
+		}
+	}
+	return false
+}
--- a/pkg/output/formatter.go
+++ b/pkg/output/formatter.go
@@ -0,0 +1,33 @@
+package output
+
+import (
+	"transcribe/internal/whisper"
+)
+
+// Formatter converts a transcription result into one textual output format.
+// Use NewFormatter to obtain the implementation for a given FormatType.
+type Formatter interface {
+	// Format renders result as a string, or returns an error if the
+	// result cannot be rendered.
+	Format(result *whisper.TranscriptionResult) (string, error)
+}
+
+// FormatType names an output format understood by NewFormatter.
+type FormatType string
+
+// Output formats. NewFormatter falls back to the text formatter for any
+// value not listed here.
+const (
+	FormatText FormatType = "text"
+	FormatSRT  FormatType = "srt"
+	FormatJSON FormatType = "json"
+)
+
+// NewFormatter returns the Formatter that produces the requested format.
+// FormatText, and any value that is not a known FormatType, yield the plain
+// text formatter.
+func NewFormatter(format FormatType) Formatter {
+	if format == FormatSRT {
+		return &SRTFormatter{}
+	}
+	if format == FormatJSON {
+		return &JSONFormatter{}
+	}
+	// Everything else, FormatText included, is rendered as plain text.
+	return &TextFormatter{}
+}
--- a/pkg/output/json.go
+++ b/pkg/output/json.go
@@ -0,0 +1,19 @@
+package output
+
+import (
+	"encoding/json"
+
+	"transcribe/internal/whisper"
+)
+
+// JSONFormatter renders transcription results as indented JSON.
+type JSONFormatter struct{}
+
+// Format marshals result with encoding/json, using no line prefix and a
+// two-space indent. It returns the marshalling error, if any, unchanged.
+func (f *JSONFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
+	const (
+		linePrefix = ""
+		indentUnit = "  "
+	)
+
+	encoded, err := json.MarshalIndent(result, linePrefix, indentUnit)
+	if err != nil {
+		return "", err
+	}
+	return string(encoded), nil
+}
--- a/pkg/output/srt.go
+++ b/pkg/output/srt.go
@@ -0,0 +1,49 @@
+package output
+
+import (
+	"fmt"
+	"strings"
+
+	"transcribe/internal/whisper"
+)
+
+// SRTFormatter renders transcription results as SRT subtitles.
+type SRTFormatter struct{}
+
+// Format emits one SRT cue per segment: a 1-based index line, a
+// "start --> end" timestamp line, and the trimmed segment text, prefixed
+// with "[speaker] " when the segment carries a speaker label. Cues are
+// separated by a blank line and the output ends with a single newline.
+// The returned error is always nil.
+func (f *SRTFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
+	var out strings.Builder
+
+	for idx, segment := range result.Segments {
+		line := strings.TrimSpace(segment.Text)
+		if segment.Speaker != "" {
+			line = fmt.Sprintf("[%s] %s", segment.Speaker, line)
+		}
+
+		// Index, timestamps (HH:MM:SS,mmm --> HH:MM:SS,mmm), text, then the
+		// blank line that terminates the cue.
+		fmt.Fprintf(&out, "%d\n%s --> %s\n%s\n\n",
+			idx+1,
+			formatSRTTimestamp(segment.Start),
+			formatSRTTimestamp(segment.End),
+			line,
+		)
+	}
+
+	// Drop one of the two trailing newlines left by the final cue.
+	return strings.TrimSuffix(out.String(), "\n"), nil
+}
+
+// formatSRTTimestamp converts an offset in seconds to the SRT timestamp
+// format HH:MM:SS,mmm.
+//
+// The offset is rounded to the nearest millisecond rather than truncated.
+// Most decimal fractions have no exact float64 representation, so a value
+// such as 1.001 multiplies out to 1000.9999999999999 and truncation would
+// print ",000" instead of ",001". Negative offsets are clamped to zero,
+// since they would otherwise produce malformed fields like "00:00:-1,-500".
+func formatSRTTimestamp(seconds float64) string {
+	if seconds < 0 {
+		seconds = 0
+	}
+
+	// Adding 0.5 before the truncating conversion rounds half up, which is
+	// correct here because seconds is known to be non-negative.
+	totalMs := int64(seconds*1000 + 0.5)
+	ms := totalMs % 1000
+	totalSeconds := totalMs / 1000
+	s := totalSeconds % 60
+	totalMinutes := totalSeconds / 60
+	m := totalMinutes % 60
+	h := totalMinutes / 60
+
+	return fmt.Sprintf("%02d:%02d:%02d,%03d", h, m, s, ms)
+}
--- a/pkg/output/text.go
+++ b/pkg/output/text.go
@@ -0,0 +1,41 @@
+package output
+
+import (
+	"fmt"
+	"strings"
+
+	"transcribe/internal/whisper"
+)
+
+// TextFormatter renders transcription results as plain text, one line per
+// segment, each prefixed with its time range.
+type TextFormatter struct{}
+
+// Format produces one "[start - end] text" line per segment, inserting
+// "[speaker] " before the text when the segment carries a speaker label.
+// Lines are separated by a newline with none after the last line. The
+// returned error is always nil.
+func (f *TextFormatter) Format(result *whisper.TranscriptionResult) (string, error) {
+	lines := make([]string, 0, len(result.Segments))
+
+	for _, segment := range result.Segments {
+		from := formatTextTimestamp(segment.Start)
+		to := formatTextTimestamp(segment.End)
+		body := strings.TrimSpace(segment.Text)
+
+		var line string
+		switch segment.Speaker {
+		case "":
+			line = fmt.Sprintf("[%s - %s] %s", from, to, body)
+		default:
+			line = fmt.Sprintf("[%s - %s] [%s] %s", from, to, segment.Speaker, body)
+		}
+		lines = append(lines, line)
+	}
+
+	return strings.Join(lines, "\n"), nil
+}
+
+// formatTextTimestamp converts an offset in seconds to MM:SS.s, where the
+// final digit is tenths of a second.
+//
+// The offset is rounded to the nearest tenth and the minute, second and
+// tenth fields are all derived from that single integer. Recovering the
+// fraction by subtraction and truncating it loses a tenth to float64 error
+// (2.3 would print as "00:02.2"). Negative offsets are clamped to zero,
+// since they would otherwise produce malformed fields like "00:-1.-5".
+func formatTextTimestamp(seconds float64) string {
+	if seconds < 0 {
+		seconds = 0
+	}
+
+	// Adding 0.5 before the truncating conversion rounds half up, which is
+	// correct here because seconds is known to be non-negative.
+	totalTenths := int(seconds*10 + 0.5)
+	tenths := totalTenths % 10
+	totalSeconds := totalTenths / 10
+	m := totalSeconds / 60
+	s := totalSeconds % 60
+
+	return fmt.Sprintf("%02d:%02d.%d", m, s, tenths)
+}
--- a/pkg/progress/spinner.go
+++ b/pkg/progress/spinner.go
@@ -0,0 +1,84 @@
+package progress
+
+import (
+	"fmt"
+	"sync"
+	"time"
+)
+
+// Spinner displays an animated spinner followed by a message on standard
+// output.
+//
+// Create a Spinner with NewSpinner and use it through the returned pointer;
+// it contains a mutex and must not be copied. A Spinner that has been
+// stopped can be started again.
+type Spinner struct {
+	// message is the text printed after the frame. Guarded by mu.
+	message string
+
+	// frames are the glyphs cycled through, one per tick.
+	frames []string
+
+	// interval is the delay between frames.
+	interval time.Duration
+
+	// stop is closed by Stop to ask the current animation goroutine to
+	// exit. Start installs a fresh channel for every run. Guarded by mu.
+	stop chan struct{}
+
+	// done is closed by the animation goroutine once it has cleared the
+	// line and is about to exit. Start installs a fresh channel for every
+	// run. Guarded by mu.
+	done chan struct{}
+
+	// mu guards message, stop, done and running.
+	mu sync.Mutex
+
+	// running reports whether an animation goroutine is active.
+	running bool
+}
+
+// NewSpinner creates a stopped spinner that will show message next to a
+// braille animation advancing every 80ms.
+func NewSpinner(message string) *Spinner {
+	return &Spinner{
+		message:  message,
+		frames:   []string{"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"},
+		interval: 80 * time.Millisecond,
+	}
+}
+
+// Start begins the spinner animation in a background goroutine. Calling
+// Start on a spinner that is already running does nothing.
+func (s *Spinner) Start() {
+	s.mu.Lock()
+	if s.running {
+		s.mu.Unlock()
+		return
+	}
+	s.running = true
+	// Each run gets its own pair of channels. A channel can only be closed
+	// once, so reusing the pair from a previous run would panic on restart.
+	stop := make(chan struct{})
+	done := make(chan struct{})
+	s.stop = stop
+	s.done = done
+	s.mu.Unlock()
+
+	go func() {
+		// Runs last, after the line has been cleared, to release Stop.
+		defer close(done)
+
+		ticker := time.NewTicker(s.interval)
+		defer ticker.Stop()
+
+		for i := 0; ; i++ {
+			// UpdateMessage may change the message concurrently.
+			s.mu.Lock()
+			msg := s.message
+			s.mu.Unlock()
+
+			fmt.Printf("\r%s %s", s.frames[i%len(s.frames)], msg)
+
+			// Waiting on both channels lets Stop take effect immediately
+			// instead of after the remainder of a sleep.
+			select {
+			case <-stop:
+				// Return to column 0 and erase to the end of the line.
+				fmt.Print("\r\033[K")
+				return
+			case <-ticker.C:
+			}
+		}
+	}()
+}
+
+// Stop stops the spinner, waits for the animation goroutine to clear the
+// line and exit, and then returns. Calling Stop on a spinner that is not
+// running does nothing.
+func (s *Spinner) Stop() {
+	s.mu.Lock()
+	if !s.running {
+		s.mu.Unlock()
+		return
+	}
+	s.running = false
+	// Capture this run's channels while holding the lock so that a later
+	// Start cannot swap them out from under us.
+	stop, done := s.stop, s.done
+	s.mu.Unlock()
+
+	close(stop)
+	<-done
+}
+
+// StopWithMessage stops the spinner and then prints message on its own line.
+func (s *Spinner) StopWithMessage(message string) {
+	s.Stop()
+	fmt.Println(message)
+}
+
+// UpdateMessage replaces the text shown next to the spinner. It may be
+// called while the spinner is running; the new text appears on the next
+// frame.
+func (s *Spinner) UpdateMessage(message string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.message = message
+}