Skip to content

API Reference

easytranscribe.speech_to_text.capture_and_transcribe(model_name='turbo', verbose=False)

Captures audio from the microphone and transcribes it to text using Whisper. Recording waits for the user to start speaking, then stops after 3 seconds of silence (but never sooner than 2 seconds after speech begins).

Parameters:

Name Type Description Default
model_name str

Whisper model to use for transcription

'turbo'
verbose bool

If True, shows detailed recording and processing information

False
Source code in easytranscribe/speech_to_text.py
def capture_and_transcribe(model_name: str = "turbo", verbose: bool = False) -> str:
    """
    Captures audio from microphone and transcribes to text using Whisper.
    Waits for user to start speaking, then stops after 3 seconds of silence.

    Audio is read in 0.5 second chunks at 16 kHz mono. A chunk counts as
    silence when its mean absolute level is below 0.01. Once speech has
    started, recording continues for at least 2 seconds and then stops as
    soon as 3 seconds pass without a non-silent chunk. Pressing Ctrl+C ends
    the recording early; whatever was captured so far is still transcribed.

    Args:
        model_name: Whisper model to use for transcription
        verbose: If True, shows detailed recording and processing information

    Returns:
        The transcribed text with surrounding whitespace stripped, or an
        empty string if no audio was captured.
    """
    if verbose:
        print("Speak into the microphone...")

    # Audio settings
    samplerate = 16000
    chunk_duration = 0.5  # seconds per chunk
    chunk_size = int(samplerate * chunk_duration)
    silence_threshold = 0.01  # Audio level threshold for silence detection
    silence_duration = 3.0  # seconds of silence before stopping
    min_recording_time = 2.0  # minimum recording time after speech starts

    if verbose:
        print(f"Listening... (will stop after {silence_duration} seconds of silence)")

    # Recording state
    audio_data = []
    started_speaking = False
    recording_start_time: float | None = None
    last_speech_time: float | None = None

    try:
        with sd.InputStream(
            samplerate=samplerate, channels=1, dtype=np.float32
        ) as stream:
            while True:
                # Read from the stream opened above rather than starting a
                # separate sd.rec() recording per chunk, so the input device
                # is opened only once and consecutive chunks are contiguous.
                # The overflow flag is not acted upon.
                chunk, _overflowed = stream.read(chunk_size)
                # NOTE: chunks captured before speech starts are kept too, so
                # leading silence is part of the audio that gets transcribed.
                audio_data.append(chunk.flatten())

                # Calculate audio level
                audio_level = np.abs(chunk).mean()
                is_silent = audio_level < silence_threshold
                current_time = time.time()

                # Check if user started speaking
                if not started_speaking:
                    if not is_silent:
                        started_speaking = True
                        recording_start_time = current_time
                        last_speech_time = current_time
                        if verbose:
                            print("Started speaking... Recording now.")
                    continue

                # Update last speech time if not silent
                if not is_silent:
                    last_speech_time = current_time

                # Never stop before the minimum recording time has elapsed
                if (
                    recording_start_time is not None
                    and current_time - recording_start_time < min_recording_time
                ):
                    continue

                # Stop once the trailing silence is long enough
                if (
                    last_speech_time is not None
                    and (current_time - last_speech_time) >= silence_duration
                ):
                    silence_time = current_time - last_speech_time
                    if verbose:
                        print(
                            f"Detected {silence_time:.1f} seconds of silence. Stopping recording."
                        )
                    break

                # Debug output every few seconds
                if recording_start_time is not None and verbose:
                    recording_time = current_time - recording_start_time
                    # Fires on the first chunk that lands in a whole second
                    # divisible by 3.
                    if (
                        int(recording_time) % 3 == 0
                        and recording_time - int(recording_time) < chunk_duration
                    ):
                        silence_time = (
                            current_time - last_speech_time if last_speech_time else 0
                        )
                        print(
                            f"Recording... {recording_time:.1f}s (silence: {silence_time:.1f}s, level: {audio_level:.4f})"
                        )

    except KeyboardInterrupt:
        # Ctrl+C is a normal way to end the recording, not an error.
        if verbose:
            print("\nRecording interrupted by user.")

    if not audio_data:
        if verbose:
            print("No audio recorded.")
        return ""

    # Calculate audio duration
    audio = np.concatenate(audio_data)
    audio_duration = len(audio) / samplerate
    if verbose:
        print(f"Recording complete. Recorded {audio_duration:.1f} seconds of audio.")
        print("Transcribing...")

    # Transcribe with Whisper and measure processing time (includes model load)
    transcription_start_time = time.time()
    model = whisper.load_model(model_name)
    result = model.transcribe(audio, fp16=False)
    transcribed_text = result["text"].strip()
    processing_time = time.time() - transcription_start_time

    if verbose:
        print("Transcribed text:", transcribed_text)

    # Log the transcription; a logging failure must not lose the result
    try:
        log_transcription(
            model_name=model_name,
            transcribed_text=transcribed_text,
            audio_duration=audio_duration,
            processing_time=processing_time,
        )
        if verbose:
            print(
                f"Transcription logged successfully (Duration: {audio_duration:.1f}s, Processing: {processing_time:.1f}s)"
            )
    except Exception as e:
        if verbose:
            print(f"Warning: Failed to log transcription: {e}")

    return transcribed_text

easytranscribe.speech_to_text.transcribe_audio_file(filepath, model_name='turbo', verbose=False)

Transcribes an audio file to text using Whisper.

Parameters:

Name Type Description Default
filepath str

Path to the audio file

required
model_name str

Whisper model to use for transcription

'turbo'
verbose bool

If True, shows detailed processing information

False

Returns:

Type Description
str

Transcribed text from the audio file

Raises:

Type Description
FileNotFoundError

If the audio file doesn't exist

Exception

If transcription fails

Source code in easytranscribe/speech_to_text.py
def transcribe_audio_file(
    filepath: str, model_name: str = "turbo", verbose: bool = False
) -> str:
    """
    Run Whisper on an audio file and return the recognised text.

    Args:
        filepath: Location of the audio file on disk
        model_name: Name of the Whisper model to load
        verbose: When True, prints progress, the result and the logging status

    Returns:
        The transcription with surrounding whitespace stripped

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``
        Exception: Whatever model loading or transcription raised (re-raised
            unchanged)
    """
    import os

    file_is_missing = not os.path.exists(filepath)
    if file_is_missing:
        raise FileNotFoundError(f"Audio file not found: {filepath}")

    if verbose:
        print(f"Transcribing file: {filepath}")
    started_at = time.time()

    try:
        whisper_model = whisper.load_model(model_name)
        outcome = whisper_model.transcribe(filepath, fp16=False)
        text = outcome["text"].strip()
        # Elapsed time covers both model loading and transcription.
        elapsed = time.time() - started_at

        if verbose:
            print("Transcribed text:", text)

        # Logging is best-effort: a failure here is reported (when verbose)
        # but never prevents the text from being returned.
        try:
            log_transcription(
                model_name=model_name,
                transcribed_text=text,
                audio_duration=None,
                processing_time=elapsed,
                audio_file=filepath,
            )
            if verbose:
                print(
                    f"Transcription logged successfully (File: {filepath}, Processing: {elapsed:.1f}s)"
                )
        except Exception as log_failure:
            if verbose:
                print(f"Warning: Failed to log transcription: {log_failure}")

        return text

    except Exception as failure:
        # Report the problem when verbose, then let the caller handle it.
        if verbose:
            print(f"Error during transcription: {failure}")
        raise

easytranscribe.view_logs.view_logs(date=None, tail=None, stats=False)

View transcription logs with various options.

Parameters:

Name Type Description Default
date Optional[str]

Date in YYYY-MM-DD format, or 'today' for today's logs. When omitted (None), today's logs are used.

None
tail Optional[int]

Show last N entries

None
stats bool

Show statistics summary

False

Returns:

Type Description
Dict[str, Any]

Dictionary containing log data and statistics

Source code in easytranscribe/view_logs.py
def view_logs(
    date: Optional[str] = None, tail: Optional[int] = None, stats: bool = False
) -> Dict[str, Any]:
    """
    Read transcription log entries and optionally summarise them.

    Args:
        date: Date in YYYY-MM-DD format, or 'today' for today's logs.
            None also selects today's logs; an empty string selects every
            log file.
        tail: Keep only the last N entries (ignored unless positive)
        stats: Add a statistics summary computed over the kept entries

    Returns:
        On success, a dictionary with "entries" (list of entry strings) and
        "total_count", plus "statistics" when ``stats`` is True. When the
        logs directory or a matching log file is missing, a dictionary with
        a single "error" message instead.
    """
    package_parent = os.path.dirname(os.path.dirname(__file__))
    logs_dir = os.path.join(package_parent, "logs")

    if not os.path.exists(logs_dir):
        return {"error": "No logs directory found"}

    # Work out which file name (or glob pattern) the request refers to
    if date is None or date == "today":
        today = datetime.now().strftime("%Y-%m-%d")
        log_pattern = f"transcription_{today}.log"
    elif not date:
        log_pattern = "transcription_*.log"
    else:
        log_pattern = f"transcription_{date}.log"

    matched_files = glob.glob(os.path.join(logs_dir, log_pattern))

    if not matched_files:
        return {"error": f"No log files found for pattern: {log_pattern}"}

    # Entries are separated by a line of 40 dashes; blank pieces are dropped
    separator = "-" * 40
    entries = []
    for path in sorted(matched_files):
        with open(path, "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        entries.extend(
            piece.strip() for piece in raw_text.split(separator) if piece.strip()
        )

    # Apply tail filter
    if tail and tail > 0:
        entries = entries[-tail:]

    result: Dict[str, Any] = {"entries": entries, "total_count": len(entries)}

    if not stats:
        return result

    def seconds_from(field_line: str) -> float:
        # "Label: 1.5s" -> 1.5 (text between the first and second colon)
        return float(field_line.split(":")[1].strip().replace("s", ""))

    model_counts: Dict[str, int] = {}
    total_duration = 0.0
    total_processing = 0.0

    for entry in entries:
        for field_line in entry.split("\n"):
            if field_line.startswith("Model:"):
                model_label = field_line.split(":", 1)[1].strip()
                model_counts[model_label] = model_counts.get(model_label, 0) + 1
            elif field_line.startswith("Audio Duration:"):
                try:
                    total_duration += seconds_from(field_line)
                except (ValueError, IndexError):
                    pass  # unparsable value: leave it out of the total
            elif field_line.startswith("Processing Time:"):
                try:
                    total_processing += seconds_from(field_line)
                except (ValueError, IndexError):
                    pass  # unparsable value: leave it out of the total

    # The average is taken over all kept entries
    average_processing = total_processing / len(entries) if entries else 0.0
    result["statistics"] = {
        "model_usage": model_counts,
        "total_audio_duration": total_duration,
        "total_processing_time": total_processing,
        "average_processing_time": average_processing,
    }

    return result

easytranscribe.view_logs.get_available_log_dates()

Get list of available log dates.

Returns:

Type Description
List[str]

List of date strings in YYYY-MM-DD format

Source code in easytranscribe/view_logs.py
def get_available_log_dates() -> List[str]:
    """
    Get list of available log dates.

    Scans the ``logs`` directory next to the package for files named
    ``transcription_YYYY-MM-DD.log``. Files whose middle part is not a real
    calendar date (for example ``transcription_backup0001.log``) are ignored.

    Returns:
        List of date strings in YYYY-MM-DD format, sorted ascending. Empty
        when the logs directory does not exist.
    """
    # Local import so the block does not depend on how the module imports
    # datetime at file level.
    from datetime import datetime

    logs_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")

    if not os.path.exists(logs_dir):
        return []

    prefix = "transcription_"
    suffix = ".log"
    dates = []

    for log_file in glob.glob(os.path.join(logs_dir, f"{prefix}*{suffix}")):
        filename = os.path.basename(log_file)
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue

        # Extract date from filename: transcription_YYYY-MM-DD.log
        date_part = filename[len(prefix):-len(suffix)]
        if len(date_part) != 10:
            continue

        # A 10-character stem is not necessarily a date; parse it to be sure.
        try:
            datetime.strptime(date_part, "%Y-%m-%d")
        except ValueError:
            continue

        dates.append(date_part)

    return sorted(dates)

CLI Module

easytranscribe.cli.main()

Main CLI entry point.

Source code in easytranscribe/cli.py
def main():
    """
    Main CLI entry point.

    Parses the command line and dispatches to one of three sub-commands:
    ``live`` (microphone transcription), ``file`` (audio file transcription)
    and ``logs`` (log viewing).

    Returns:
        Exit status: 0 on success; 1 when no command was given, a file or
        log was not found, the user interrupted the operation, or any other
        error occurred.
    """
    parser = argparse.ArgumentParser(
        description="Easy speech-to-text transcription using Whisper",
        prog="easytranscribe",
    )

    parser.add_argument(
        "--version", action="version", version=f"easytranscribe {__version__}"
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Shared by the "live" and "file" commands so the two lists cannot drift.
    model_choices = ["tiny", "base", "small", "medium", "large", "turbo"]

    # Live transcription command
    live_parser = subparsers.add_parser("live", help="Transcribe from microphone")
    live_parser.add_argument(
        "--model",
        default="base",
        choices=model_choices,
        help="Whisper model to use (default: base)",
    )
    live_parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output (shows recording status and processing info)",
    )

    # File transcription command
    file_parser = subparsers.add_parser("file", help="Transcribe from audio file")
    file_parser.add_argument("filepath", help="Path to audio file")
    file_parser.add_argument(
        "--model",
        default="base",
        choices=model_choices,
        help="Whisper model to use (default: base)",
    )
    file_parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output (shows processing info)",
    )

    # Logs command
    logs_parser = subparsers.add_parser("logs", help="View transcription logs")
    logs_parser.add_argument("--date", help="Date in YYYY-MM-DD format or 'today'")
    logs_parser.add_argument("--tail", type=int, help="Show last N entries")
    logs_parser.add_argument("--stats", action="store_true", help="Show statistics")
    logs_parser.add_argument(
        "--list-dates", action="store_true", help="List available log dates"
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    try:
        if args.command == "live":
            if args.verbose:
                print(f"🎤 Starting live transcription with {args.model} model...")
            text = capture_and_transcribe(model_name=args.model, verbose=args.verbose)
            print(f"📝 Transcribed: {text}")

        elif args.command == "file":
            # Check up front so a missing file gives a friendly message
            if not Path(args.filepath).exists():
                print(f"❌ Error: File not found: {args.filepath}")
                return 1

            if args.verbose:
                print(f"📁 Transcribing file: {args.filepath}")
            text = transcribe_audio_file(
                args.filepath, model_name=args.model, verbose=args.verbose
            )
            print(f"📝 Transcribed: {text}")

        elif args.command == "logs":
            if args.list_dates:
                dates = get_available_log_dates()
                if dates:
                    print("📅 Available log dates:")
                    for date in dates:
                        print(f"  - {date}")
                else:
                    print("📅 No log files found")
                return 0

            logs = view_logs(date=args.date, tail=args.tail, stats=args.stats)

            if "error" in logs:
                print(f"❌ {logs['error']}")
                return 1

            print(f"📋 Found {logs['total_count']} log entries")

            if args.stats and "statistics" in logs:
                stats = logs["statistics"]
                print("\n📊 Statistics:")
                print(f"  - Total audio duration: {stats['total_audio_duration']:.1f}s")
                print(
                    f"  - Total processing time: {stats['total_processing_time']:.1f}s"
                )
                print(
                    f"  - Average processing time: {stats['average_processing_time']:.1f}s"
                )
                print(f"  - Model usage: {stats['model_usage']}")

            if not args.stats:
                # With a positive --tail, view_logs() has already trimmed the
                # entries to the requested count, so show all of them.
                # Without it, fall back to the last five.
                entries = logs["entries"]
                if not (args.tail and args.tail > 0):
                    entries = entries[-5:]
                print("\nRecent entries:")
                for i, entry in enumerate(entries, 1):
                    print(f"\n--- Entry {i} ---")
                    print(entry)

    except KeyboardInterrupt:
        print("\n👋 Operation cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the error and turn it into an exit code
        print(f"❌ Error: {e}")
        return 1

    return 0

Logging Module

easytranscribe.transcription_logger.log_transcription(model_name, transcribed_text, audio_duration=None, processing_time=None, audio_file=None)

Appends transcription details to a per-day log file (`logs/transcription_YYYY-MM-DD.log`).

Parameters:

Name Type Description Default
model_name str

Name of the Whisper model used

required
transcribed_text str

The transcribed text

required
audio_duration Optional[float]

Duration of audio in seconds (for live recording)

None
processing_time Optional[float]

Time taken for processing in seconds

None
audio_file Optional[str]

Path to audio file (for file transcription)

None
Source code in easytranscribe/transcription_logger.py
def log_transcription(
    model_name: str,
    transcribed_text: str,
    audio_duration: Optional[float] = None,
    processing_time: Optional[float] = None,
    audio_file: Optional[str] = None,
) -> None:
    """
    Append one transcription record to today's log file.

    The file is ``logs/transcription_YYYY-MM-DD.log`` in the directory above
    this module; the ``logs`` directory is created if needed. Each record
    ends with a line of 40 dashes.

    Args:
        model_name: Name of the Whisper model used
        transcribed_text: The transcribed text
        audio_duration: Duration of audio in seconds (for live recording);
            the line is omitted when None
        processing_time: Time taken for processing in seconds; the line is
            omitted when None
        audio_file: Path to audio file (for file transcription); the line is
            omitted when empty or None

    Raises:
        Exception: Any error raised while writing is printed and re-raised
    """
    module_dir = os.path.dirname(__file__)
    log_dir = os.path.join(module_dir, "..", "logs")
    os.makedirs(log_dir, exist_ok=True)
    day_stamp = time.strftime('%Y-%m-%d')
    log_file = os.path.join(log_dir, f"transcription_{day_stamp}.log")

    def record_lines():
        # Lines are produced lazily, so each one is formatted just before it
        # is written.
        yield f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
        yield f"Model: {model_name}\n"
        if audio_file:
            yield f"Audio File: {audio_file}\n"
        if audio_duration is not None:
            yield f"Audio Duration: {audio_duration:.1f}s\n"
        if processing_time is not None:
            yield f"Processing Time: {processing_time:.1f}s\n"
        yield f"Transcribed Text: {transcribed_text}\n"
        yield "-" * 40 + "\n"

    try:
        with open(log_file, "a", encoding="utf-8") as handle:
            for line in record_lines():
                handle.write(line)
    except Exception as write_error:
        print(f"Error writing to log file: {write_error}")
        raise

Version Information

easytranscribe._version.__version__ = '0.1.1' module-attribute