From bed25c51c760a02e5e393f81fdc70b9a52669483 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 14 Dec 2025 21:33:26 -0600 Subject: [PATCH] initial release - wyoming protocol server for chatterbox tts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit features: - voice cloning with 10-30s audio sample - gpu-accelerated inference - volume boost option - pip installable 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .gitignore | 24 +++++++ LICENSE | 21 ++++++ README.md | 98 +++++++++++++++++++++++++++ pyproject.toml | 41 ++++++++++++ wyoming_chatterbox/__init__.py | 3 + wyoming_chatterbox/__main__.py | 94 ++++++++++++++++++++++++++ wyoming_chatterbox/handler.py | 119 +++++++++++++++++++++++++++++++++ 7 files changed, 400 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 wyoming_chatterbox/__init__.py create mode 100644 wyoming_chatterbox/__main__.py create mode 100644 wyoming_chatterbox/handler.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..710b3cb --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +dist/ +*.egg-info/ +.eggs/ +venv/ +ENV/ + +# audio files +*.wav +*.mp3 + +# ide +.vscode/ +.idea/ + +# os +.DS_Store +Thumbs.db diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0d7dcd7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 sudoxnym + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do 
so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..b51558c --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +# wyoming-chatterbox + +[wyoming protocol](https://github.com/rhasspy/wyoming) server for [chatterbox tts](https://github.com/resemble-ai/chatterbox) with voice cloning. + +clone any voice with a 10-30 second audio sample. + +## requirements + +- nvidia gpu with 4gb+ vram +- cuda 12.x +- python 3.10+ + +## install + +```bash +pip install wyoming-chatterbox +``` + +or from source: +```bash +git clone https://github.com/sudoxnym/wyoming-chatterbox +cd wyoming-chatterbox +pip install . 
+``` + +## usage + +```bash +wyoming-chatterbox --uri tcp://0.0.0.0:10201 --voice-ref /path/to/voice_sample.wav +``` + +### options + +| option | default | description | +|--------|---------|-------------| +| `--uri` | required | server uri (e.g., `tcp://0.0.0.0:10201`) | +| `--voice-ref` | required | path to voice reference wav (10-30s of speech) | +| `--volume-boost` | 3.0 | output volume multiplier | +| `--device` | cuda | torch device (`cuda` or `cpu`) | +| `--debug` | false | enable debug logging | + +## voice reference tips + +for best results: +- 10-30 seconds of clean speech +- no background music or noise +- consistent speaking style +- wav format (any sample rate) + +## systemd service + +```bash +sudo tee /etc/systemd/system/wyoming-chatterbox.service << 'EOF' +[Unit] +Description=Wyoming Chatterbox TTS +After=network-online.target + +[Service] +Type=simple +User=YOUR_USER +Environment=PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +ExecStart=/path/to/venv/bin/wyoming-chatterbox \ + --uri tcp://0.0.0.0:10201 \ + --voice-ref /path/to/voice_reference.wav \ + --volume-boost 3.0 +Restart=always +RestartSec=5 + +[Install] +WantedBy=default.target +EOF + +sudo systemctl daemon-reload +sudo systemctl enable --now wyoming-chatterbox +``` + +## home assistant + +1. settings → devices & services → add integration +2. search "wyoming protocol" +3. host: `YOUR_IP`, port: `10201` +4. use in your voice assistant pipeline as tts + +## gpu memory + +chatterbox uses ~3.5gb vram. 
if you get oom errors:
+
+```bash
+# check gpu usage
+nvidia-smi
+
+# kill zombie processes
+pkill -f wyoming-chatterbox
+```
+
+## license
+
+mit
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b7be5b8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,41 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "wyoming-chatterbox"
+version = "1.0.0"
+description = "Wyoming protocol server for Chatterbox TTS with voice cloning"
+readme = "README.md"
+license = {text = "MIT"}
+authors = [
+    {name = "sudoxnym"}
+]
+keywords = ["wyoming", "tts", "voice-cloning", "chatterbox", "home-assistant"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+]
+requires-python = ">=3.10"
+dependencies = [
+    "wyoming>=1.5.0",
+    "chatterbox-tts",
+    "torch",
+    "torchaudio",
+]
+
+[project.urls]
+Homepage = "https://github.com/sudoxnym/wyoming-chatterbox"
+Repository = "https://github.com/sudoxnym/wyoming-chatterbox"
+
+[project.scripts]
+wyoming-chatterbox = "wyoming_chatterbox.__main__:main"
+
+[tool.setuptools.packages.find]
+where = ["."]
diff --git a/wyoming_chatterbox/__init__.py b/wyoming_chatterbox/__init__.py
new file mode 100644
index 0000000..d3e160b
--- /dev/null
+++ b/wyoming_chatterbox/__init__.py
@@ -0,0 +1,3 @@
+"""Wyoming server for Chatterbox TTS with voice cloning."""
+
+__version__ = "1.0.0"
diff --git a/wyoming_chatterbox/__main__.py b/wyoming_chatterbox/__main__.py
new file mode 100644
index 0000000..9082536
--- /dev/null
+++ b/wyoming_chatterbox/__main__.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""Wyoming server for Chatterbox TTS."""
+
+import argparse
+import asyncio
+import logging
+import sys
+from functools import partial
+from pathlib import Path
+
+from wyoming.server import AsyncServer
+
+from . import __version__
+from .handler import ChatterboxEventHandler
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def main() -> int:
+    """Parse the command line and run the server until it stops.
+
+    Returns:
+        Process exit status: 1 if the voice reference is not an existing
+        file, otherwise 0 once the server has stopped (including Ctrl+C).
+    """
+    parser = argparse.ArgumentParser(description="Wyoming Chatterbox TTS server")
+    parser.add_argument(
+        "--uri",
+        required=True,
+        help="Server URI (e.g., tcp://0.0.0.0:10201)",
+    )
+    parser.add_argument(
+        "--voice-ref",
+        required=True,
+        help="Path to voice reference WAV file (10-30s of speech)",
+    )
+    parser.add_argument(
+        "--volume-boost",
+        type=float,
+        default=3.0,
+        help="Output volume multiplier (default: 3.0)",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda",
+        help="Torch device: cuda or cpu (default: cuda)",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.debug else logging.INFO,
+        format="%(levelname)s:%(name)s:%(message)s",
+    )
+
+    # Fail fast, before the model is loaded. is_file() also rejects a
+    # directory, which exists() would have let through to the warm-up.
+    voice_ref = Path(args.voice_ref)
+    if not voice_ref.is_file():
+        _LOGGER.error("Voice reference file not found: %s", voice_ref)
+        return 1
+
+    try:
+        asyncio.run(run_server(args, str(voice_ref)))
+    except KeyboardInterrupt:
+        # Ctrl+C is the expected way to stop a foreground server; exit
+        # quietly instead of printing a traceback.
+        _LOGGER.info("Interrupted, shutting down")
+    return 0
+
+
+async def run_server(args: argparse.Namespace, voice_ref: str) -> None:
+    """Load the model, warm it up, then serve Wyoming clients.
+
+    Args:
+        args: Parsed command-line options; ``device``, ``uri`` and
+            ``volume_boost`` are read here.
+        voice_ref: Path of the voice reference audio file.
+    """
+    _LOGGER.info("Loading Chatterbox model on %s...", args.device)
+
+    # Deferred import: chatterbox is only imported once the arguments have
+    # been validated.
+    from chatterbox.tts import ChatterboxTTS
+
+    model = ChatterboxTTS.from_pretrained(device=args.device)
+
+    # One throwaway synthesis before accepting clients; the audio is unused.
+    _LOGGER.info("Warming up with voice: %s", voice_ref)
+    model.generate("Ready.", audio_prompt_path=voice_ref)
+
+    # Announce the model's own output rate when it exposes one (chatterbox's
+    # usage example saves audio with model.sr); otherwise keep 24000.
+    sample_rate = getattr(model, "sr", 24000)
+
+    _LOGGER.info("Starting server at %s (volume boost: %.1fx)", args.uri, args.volume_boost)
+
+    server = AsyncServer.from_uri(args.uri)
+    await server.run(
+        partial(
+            ChatterboxEventHandler,
+            model=model,
+            voice_ref=voice_ref,
+            sample_rate=sample_rate,
+            volume_boost=args.volume_boost,
+        )
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/wyoming_chatterbox/handler.py b/wyoming_chatterbox/handler.py
new file mode 100644
index 0000000..66ef2b2
--- /dev/null
+++ b/wyoming_chatterbox/handler.py
@@ -0,0 +1,145 @@
+"""Wyoming event handler for Chatterbox TTS."""
+
+import asyncio
+import logging
+from functools import partial
+
+import torch
+
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.event import Event
+from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, Describe
+from wyoming.server import AsyncEventHandler
+from wyoming.tts import Synthesize
+
+from . import __version__
+
+_LOGGER = logging.getLogger(__name__)
+
+# Audio is always sent as 16-bit mono PCM.
+_SAMPLE_WIDTH = 2  # bytes per sample
+_CHANNELS = 1
+# Number of AudioChunk events per second of audio (100 ms each).
+_CHUNKS_PER_SECOND = 10
+
+
+class ChatterboxEventHandler(AsyncEventHandler):
+    """Event handler for Chatterbox TTS.
+
+    Replies to ``Describe`` with the TTS program info and to ``Synthesize``
+    with the generated speech as ``AudioStart``, ``AudioChunk`` and
+    ``AudioStop`` events. Any other event is ignored.
+    """
+
+    def __init__(
+        self,
+        reader,
+        writer,
+        model,
+        voice_ref: str,
+        sample_rate: int = 24000,
+        volume_boost: float = 3.0,
+    ):
+        """Store the model and the synthesis settings.
+
+        Args:
+            reader: Forwarded to ``AsyncEventHandler``.
+            writer: Forwarded to ``AsyncEventHandler``.
+            model: Object whose ``generate(text, audio_prompt_path=...)``
+                returns the waveform as a torch tensor.
+            voice_ref: Path passed to the model as ``audio_prompt_path``.
+            sample_rate: Rate in Hz announced for the outgoing audio.
+            volume_boost: Gain applied to the samples before clamping.
+        """
+        super().__init__(reader, writer)
+        self.model = model
+        self.voice_ref = voice_ref
+        self.sample_rate = sample_rate
+        self.volume_boost = volume_boost
+
+    async def handle_event(self, event: Event) -> bool:
+        """Handle Wyoming protocol events.
+
+        Returns:
+            Always True, for handled and ignored events alike.
+        """
+        if Describe.is_type(event.type):
+            await self.write_event(self._build_info().event())
+        elif Synthesize.is_type(event.type):
+            await self._synthesize(Synthesize.from_event(event).text)
+        return True
+
+    def _build_info(self) -> Info:
+        """Build the ``Info`` reply describing this TTS program and voice."""
+        return Info(
+            tts=[
+                TtsProgram(
+                    name="chatterbox",
+                    description="Chatterbox TTS with voice cloning",
+                    attribution=Attribution(
+                        name="Resemble AI",
+                        url="https://github.com/resemble-ai/chatterbox",
+                    ),
+                    installed=True,
+                    # Report the package version instead of a second literal.
+                    version=__version__,
+                    voices=[
+                        TtsVoice(
+                            name="custom",
+                            description="Custom cloned voice",
+                            attribution=Attribution(name="Custom", url=""),
+                            installed=True,
+                            version="1.0.0",
+                            languages=["en"],
+                        )
+                    ],
+                )
+            ]
+        )
+
+    async def _synthesize(self, text: str) -> None:
+        """Generate speech for *text* and stream it to the client."""
+        _LOGGER.info("Synthesizing: %s", text)
+
+        # generate() is a blocking call; run it in the default executor so
+        # the event loop is not blocked while it works.
+        loop = asyncio.get_running_loop()
+        wav_tensor = await loop.run_in_executor(
+            None,
+            partial(self.model.generate, text, audio_prompt_path=self.voice_ref),
+        )
+        audio_data = self._to_pcm16(wav_tensor)
+        rate = self.sample_rate
+
+        await self.write_event(
+            AudioStart(rate=rate, width=_SAMPLE_WIDTH, channels=_CHANNELS).event()
+        )
+
+        # Size chunks in whole frames so a sample is never split across two
+        # chunks; max(1, ...) keeps the range() step non-zero for tiny rates.
+        frame_bytes = _SAMPLE_WIDTH * _CHANNELS
+        chunk_size = max(1, rate // _CHUNKS_PER_SECOND) * frame_bytes
+        for start in range(0, len(audio_data), chunk_size):
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_data[start : start + chunk_size],
+                    rate=rate,
+                    width=_SAMPLE_WIDTH,
+                    channels=_CHANNELS,
+                ).event()
+            )
+
+        await self.write_event(AudioStop().event())
+        _LOGGER.info("Synthesis complete")
+
+    def _to_pcm16(self, wav_tensor: torch.Tensor) -> bytes:
+        """Convert a float waveform tensor to boosted 16-bit PCM bytes."""
+        samples = wav_tensor.cpu().squeeze()
+        if samples.dim() == 0:
+            # A single sample squeezes to a scalar; restore one dimension.
+            samples = samples.unsqueeze(0)
+
+        # Clamp after boosting so loud passages saturate instead of wrapping
+        # around when cast to int16.
+        samples = torch.clamp(samples * self.volume_boost, -1.0, 1.0)
+        return (samples * 32767).to(torch.int16).numpy().tobytes()