mirror of
https://github.com/sudoxnym/wyoming-chatterbox.git
synced 2026-04-14 03:27:06 +00:00
initial release - wyoming protocol server for chatterbox tts
features: - voice cloning with 10-30s audio sample - gpu-accelerated inference - volume boost option - pip installable 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
commit
bed25c51c7
7 changed files with 400 additions and 0 deletions
24
.gitignore
vendored
Normal file
24
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
dist/
|
||||
*.egg-info/
|
||||
.eggs/
|
||||
venv/
|
||||
ENV/
|
||||
|
||||
# audio files
|
||||
*.wav
|
||||
*.mp3
|
||||
|
||||
# ide
|
||||
.vscode/
|
||||
.idea/
|
||||
|
||||
# os
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2024 sudoxnym
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
98
README.md
Normal file
98
README.md
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
# wyoming-chatterbox
|
||||
|
||||
[wyoming protocol](https://github.com/rhasspy/wyoming) server for [chatterbox tts](https://github.com/resemble-ai/chatterbox) with voice cloning.
|
||||
|
||||
clone any voice with a 10-30 second audio sample.
|
||||
|
||||
## requirements
|
||||
|
||||
- nvidia gpu with 4gb+ vram
|
||||
- cuda 12.x
|
||||
- python 3.10+
|
||||
|
||||
## install
|
||||
|
||||
```bash
|
||||
pip install wyoming-chatterbox
|
||||
```
|
||||
|
||||
or from source:
|
||||
```bash
|
||||
git clone https://github.com/sudoxnym/wyoming-chatterbox
|
||||
cd wyoming-chatterbox
|
||||
pip install .
|
||||
```
|
||||
|
||||
## usage
|
||||
|
||||
```bash
|
||||
wyoming-chatterbox --uri tcp://0.0.0.0:10201 --voice-ref /path/to/voice_sample.wav
|
||||
```
|
||||
|
||||
### options
|
||||
|
||||
| option | default | description |
|
||||
|--------|---------|-------------|
|
||||
| `--uri` | required | server uri (e.g., `tcp://0.0.0.0:10201`) |
|
||||
| `--voice-ref` | required | path to voice reference wav (10-30s of speech) |
|
||||
| `--volume-boost` | 3.0 | output volume multiplier |
|
||||
| `--device` | cuda | torch device (`cuda` or `cpu`) |
|
||||
| `--debug` | false | enable debug logging |
|
||||
|
||||
## voice reference tips
|
||||
|
||||
for best results:
|
||||
- 10-30 seconds of clean speech
|
||||
- no background music or noise
|
||||
- consistent speaking style
|
||||
- wav format (any sample rate)
|
||||
|
||||
## systemd service
|
||||
|
||||
```bash
|
||||
sudo tee /etc/systemd/system/wyoming-chatterbox.service << 'EOF'
|
||||
[Unit]
|
||||
Description=Wyoming Chatterbox TTS
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=YOUR_USER
|
||||
Environment=PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
ExecStart=/path/to/venv/bin/wyoming-chatterbox \
|
||||
--uri tcp://0.0.0.0:10201 \
|
||||
--voice-ref /path/to/voice_reference.wav \
|
||||
--volume-boost 3.0
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now wyoming-chatterbox
|
||||
```
|
||||
|
||||
## home assistant
|
||||
|
||||
1. settings → devices & services → add integration
|
||||
2. search "wyoming protocol"
|
||||
3. host: `YOUR_IP`, port: `10201`
|
||||
4. use in your voice assistant pipeline as tts
|
||||
|
||||
## gpu memory
|
||||
|
||||
chatterbox uses ~3.5gb vram. if you get oom errors:
|
||||
|
||||
```bash
|
||||
# check gpu usage
|
||||
nvidia-smi
|
||||
|
||||
# kill leftover wyoming-chatterbox processes that are still holding vram
|
||||
pkill -f wyoming-chatterbox
|
||||
```
|
||||
|
||||
## license
|
||||
|
||||
mit
|
||||
41
pyproject.toml
Normal file
41
pyproject.toml
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "wyoming-chatterbox"
|
||||
version = "1.0.0"
|
||||
description = "Wyoming protocol server for Chatterbox TTS with voice cloning"
|
||||
readme = "README.md"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "sudoxnym"}
|
||||
]
|
||||
keywords = ["wyoming", "tts", "voice-cloning", "chatterbox", "home-assistant"]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
||||
]
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"wyoming>=1.5.0",
|
||||
"chatterbox-tts",
|
||||
"torch",
|
||||
"torchaudio",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/sudoxnym/wyoming-chatterbox"
|
||||
Repository = "https://github.com/sudoxnym/wyoming-chatterbox"
|
||||
|
||||
[project.scripts]
|
||||
wyoming-chatterbox = "wyoming_chatterbox.__main__:main"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
3
wyoming_chatterbox/__init__.py
Normal file
3
wyoming_chatterbox/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
"""Wyoming server for Chatterbox TTS with voice cloning."""
|
||||
|
||||
# Package version string; pyproject.toml declares the same value separately,
# so the two need to be updated together.
__version__ = "1.0.0"
|
||||
94
wyoming_chatterbox/__main__.py
Normal file
94
wyoming_chatterbox/__main__.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Wyoming server for Chatterbox TTS."""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
from wyoming.server import AsyncServer
|
||||
|
||||
from . import __version__
|
||||
from .handler import ChatterboxEventHandler
|
||||
|
||||
# Module-level logger, named after this module via __name__.
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line, validate it, and run the server until it stops.

    Returns:
        ``1`` when ``--voice-ref`` does not point at an existing regular
        file, ``0`` once the server has stopped (including via Ctrl-C).
    """
    parser = argparse.ArgumentParser(description="Wyoming Chatterbox TTS server")
    parser.add_argument(
        "--uri",
        required=True,
        help="Server URI (e.g., tcp://0.0.0.0:10201)",
    )
    parser.add_argument(
        "--voice-ref",
        required=True,
        help="Path to voice reference WAV file (10-30s of speech)",
    )
    parser.add_argument(
        "--volume-boost",
        type=float,
        default=3.0,
        help="Output volume multiplier (default: 3.0)",
    )
    parser.add_argument(
        "--device",
        default="cuda",
        help="Torch device: cuda or cpu (default: cuda)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug logging",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(levelname)s:%(name)s:%(message)s",
    )

    # Validate the voice reference before the (slow) model load.  is_file()
    # rather than exists(): a directory path would otherwise pass this check
    # and only fail later, inside synthesis.
    voice_ref = Path(args.voice_ref)
    if not voice_ref.is_file():
        _LOGGER.error("Voice reference file not found: %s", voice_ref)
        return 1

    try:
        asyncio.run(run_server(args, str(voice_ref)))
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground server; exit quietly
        # instead of printing a traceback.
        pass

    return 0
|
||||
|
||||
|
||||
async def run_server(args, voice_ref: str):
    """Load the Chatterbox model, warm it up, and serve Wyoming clients.

    Args:
        args: Parsed command-line namespace; ``device``, ``uri`` and
            ``volume_boost`` are read from it.
        voice_ref: Path of the reference recording passed to the model as
            its audio prompt.
    """
    _LOGGER.info("Loading Chatterbox model on %s...", args.device)

    # Imported here rather than at module level, so the chatterbox package is
    # only loaded once the server is actually starting.
    from chatterbox.tts import ChatterboxTTS

    model = ChatterboxTTS.from_pretrained(device=args.device)

    # One throwaway synthesis before accepting connections; the audio it
    # produces is discarded.
    _LOGGER.info("Warming up with voice: %s", voice_ref)
    model.generate("Ready.", audio_prompt_path=voice_ref)

    # Advertise the model's own output rate when it exposes one (upstream
    # documents it as `sr` -- NOTE(review): confirm against the installed
    # chatterbox-tts version); otherwise keep the handler's 24 kHz default.
    sample_rate = getattr(model, "sr", 24000)

    _LOGGER.info("Starting server at %s (volume boost: %.1fx)", args.uri, args.volume_boost)

    server = AsyncServer.from_uri(args.uri)
    handler_factory = partial(
        ChatterboxEventHandler,
        model=model,
        voice_ref=voice_ref,
        sample_rate=sample_rate,
        volume_boost=args.volume_boost,
    )
    await server.run(handler_factory)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s status so `python -m wyoming_chatterbox` exits
    # non-zero when start-up validation fails.
    raise SystemExit(main())
|
||||
119
wyoming_chatterbox/handler.py
Normal file
119
wyoming_chatterbox/handler.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
"""Wyoming event handler for Chatterbox TTS."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
|
||||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||||
from wyoming.event import Event
|
||||
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, Describe
|
||||
from wyoming.server import AsyncEventHandler
|
||||
from wyoming.tts import Synthesize
|
||||
|
||||
# Module-level logger, named after this module via __name__.
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChatterboxEventHandler(AsyncEventHandler):
    """Wyoming event handler that answers Describe and Synthesize events.

    Describe is answered with a static ``Info`` record; Synthesize runs the
    model on the requested text and streams the result back as 16-bit mono
    PCM in roughly 100 ms chunks.  Every other event type is ignored.
    """

    # Output PCM layout: 16-bit samples, one channel.
    _SAMPLE_WIDTH = 2
    _CHANNELS = 1
    # Audio is sent in pieces of 1/10 s each.
    _CHUNKS_PER_SECOND = 10

    def __init__(
        self,
        reader,
        writer,
        model,
        voice_ref: str,
        sample_rate: int = 24000,
        volume_boost: float = 3.0,
    ):
        """Store the synthesis configuration for this handler.

        Args:
            reader: Passed through to ``AsyncEventHandler``.
            writer: Passed through to ``AsyncEventHandler``.
            model: Object whose ``generate(text, audio_prompt_path=...)``
                returns the synthesized waveform as a torch tensor.
            voice_ref: Path handed to the model as ``audio_prompt_path``.
            sample_rate: Rate, in Hz, reported to the client for the audio.
            volume_boost: Gain applied to the waveform before clamping.
        """
        super().__init__(reader, writer)
        self.model = model
        self.voice_ref = voice_ref
        self.sample_rate = sample_rate
        self.volume_boost = volume_boost

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one incoming event; always returns ``True``."""
        if Describe.is_type(event.type):
            await self._handle_describe()
        elif Synthesize.is_type(event.type):
            await self._handle_synthesize(Synthesize.from_event(event))
        return True

    async def _handle_describe(self) -> None:
        """Reply with the static description of this TTS program and voice."""
        voice = TtsVoice(
            name="custom",
            description="Custom cloned voice",
            attribution=Attribution(name="Custom", url=""),
            installed=True,
            version="1.0.0",
            languages=["en"],
        )
        program = TtsProgram(
            name="chatterbox",
            description="Chatterbox TTS with voice cloning",
            attribution=Attribution(
                name="Resemble AI",
                url="https://github.com/resemble-ai/chatterbox",
            ),
            installed=True,
            version="1.0.0",
            voices=[voice],
        )
        await self.write_event(Info(tts=[program]).event())

    async def _handle_synthesize(self, synthesize) -> None:
        """Generate speech for ``synthesize.text`` and stream it to the client."""
        text = synthesize.text
        _LOGGER.info("Synthesizing: %s", text)

        # Run the blocking generate() call in the default executor so the
        # event loop stays responsive.  get_running_loop() is the preferred
        # way to obtain the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        wav_tensor = await loop.run_in_executor(
            None,
            partial(self.model.generate, text, audio_prompt_path=self.voice_ref),
        )
        audio_data = self._to_pcm16(wav_tensor)

        rate = self.sample_rate
        width = self._SAMPLE_WIDTH
        channels = self._CHANNELS

        await self.write_event(
            AudioStart(rate=rate, width=width, channels=channels).event()
        )

        # Bytes in one chunk: a tenth of a second of audio.
        chunk_size = rate * width * channels // self._CHUNKS_PER_SECOND
        for offset in range(0, len(audio_data), chunk_size):
            await self.write_event(
                AudioChunk(
                    audio=audio_data[offset : offset + chunk_size],
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event()
            )

        await self.write_event(AudioStop().event())
        _LOGGER.info("Synthesis complete")

    def _to_pcm16(self, wav_tensor) -> bytes:
        """Apply the volume boost and convert a waveform to int16 PCM bytes."""
        samples = wav_tensor.cpu().squeeze()
        # squeeze() drops every size-1 dimension, so a one-sample result
        # would become 0-dimensional; restore a single axis in that case.
        if samples.dim() == 0:
            samples = samples.unsqueeze(0)

        # Boost first, then clamp to [-1, 1] so that scaling by 32767 cannot
        # leave the int16 range.
        samples = torch.clamp(samples * self.volume_boost, -1.0, 1.0)
        return (samples * 32767).to(torch.int16).numpy().tobytes()
|
||||
Loading…
Reference in a new issue