import { cancelTtsPlay, eventSource, event_types, getCurrentChatId, isStreamingEnabled, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
import { ModuleWorkerWrapper, extension_settings, getContext, renderExtensionTemplateAsync } from '../../extensions.js';
import { delay, escapeRegex, getBase64Async, getStringHash, onlyUnique, regexFromString } from '../../utils.js';
import { accountStorage } from '../../util/AccountStorage.js';
import { EdgeTtsProvider } from './edge.js';
import { ElevenLabsTtsProvider } from './elevenlabs.js';
import { SileroTtsProvider } from './silerotts.js';
import { GptSovitsV2Provider } from './gpt-sovits-v2.js';
import { GptSoVITSAdapterProvider } from './gpt-sovits-adapter.js';
import { CoquiTtsProvider } from './coqui.js';
import { SystemTtsProvider } from './system.js';
import { NovelTtsProvider } from './novel.js';
import { power_user } from '../../power-user.js';
import { OpenAITtsProvider } from './openai.js';
import { OpenAICompatibleTtsProvider } from './openai-compatible.js';
import { XTTSTtsProvider } from './xtts.js';
import { VITSTtsProvider } from './vits.js';
import { GSVITtsProvider } from './gsvi.js';
import { SBVits2TtsProvider } from './sbvits2.js';
import { AllTalkTtsProvider } from './alltalk.js';
import { CosyVoiceProvider } from './cosyvoice.js';
import { SpeechT5TtsProvider } from './speecht5.js';
import { AzureTtsProvider } from './azure.js';
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
import { debounce_timeout } from '../../constants.js';
import { SlashCommandEnumValue, enumTypes } from '../../slash-commands/SlashCommandEnumValue.js';
import { enumIcons } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
import { GoogleTranslateTtsProvider } from './google-translate.js';
import { GoogleNativeTtsProvider } from './google-native.js';
import { ChatterboxTtsProvider } from './chatterbox.js';
import { KokoroTtsProvider } from './kokoro.js';
import { TtsWebuiProvider } from './tts-webui.js';
import { PollinationsTtsProvider } from './pollinations.js';
import { MiniMaxTtsProvider } from './minimax.js';
import { ElectronHubTtsProvider } from './electronhub.js';
import { ChutesTtsProvider } from './chutes.js';
import { VolcengineTtsProvider } from './volcengine.js';
import { applyLocale, t } from '/scripts/i18n.js';
// Interval for the periodic module worker. NOTE(review): presumably milliseconds; the code that consumes it is not visible here.
const UPDATE_INTERVAL = 1000;
// Wraps moduleWorker in the shared extension worker wrapper.
const wrapper = new ModuleWorkerWrapper(moduleWorker);
// NOTE(review): presumably the rows of the voice map settings UI; not used in the visible part of the file.
let voiceMapEntries = [];
// Maps a character name to the voice ID used to narrate that character.
let voiceMap = {}; // {charName:voiceid, charName2:voiceid2}
// NOTE(review): the next five bindings look like bookkeeping for detecting new chats/messages
// and incremental (streamed) narration; their readers are outside the visible code, so confirm.
let lastChatId = null;
let lastMessage = null;
let lastMessageHash = null;
let periodicMessageGenerationTimer = null;
let lastPositionOfParagraphEnd = -1;
// NOTE(review): presumably the in-flight initVoiceMap() promise; confirm against initVoiceMap.
let currentInitVoiceMapPromise = null;
// Voice map key holding the default voice; also used as a voice map VALUE meaning
// "use whatever the default voice entry is" (see onNarrateText).
const DEFAULT_VOICE_MARKER = '[Default Voice]';
// Voice map value meaning that narration is turned off for that name (see onNarrateText).
const DISABLED_VOICE_MARKER = 'disabled';
/**
 * Returns a sample sentence suitable for previewing a TTS voice in the given locale.
 *
 * Only the table's own keys are considered, so names inherited from
 * Object.prototype (e.g. 'constructor', 'toString') yield the fallback text
 * instead of a non-string value.
 *
 * @param {string} lang Locale tag such as 'en-US'.
 * @returns {string} Sample sentence for the locale, or a Latin placeholder when the locale is unknown.
 */
export function getPreviewString(lang) {
    const previewStrings = {
        'en-US': 'The quick brown fox jumps over the lazy dog',
        'en-GB': 'Sphinx of black quartz, judge my vow',
        'fr-FR': 'Portez ce vieux whisky au juge blond qui fume',
        'de-DE': 'Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich',
        'it-IT': 'Pranzo d\'acqua fa volti sghembi',
        'es-ES': 'Quiere la boca exhausta vid, kiwi, piña y fugaz jamón',
        'es-MX': 'Fabio me exige, sin tapujos, que añada cerveza al whisky',
        'ru-RU': 'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!',
        'pt-BR': 'Vejo xá gritando que fez show sem playback.',
        // 'pt-PR' is kept for backward compatibility; 'pt-PT' is the tag for Portuguese (Portugal).
        'pt-PR': 'Todo pajé vulgar faz boquinha sexy com kiwi.',
        'pt-PT': 'Todo pajé vulgar faz boquinha sexy com kiwi.',
        'uk-UA': 'Фабрикуймо гідність, лящім їжею, ґав хапаймо, з\'єднавці чаш!',
        'pl-PL': 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
        'cs-CZ': 'Příliš žluťoučký kůň úpěl ďábelské ódy',
        'sk-SK': 'Vyhŕňme si rukávy a vyprážajme čínske ryžové cestoviny',
        'hu-HU': 'Árvíztűrő tükörfúrógép',
        'tr-TR': 'Pijamalı hasta yağız şoföre çabucak güvendi',
        'nl-NL': 'De waard heeft een kalfje en een pinkje opgegeten',
        'sv-SE': 'Yxskaftbud, ge vårbygd, zinkqvarn',
        'da-DK': 'Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Walther spillede på xylofon',
        'ja-JP': 'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす',
        'ko-KR': '가나다라마바사아자차카타파하',
        'zh-CN': '我能吞下玻璃而不伤身体',
        'ro-RO': 'Muzicologă în bej vând whisky și tequila, preț fix',
        'bg-BG': 'Щъркелите се разпръснаха по цялото небе',
        'el-GR': 'Ταχίστη αλώπηξ βαφής ψημένη γη, δρασκελίζει υπέρ νωθρού κυνός',
        'fi-FI': 'Voi veljet, miksi juuri teille myin nämä vehkeet?',
        'he-IL': 'הקצינים צעקו: "כל הכבוד לצבא הצבאות!"',
        'id-ID': 'Jangkrik itu memang enak, apalagi kalau digoreng',
        'ms-MY': 'Muzik penyanyi wanita itu menggambarkan kehidupan yang penuh dengan duka nestapa',
        'th-TH': 'เป็นไงบ้างครับ ผมชอบกินข้าวผัดกระเพราหมูกรอบ',
        'vi-VN': 'Cô bé quàng khăn đỏ đang ngồi trên bãi cỏ xanh',
        'ar-SA': 'أَبْجَدِيَّة عَرَبِيَّة',
        'hi-IN': 'श्वेता ने श्वेता के श्वेते हाथों में श्वेता का श्वेता चावल पकड़ा',
    };
    const fallbackPreview = 'Neque porro quisquam est qui dolorem ipsum quia dolor sit amet';

    // A plain `previewStrings[lang]` lookup would also hit inherited members such as
    // `constructor`, which are not nullish and would bypass the fallback.
    return Object.hasOwn(previewStrings, lang) ? previewStrings[lang] : fallbackPreview;
}
/**
 * Registers a TTS provider under the given name, adds it to the provider
 * dropdown (`#tts_provider`) and loads it right away when it is the provider
 * currently selected in the extension settings.
 * @param {string} name Name of the TTS provider to register.
 * @param {function} provider Provider class.
 * @throws {Error} If the name is not a non-empty string, the provider is not a
 * function, or a provider with that name is already registered.
 */
export function registerTtsProvider(name, provider) {
    if (!name || typeof name !== 'string') {
        throw new Error(`TTS provider name ${name} is not a valid string.`);
    }
    if (!provider || typeof provider !== 'function') {
        throw new Error(`TTS provider ${name} is not a valid provider class.`);
    }
    if (ttsProviders[name]) {
        throw new Error(`TTS provider ${name} is already registered.`);
    }

    ttsProviders[name] = provider;
    console.info(`Registered TTS provider: ${name}`);

    // Create a real <option> element; an empty selector would produce an empty
    // jQuery set and nothing would be appended to the dropdown.
    $('#tts_provider').append($('<option>').val(name).text(name));

    // Load if it was previously selected
    if (extension_settings.tts.currentProvider === name) {
        loadTtsProvider(name);
    }
}
// Registry of TTS provider classes keyed by the provider name.
// registerTtsProvider() adds further entries to this object at runtime.
const ttsProviders = {
AllTalk: AllTalkTtsProvider,
Azure: AzureTtsProvider,
Chatterbox: ChatterboxTtsProvider,
Chutes: ChutesTtsProvider,
Coqui: CoquiTtsProvider,
'CosyVoice (Unofficial)': CosyVoiceProvider,
Edge: EdgeTtsProvider,
ElevenLabs: ElevenLabsTtsProvider,
'Electron Hub': ElectronHubTtsProvider,
'Google Translate': GoogleTranslateTtsProvider,
'Google Gemini TTS': GoogleNativeTtsProvider,
GSVI: GSVITtsProvider,
'GPT-SoVITS-Adapter': GptSoVITSAdapterProvider,
'GPT-SoVITS-V2 (Unofficial)': GptSovitsV2Provider,
Kokoro: KokoroTtsProvider,
MiniMax: MiniMaxTtsProvider,
Novel: NovelTtsProvider,
OpenAI: OpenAITtsProvider,
'OpenAI Compatible': OpenAICompatibleTtsProvider,
Pollinations: PollinationsTtsProvider,
SBVits2: SBVits2TtsProvider,
Silero: SileroTtsProvider,
SpeechT5: SpeechT5TtsProvider,
System: SystemTtsProvider,
'TTS WebUI': TtsWebuiProvider,
VITS: VITSTtsProvider,
XTTSv2: XTTSTtsProvider,
Volcengine: VolcengineTtsProvider,
};
// Active provider object; the voices popup and tts_preview call methods on it.
// NOTE(review): presumably an instance of one of the classes above, assigned by loadTtsProvider — confirm.
let ttsProvider;
// Name of the active provider (reported by debugTtsPlayback).
let ttsProviderName;
/**
 * Click handler that narrates a single chat message.
 * `this` is the clicked element; the message is located through the `mesid`
 * attribute of the closest `.mes` ancestor. Any running playback is stopped
 * and the message is queued as a manually triggered job.
 * @returns {Promise<void>}
 */
async function onNarrateOneMessage() {
    audioElement.src = '/sounds/silence.mp3';

    const { chat } = getContext();
    const messageId = $(this).closest('.mes').attr('mesid');
    const target = chat[messageId];

    if (target) {
        resetTtsPlayback();
        processAndQueueTtsMessage(target, Number(messageId), { manual: true });
        // Kick the worker immediately instead of waiting for its next tick.
        moduleWorker();
    }
}
/**
 * Narrates arbitrary text with the voice mapped to a given name.
 *
 * The voice map is temporarily re-initialised in unrestricted mode so that any
 * character's voice can be resolved, and is always switched back afterwards,
 * including when narration fails.
 *
 * @param {{voice?: string}} [args] Named arguments; `voice` is the name whose mapped voice should be used. Falls back to `name2`.
 * @param {string} text Text to narrate.
 * @returns {Promise<string>} Always resolves to an empty string.
 */
async function onNarrateText(args, text) {
    if (!text) {
        return '';
    }

    audioElement.src = '/sounds/silence.mp3';

    // To load all characters in the voice map, set unrestricted to true
    await initVoiceMap(true);

    try {
        const baseName = args?.voice || name2;
        const name = (baseName === 'SillyTavern System' ? DEFAULT_VOICE_MARKER : baseName) || DEFAULT_VOICE_MARKER;

        // An entry equal to the default marker means "use the default voice entry".
        const voiceMapEntry = voiceMap[name] === DEFAULT_VOICE_MARKER
            ? voiceMap[DEFAULT_VOICE_MARKER]
            : voiceMap[name];

        if (voiceMapEntry === DISABLED_VOICE_MARKER) {
            toastr.info(`TTS voice for ${name} is disabled.`);
            return '';
        }

        if (!voiceMapEntry) {
            toastr.info(`Specified voice for ${name} was not found. Check the TTS extension settings.`);
            return '';
        }

        resetTtsPlayback();
        processAndQueueTtsMessage({ mes: text, name: name }, null, { manual: true });
        await moduleWorker();
        return '';
    } finally {
        // Return back to the chat voices, even if queueing or the worker threw.
        await initVoiceMap(false);
    }
}
/**
 * One tick of the TTS worker: advances the TTS job queue and the audio job
 * queue, then refreshes the play/stop control. Does nothing while the
 * extension is disabled in settings.
 * @returns {Promise<void>}
 */
async function moduleWorker() {
    const isEnabled = Boolean(extension_settings.tts.enabled);

    if (isEnabled) {
        processTtsQueue();
        processAudioJobQueue();
        updateUiAudioPlayState();
    }
}
/**
 * Fully stops TTS: cancels system speech, drops the in-flight TTS and audio
 * jobs, rewinds and unloads the audio element, empties both job queues in
 * place and marks the audio queue processor as ready again.
 * @returns {void}
 */
function resetTtsPlayback() {
    // Stop any system TTS utterance that is currently being spoken.
    cancelTtsPlay();

    // Forget whatever is being processed right now.
    currentTtsJob = null;
    currentAudioJob = null;

    // Rewind and unload the shared audio element.
    audioElement.currentTime = 0;
    audioElement.src = '';

    // Empty the queues in place so existing references stay valid.
    for (const queue of [ttsJobQueue, audioJobQueue]) {
        queue.length = 0;
    }

    // Allow the audio queue processor to pick up new jobs.
    audioQueueProcessorReady = true;
}
/**
 * Tells whether TTS still has work to do.
 * @returns {boolean} True when either job queue is non-empty or a TTS/audio job is currently set.
 */
function isTtsProcessing() {
    const hasQueuedJobs = ttsJobQueue.length > 0 || audioJobQueue.length > 0;
    const hasActiveJob = currentTtsJob != null || currentAudioJob != null;
    return hasQueuedJobs || hasActiveJob;
}
/**
* @typedef {ChatMessage & { id?: number, manual?: boolean, segmentText?: string, segmentType?: string }} TtsMessage
*/
/**
 * Queues a chat message for narration.
 *
 * The message is deep-cloned and tagged with its chat index and the manual
 * flag. When "narrate by paragraphs" is enabled in settings, the text is split
 * on newlines and every non-empty line becomes its own TTS job; otherwise the
 * whole message is queued as a single job.
 * @param {ChatMessage} message - The message object to be processed.
 * @param {number|null} [messageId=null] - The chat message index to associate with TTS events.
 * @param {object} [options={}] - Additional options for processing.
 * @param {boolean} [options.manual=false] - Whether this TTS job was manually triggered (e.g., from the UI) rather than automatically from a new chat message.
 * @returns {void}
 */
function processAndQueueTtsMessage(message, messageId = null, { manual = false } = {}) {
    /** @type {TtsMessage} */
    const job = structuredClone(message);
    job.id = messageId ?? null;
    job.manual = manual ?? false;

    if (extension_settings.tts.narrate_by_paragraphs) {
        const paragraphs = job.mes.split('\n').filter((paragraph) => paragraph.length > 0);
        for (const paragraph of paragraphs) {
            ttsJobQueue.push({ ...job, mes: paragraph });
        }
        return;
    }

    ttsJobQueue.push(job);
}
/**
 * Logs a JSON snapshot of the TTS playback state (provider name, voice map,
 * queues, current jobs and the extension's settings) to the console.
 * Exposed on globalThis so it can be called from the browser console.
 * @returns {void}
 */
function debugTtsPlayback() {
    const snapshot = {
        ttsProviderName,
        voiceMap,
        audioPaused,
        audioJobQueue,
        currentAudioJob,
        audioQueueProcessorReady,
        ttsJobQueue,
        currentTtsJob,
        ttsConfig: extension_settings.tts,
    };
    console.log(JSON.stringify(snapshot));
}

globalThis.debugTtsPlayback = debugTtsPlayback;
//##################//
// Audio Control //
//##################//
// Single shared audio element used for all TTS playback.
let audioElement = new Audio();
audioElement.id = 'tts_audio';
// With autoplay on, assigning a new src is enough for the browser to start playing it.
audioElement.autoplay = true;
/**
 * Generated audio waiting to be played. Emptied in place by resetTtsPlayback().
 * @type AudioJob[] Audio job queue
 * @typedef {{audioBlob: Blob | string, char: string}} AudioJob Audio job object
 */
const audioJobQueue = [];
/**
 * Audio job currently being played; set to null by resetTtsPlayback() to cancel it.
 * @type AudioJob Current audio job
 */
let currentAudioJob;
// NOTE(review): only read by debugTtsPlayback in the visible code; presumably a pause flag — confirm.
let audioPaused = false;
// Set back to true by resetTtsPlayback() so the audio queue can be processed again.
// NOTE(review): presumably set to false while a job is being played; that code is not visible here.
let audioQueueProcessorReady = true;
/**
 * 'canplay' handler for the shared TTS audio element: applies the configured
 * playback rate and starts playback. Kept as a named function so repeated
 * addEventListener() calls register it only once instead of piling up
 * anonymous listeners.
 * @returns {void}
 */
function startTtsPlaybackOnCanPlay() {
    console.debug('Starting TTS playback');
    audioElement.playbackRate = extension_settings.tts.playback_rate;
    audioElement.play();
}

/**
 * Play audio data from audio job object.
 *
 * Blob audio is converted with getBase64Async() (and forwarded to the VRM lip
 * sync hook when that extension is enabled); string audio is used as the
 * source URL directly. Nothing is played when the current audio job has been
 * cancelled (set to null), either before the call or during the conversion.
 * @param {AudioJob} audioJob Audio job object
 * @returns {Promise} Promise that resolves once the source is assigned and the playback listeners are attached (or the job was found to be cancelled)
 * @throws {Error} If the audio data is neither a Blob nor a string
 */
async function playAudioData(audioJob) {
    const { audioBlob, char } = audioJob;

    // Since current audio job can be cancelled, don't playback if it is null
    if (currentAudioJob == null) {
        console.log('Cancelled TTS playback because currentAudioJob was null');
        return;
    }

    let srcUrl;
    if (audioBlob instanceof Blob) {
        srcUrl = await getBase64Async(audioBlob);

        // VRM lip sync
        if (extension_settings.vrm?.enabled && typeof globalThis.vrmLipSync === 'function') {
            await globalThis.vrmLipSync(audioBlob, char);
        }

        // Playback may have been reset while awaiting above; do not start stale audio.
        if (currentAudioJob == null) {
            console.log('Cancelled TTS playback because currentAudioJob was null');
            return;
        }
    } else if (typeof audioBlob === 'string') {
        srcUrl = audioBlob;
    } else {
        throw new Error(`TTS received invalid audio data type ${typeof audioBlob}`);
    }

    audioElement.src = srcUrl;

    // Both handlers are stable function references, so re-adding them is a no-op.
    audioElement.addEventListener('ended', completeCurrentAudioJob);
    audioElement.addEventListener('canplay', startTtsPlaybackOnCanPlay);
}
/**
 * Global voice preview hook.
 * Plays the audio element with the given ID when one exists and is not flagged
 * as disabled via its jQuery data; otherwise asks the active provider to
 * preview the voice.
 * @param {string} id Voice ID, also used as the DOM ID of the preview audio element.
 */
globalThis.tts_preview = function (id) {
    const element = document.getElementById(id);
    const hasPlayableAudio = element instanceof HTMLAudioElement && !$(element).data('disabled');

    if (!hasPlayableAudio) {
        ttsProvider.previewTtsVoice(id);
        return;
    }

    element.play();
};
/**
 * Shows a popup listing the voices of the active TTS provider.
 * The list is built from ttsProvider.fetchTtsVoiceObjects(); when that fails,
 * the popup shows a generic error message instead.
 * @returns {Promise<void>}
 */
async function onTtsVoicesClick() {
let popupText = '';
try {
const voiceIds = await ttsProvider.fetchTtsVoiceObjects();
for (const voice of voiceIds) {
// One entry per voice: optional language followed by the voice name.
// NOTE(review): values are interpolated as-is; if the popup renders this text as HTML they should be escaped — confirm.
popupText += `
${voice.lang || ''}${voice.name}
`;
if (voice.preview_url) {
// NOTE(review): this appends an empty string, so the branch currently has no effect.
// globalThis.tts_preview looks up an HTMLAudioElement by ID, so presumably an audio
// element for voice.preview_url was meant to be emitted here — confirm the markup was not lost.
popupText += ``;
}
}
} catch {
// Any failure (not only a bad API key) ends up here and the underlying error is discarded.
popupText = 'Could not load voices list. Check your API key.';
}
callGenericPopup(popupText, POPUP_TYPE.TEXT, '', { allowVerticalScrolling: true });
}
/**
 * Syncs the TTS entry of the extensions menu with the current state.
 * Hides the entry while TTS is disabled; otherwise shows it and switches the
 * media control icon between "stop" (audio playing or jobs pending) and "play".
 * @returns {void}
 */
function updateUiAudioPlayState() {
    if (extension_settings.tts.enabled != true) {
        $('#ttsExtensionMenuItem').hide();
        return;
    }

    $('#ttsExtensionMenuItem').show();

    // Give user feedback that TTS is active by showing the stop icon if processing or playing
    const isBusy = !audioElement.paused || isTtsProcessing();
    const iconClass = isBusy
        ? 'fa-solid fa-stop-circle extensionsMenuExtensionButton'
        : 'fa-solid fa-circle-play extensionsMenuExtensionButton';
    $('#tts_media_control').attr('class', iconClass);
}
/**
 * Handler for the TTS media control button.
 * While audio is playing or jobs are pending, the click performs a full stop
 * (not a pause). Otherwise the last chat message, if any, is queued for
 * narration as a manual job. The control icon is refreshed afterwards.
 * @returns {void}
 */
function onAudioControlClicked() {
    audioElement.src = '/sounds/silence.mp3';

    const chat = getContext()?.chat;
    const isActive = !audioElement.paused || isTtsProcessing();

    if (isActive) {
        // Not pausing, doing a full stop to anything TTS is doing. Better UX as pause is not as useful
        resetTtsPlayback();
    } else if (chat?.length > 0) {
        // Default play behavior if not processing or playing is to play the last message.
        const lastIndex = chat.length - 1;
        processAndQueueTtsMessage(chat[lastIndex], lastIndex, { manual: true });
    }

    updateUiAudioPlayState();
}
function addAudioControl() {
$('#tts_wand_container').append(applyLocale(`