Volcengine tts (#5003)
* feat(tts): Add support for Volcengine TTS provider
* refactor: Remove the redundant comments in the Volcengine TTS-related code.
* fix(volcengine): Fix the audio data processing logic in the voice generation interface
* feat(tts): Enhance Volcengine TTS functionality and improve error handling
  - Return more detailed error information when generating voice fails
  - Add multiple preset voice options and support custom voice management
  - Reconstruct the audio stream processing logic to enhance reliability
  - Improve the UI interface, adding a voice selection dropdown menu and operation buttons
* refactor(tts): Optimize the code structure and error handling of the Volcengine TTS provider
  - Remove the unused "voices" array and "model" parameter
  - Improve the text processing logic, eliminating unnecessary separators
  - Standardize the error handling logic, simplifying the status code checks
  - Fix the DOM operation method, using "createElement" instead of string concatenation
  - Ensure the existence check of the "customVoices" array
* Fix: Change the Content-Type of the audio response to audio/mpeg.
* Clean-up

---------

Co-authored-by: Cohee <18619528+Cohee1207@users.noreply.github.com>
This commit is contained in:
@@ -36,6 +36,7 @@ import { PollinationsTtsProvider } from './pollinations.js';
|
||||
import { MiniMaxTtsProvider } from './minimax.js';
|
||||
import { ElectronHubTtsProvider } from './electronhub.js';
|
||||
import { ChutesTtsProvider } from './chutes.js';
|
||||
import { VolcengineTtsProvider } from './volcengine.js';
|
||||
|
||||
const UPDATE_INTERVAL = 1000;
|
||||
const wrapper = new ModuleWorkerWrapper(moduleWorker);
|
||||
@@ -146,6 +147,7 @@ const ttsProviders = {
|
||||
'TTS WebUI': TtsWebuiProvider,
|
||||
VITS: VITSTtsProvider,
|
||||
XTTSv2: XTTSTtsProvider,
|
||||
Volcengine: VolcengineTtsProvider,
|
||||
};
|
||||
let ttsProvider;
|
||||
let ttsProviderName;
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
import { event_types, eventSource, getRequestHeaders } from '../../../script.js';
|
||||
import { SECRET_KEYS, secret_state } from '../../secrets.js';
|
||||
import { saveTtsProviderSettings, initVoiceMap } from './index.js';
|
||||
import { Popup } from '../../popup.js';
|
||||
export { VolcengineTtsProvider };
|
||||
|
||||
/**
 * TTS provider for Volcengine (Doubao) speech synthesis.
 *
 * Audio is requested through the local `/api/volcengine/generate-voice`
 * endpoint; the App ID and Access Key are managed as secrets and are only
 * reflected in this UI through `secret_state`.
 */
class VolcengineTtsProvider {
    /** Built-in speakers that are always offered, in addition to custom ones. */
    static voices = [
        {
            name: 'zh_female_xiaohe_uranus_bigtts',
            voice_id: 'zh_female_xiaohe_uranus_bigtts',
            lang: 'cl',
        },
        {
            name: 'zh_female_vv_uranus_bigtts',
            voice_id: 'zh_female_vv_uranus_bigtts',
            lang: 'cl',
        },
        {
            name: 'saturn_zh_female_keainvsheng_tob',
            voice_id: 'saturn_zh_female_keainvsheng_tob',
            lang: 'cl',
        },
        {
            name: 'saturn_zh_female_tiaopigongzhu_tob',
            voice_id: 'saturn_zh_female_tiaopigongzhu_tob',
            lang: 'cl',
        },
        {
            name: 'saturn_zh_female_cancan_tob',
            voice_id: 'saturn_zh_female_cancan_tob',
            lang: 'cl',
        },
        {
            name: 'saturn_zh_male_shuanglangshaonian_tob',
            voice_id: 'saturn_zh_male_shuanglangshaonian_tob',
            lang: 'cl',
        },
        {
            name: 'saturn_zh_male_tiancaitongzhuo_tob',
            voice_id: 'saturn_zh_male_tiancaitongzhuo_tob',
            lang: 'cl',
        },
        {
            name: 'zh_male_taocheng_uranus_bigtts',
            voice_id: 'zh_male_taocheng_uranus_bigtts',
            lang: 'cl',
        },
    ];

    /** Active settings; populated by {@link loadSettings}. */
    settings;

    audioElement = document.createElement('audio');

    /** Values used for every key that the saved settings do not provide. */
    defaultSettings = {
        voiceMap: {},
        customVoices: [],
        resource_id: '',
        speed: 0,
        provider_endpoint: 'https://openspeech.bytedance.com/api/v3/tts/unidirectional',
    };

    /**
     * Removes every literal `...` sequence from the text.
     * @param {string} text Text about to be synthesized.
     * @returns {string} Text without `...` sequences.
     */
    processText(text) {
        return text.split('...').join('');
    }

    constructor() {
        // Kept on the instance so that dispose() can unregister the very same function.
        this.handler = async (/** @type {string} */ key) => {
            if (![SECRET_KEYS.VOLCENGINE_APP_ID, SECRET_KEYS.VOLCENGINE_ACCESS_KEY].includes(key)) return;
            $('#volcengine-tts-app-id').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_APP_ID]);
            $('#volcengine-tts-access-key').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_ACCESS_KEY]);
            await this.onRefreshClick();
        };
    }

    /** Unregisters the secret-change handler that loadSettings() subscribed. */
    dispose() {
        [event_types.SECRET_WRITTEN, event_types.SECRET_DELETED, event_types.SECRET_ROTATED].forEach(event => {
            eventSource.removeListener(event, this.handler);
        });
    }

    /**
     * Generates a fixed sample phrase with the given speaker and plays it.
     * @param {string} voice Speaker passed through to generateTts().
     */
    async previewTtsVoice(voice) {
        const text = 'Hello! Nice to meet you!';
        const response = await this.generateTts(text, voice);
        const url = URL.createObjectURL(await response.blob());

        // The object URL pins the blob in memory until it is revoked, so release
        // it as soon as playback finishes or fails.
        const release = () => URL.revokeObjectURL(url);
        const player = new Audio(url);
        player.addEventListener('ended', release, { once: true });
        player.addEventListener('error', release, { once: true });
        player.play().catch(e => {
            console.error('Error playing audio:', e);
            release();
        });
    }

    /**
     * @returns {Promise<{name: string, voice_id: string, lang: string}[]>} Built-in plus custom voices.
     */
    async fetchTtsVoiceObjects() {
        return this.getAllVoices();
    }

    /** Markup of the provider settings panel; the controls are wired up in loadSettings(). */
    get settingsHtml() {
        return `
        <div>Volcengine (Doubao) TTS Configuration.</div>
        <small>Hint: Volcengine (Doubao) TTS configuration items.</small>
        <small>Please refer to the <a href="https://www.volcengine.com/docs/6561/1598757" target="_blank">documentation</a> to obtain the configuration items.</small>
        <div class="flex-container alignItemsCenter">
            <div id="volcengine-tts-app-id" class="menu_button menu_button_icon manage-api-keys" data-key="volcengine_app_id">
                <i class="fa-solid fa-key"></i>
                <span>App ID</span>
            </div>
            <div id="volcengine-tts-access-key" class="menu_button menu_button_icon manage-api-keys" data-key="volcengine_access_key">
                <i class="fa-solid fa-key"></i>
                <span>Access Key</span>
            </div>
        </div>
        <div>
            <label for="volcengine-tts-resource-id">Resource ID:</label>
            <input type="text" class="text_pole" id="volcengine-tts-resource-id">
        </div>
        <label for="volcengine-tts-voice-select">Custom Voice (Speaker):</label>
        <div class="tts_custom_voices">
            <select id="volcengine-tts-voice-select">
            </select>
            <i title="Add" id="volcengine-tts-add-voice" class="tts-button fa-solid fa-plus fa-xl success" tabindex="0" role="button"></i>
            <i title="Delete" id="volcengine-tts-delete-voice" class="tts-button fa-solid fa-xmark fa-xl failure" tabindex="0" role="button"></i>
        </div>
        <div>
            <label for="volcengine-tts-speed">Speed:</label>
            <div class="flex-container">
                <div class="range-block-range">
                    <input type="range" id="volcengine-tts-speed" min="-50" max="100" step="1">
                </div>
                <div class="range-block-counter">
                    <input type="number" min="-50" max="100" step="1" data-for="volcengine-tts-speed" id="volcengine-tts-speed_counter">
                </div>
            </div>
        </div>
        <div>
            <label for="volcengine-tts-provider-endpoint">Provider Endpoint:</label>
            <input type="text" class="text_pole" id="volcengine-tts-provider-endpoint">
        </div>
        `;
    }

    /**
     * Looks up a voice by its display name.
     * @param {string} voiceName Name to search for.
     * @returns {Promise<{name: string, voice_id: string, lang: string}|undefined>} The match, or undefined.
     */
    async getVoice(voiceName) {
        return this.getAllVoices().find(voice => voice.name === voiceName);
    }

    /**
     * Builds the full voice list: built-in voices followed by the custom ones.
     * Works before settings are loaded, in which case only built-ins are returned.
     * @returns {{name: string, voice_id: string, lang: string}[]} A new array on every call.
     */
    getAllVoices() {
        const customVoices = Array.isArray(this.settings?.customVoices) ? this.settings.customVoices : [];
        return [
            ...VolcengineTtsProvider.voices,
            ...customVoices.map(customVoice => ({
                name: customVoice,
                voice_id: customVoice,
                lang: 'cl',
            })),
        ];
    }

    /** Rebuilds the custom-voice dropdown from the current settings. */
    populateVoices() {
        const voiceSelect = $('#volcengine-tts-voice-select');
        voiceSelect.empty();

        for (const customVoice of this.settings.customVoices ?? []) {
            // Built via the DOM API so the user-supplied name is never parsed as HTML.
            const option = document.createElement('option');
            option.value = customVoice;
            option.textContent = customVoice;
            voiceSelect.append(option);
        }
    }

    async onRefreshClick() {
        return await this.checkReady();
    }

    /** Copies the text/range inputs into the settings and persists them. */
    onSettingsChange() {
        this.settings.resource_id = $('#volcengine-tts-resource-id').val();
        this.settings.speed = $('#volcengine-tts-speed').val();
        this.settings.provider_endpoint = $('#volcengine-tts-provider-endpoint').val();
        saveTtsProviderSettings();
        this.changeTTSSettings();
    }

    /** Mirrors the stored speed into both the slider and its numeric counter. */
    async changeTTSSettings() {
        const speed = this.settings.speed;
        $('#volcengine-tts-speed').val(speed);
        $('#volcengine-tts-speed_counter').val(speed);
    }

    /**
     * Applies saved settings on top of the defaults and wires up the settings UI.
     * @param {object} settings Previously saved provider settings (may be empty).
     * @throws {Error} If `settings` contains a key that is not in `defaultSettings`.
     */
    async loadSettings(settings) {
        if (Object.keys(settings).length === 0) {
            console.info('Using default TTS Provider settings');
        }

        // Spread alone is shallow: without fresh containers, adding a custom voice
        // would silently write into defaultSettings as well.
        this.settings = {
            ...this.defaultSettings,
            voiceMap: { ...this.defaultSettings.voiceMap },
            customVoices: [...this.defaultSettings.customVoices],
        };

        // Only accept keys defined in defaultSettings
        for (const key in settings) {
            if (!(key in this.settings)) {
                throw new Error(`Invalid setting passed to TTS Provider: ${key}`);
            }
            this.settings[key] = settings[key];
        }

        // Saved settings may carry a missing or non-array value for this key.
        if (!Array.isArray(this.settings.customVoices)) {
            this.settings.customVoices = [];
        }

        $('#volcengine-tts-resource-id').val(this.settings.resource_id).on('change', this.onSettingsChange.bind(this));
        $('#volcengine-tts-add-voice').on('click', this.createNewVoice.bind(this));
        $('#volcengine-tts-delete-voice').on('click', this.deleteSelectedVoice.bind(this));

        this.populateVoices();

        // Speed control: the range and number inputs are kept in sync with each other.
        const speedInput = $('#volcengine-tts-speed');
        const speedCounter = $('#volcengine-tts-speed_counter');

        speedInput.val(this.settings.speed).on('input change', (e) => {
            const value = $(e.target).val();
            speedCounter.val(value);
            this.settings.speed = value;
            saveTtsProviderSettings();
            this.changeTTSSettings();
        });

        speedCounter.val(this.settings.speed).on('input change', (e) => {
            const value = $(e.target).val();
            speedInput.val(value);
            this.settings.speed = value;
            saveTtsProviderSettings();
            this.changeTTSSettings();
        });

        $('#volcengine-tts-provider-endpoint').val(this.settings.provider_endpoint).on('change', this.onSettingsChange.bind(this));

        // Reflect whether each secret is currently stored, and keep that up to date.
        $('#volcengine-tts-app-id').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_APP_ID]);
        $('#volcengine-tts-access-key').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_ACCESS_KEY]);
        [event_types.SECRET_WRITTEN, event_types.SECRET_DELETED, event_types.SECRET_ROTATED].forEach(event => {
            eventSource.on(event, this.handler);
        });

        await this.checkReady();

        console.info('Volcengine TTS: Settings loaded');
    }

    /**
     * Prompts for a speaker name and stores it as a custom voice.
     * The name is trimmed and must not collide with any built-in or custom voice.
     */
    async createNewVoice() {
        const input = await Popup.show.input('Voice name: ', null);
        const name = typeof input === 'string' ? input.trim() : '';
        if (!name) {
            return;
        }

        // Checked against built-ins too, otherwise the voice list would contain
        // two entries with the same name.
        if (this.getAllVoices().some(voice => voice.name === name)) {
            toastr.error('Voice name should be unique.');
            return;
        }

        this.settings.customVoices.push(name);
        this.populateVoices();
        initVoiceMap();
        saveTtsProviderSettings();
    }

    /** Deletes the custom voice selected in the dropdown after user confirmation. */
    async deleteSelectedVoice() {
        const selectedVoiceName = $('#volcengine-tts-voice-select').val();

        if (!selectedVoiceName) {
            toastr.error('Please select a voice first.');
            return;
        }

        const confirmed = await Popup.show.confirm(`Are you sure you want to delete the selected voice ${selectedVoiceName}?`);
        if (!confirmed) {
            return;
        }

        const voiceIndex = this.settings.customVoices.indexOf(selectedVoiceName);
        if (voiceIndex !== -1) {
            this.settings.customVoices.splice(voiceIndex, 1);
        }

        this.populateVoices();
        initVoiceMap();
        saveTtsProviderSettings();
    }

    async checkReady() {
        await Promise.allSettled([this.changeTTSSettings()]);
    }

    /**
     * @param {string} text Text to synthesize.
     * @param {string} speaker Speaker (voice ID) to use.
     * @returns {Promise<Response>} Successful response whose body is the audio.
     */
    async generateTts(text, speaker) {
        return await this.fetchTtsGeneration(text, speaker);
    }

    /**
     * Requests audio from the local `/api/volcengine/generate-voice` endpoint.
     * @param {string} text Text to synthesize.
     * @param {string} voice_speaker Speaker (voice ID) to use.
     * @returns {Promise<Response>} The response, returned only when `response.ok` is true.
     * @throws {Error} When the endpoint answers with a non-OK status; the error text is also shown as a toast.
     */
    async fetchTtsGeneration(text, voice_speaker) {
        console.info(`Generating new TTS for voice_id ${voice_speaker}`);
        const response = await fetch('/api/volcengine/generate-voice', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({
                'provider_endpoint': this.settings.provider_endpoint,
                'resource_id': this.settings.resource_id,
                'text': text,
                'voice_speaker': voice_speaker,
                'speed': this.settings.speed,
            }),
        });

        if (!response.ok) {
            const errorText = await response.text();
            console.error(`HTTP ${response.status}: ${errorText}`);
            toastr.error(errorText);
            throw new Error(`HTTP ${response.status}: ${errorText}`);
        }

        return response;
    }
}
|
||||
@@ -74,6 +74,8 @@ export const SECRET_KEYS = {
|
||||
SILICONFLOW: 'api_key_siliconflow',
|
||||
ELEVENLABS: 'api_key_elevenlabs',
|
||||
POLLINATIONS: 'api_key_pollinations',
|
||||
VOLCENGINE_APP_ID: 'volcengine_app_id',
|
||||
VOLCENGINE_ACCESS_KEY: 'volcengine_access_key',
|
||||
};
|
||||
|
||||
const FRIENDLY_NAMES = {
|
||||
@@ -136,6 +138,8 @@ const FRIENDLY_NAMES = {
|
||||
[SECRET_KEYS.SILICONFLOW]: 'SiliconFlow',
|
||||
[SECRET_KEYS.ELEVENLABS]: 'ElevenLabs TTS',
|
||||
[SECRET_KEYS.POLLINATIONS]: 'Pollinations',
|
||||
[SECRET_KEYS.VOLCENGINE_APP_ID]: 'Volcengine App ID',
|
||||
[SECRET_KEYS.VOLCENGINE_ACCESS_KEY]: 'Volcengine Access Key',
|
||||
};
|
||||
|
||||
const INPUT_MAP = {
|
||||
|
||||
@@ -67,6 +67,8 @@ export const SECRET_KEYS = {
|
||||
SILICONFLOW: 'api_key_siliconflow',
|
||||
ELEVENLABS: 'api_key_elevenlabs',
|
||||
POLLINATIONS: 'api_key_pollinations',
|
||||
VOLCENGINE_APP_ID: 'volcengine_app_id',
|
||||
VOLCENGINE_ACCESS_KEY: 'volcengine_access_key',
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
import fetch from 'node-fetch';
|
||||
import { Router } from 'express';
|
||||
|
||||
import { readSecret, SECRET_KEYS } from './secrets.js';
|
||||
|
||||
export const router = Router();
|
||||
|
||||
|
||||
/** Endpoint used when the request does not provide `provider_endpoint`. */
const DEFAULT_VOLCENGINE_ENDPOINT = 'https://openspeech.bytedance.com/api/v3/tts/unidirectional';

/** Stream line `code` values that this handler does not treat as failures. */
const VOLCENGINE_SUCCESS_CODES = [0, 20000000];

/**
 * Converts the client-supplied speed into an integer speech rate.
 * @param {unknown} value Raw `speed` from the request body (number or string).
 * @returns {number} Base-10 integer, or 0 when the value is missing or not numeric.
 */
function parseVolcengineSpeechRate(value) {
    const rate = Number.parseInt(String(value ?? '0'), 10);
    // NaN would be serialized as null in the upstream JSON body.
    return Number.isNaN(rate) ? 0 : rate;
}

/**
 * Interprets one newline-delimited JSON line of the upstream response.
 * @param {string} line A single non-empty line of the stream.
 * @returns {{failure?: Error, audio?: Buffer|null}} `failure` when the line carries a
 * non-success code; otherwise `audio` holds the base64-decoded `data` (null if absent).
 * @throws {Error} When the line is not parseable as a JSON object.
 */
function readVolcengineStreamLine(line) {
    const { data, code, message } = JSON.parse(line);
    if (!VOLCENGINE_SUCCESS_CODES.includes(code)) {
        return { failure: new Error(`Volcengine TTS stream line code ${code}, ${message}`) };
    }
    return { audio: data ? Buffer.from(data, 'base64') : null };
}

/**
 * Reads the whole upstream response stream and collects its audio chunks.
 * @param {NodeJS.ReadableStream|null} body Response body of the upstream request.
 * @returns {Promise<Buffer[]>} Decoded audio chunks in arrival order.
 */
function collectVolcengineAudio(body) {
    return new Promise((resolve, reject) => {
        if (!body) {
            reject(new Error('Response body is null'));
            return;
        }

        const decoder = new TextDecoder();
        const audioChunks = [];
        let pending = '';
        let failed = false;

        const fail = (error) => {
            failed = true;
            reject(error);
        };

        body.on('data', (chunk) => {
            // Nothing read after a reported failure can change the outcome.
            if (failed) return;

            pending += decoder.decode(chunk, { stream: true });

            // The last element is an incomplete line (or ''); keep it for the next chunk.
            const lines = pending.split('\n');
            pending = lines.pop() || '';

            for (const line of lines) {
                if (!line.trim()) continue;

                let parsed;
                try {
                    parsed = readVolcengineStreamLine(line);
                } catch (e) {
                    // Best effort: a malformed intermediate line is logged and skipped.
                    console.error('Error parsing Volcengine TTS stream line:', e);
                    continue;
                }

                if (parsed.failure) {
                    fail(parsed.failure);
                    return;
                }
                if (parsed.audio) {
                    audioChunks.push(parsed.audio);
                }
            }
        });

        body.on('end', () => {
            if (failed) return;

            // Flush any bytes the streaming decoder is still holding back.
            pending += decoder.decode();

            if (pending.trim()) {
                let parsed;
                try {
                    parsed = readVolcengineStreamLine(pending);
                } catch (e) {
                    fail(new Error(`Error parsing final Volcengine TTS stream line: ${e}`));
                    return;
                }

                if (parsed.failure) {
                    fail(parsed.failure);
                    return;
                }
                if (parsed.audio) {
                    audioChunks.push(parsed.audio);
                }
            }

            resolve(audioChunks);
        });

        body.on('error', (error) => {
            fail(new Error(`Error reading Volcengine TTS stream: ${error}`));
        });
    });
}

/**
 * POST /generate-voice
 *
 * Body: `provider_endpoint` (optional), `resource_id`, `text`, `voice_speaker`, `speed` (optional).
 * Responds with the concatenated MP3 audio (`audio/mpeg`), 403 when the stored
 * App ID / Access Key are missing, 400 when a required body field is missing,
 * and 500 when the upstream request or its stream fails.
 */
router.post('/generate-voice', async (req, res) => {
    try {
        // NOTE(review): the endpoint comes from the request body and receives the
        // stored credentials in its headers - consider restricting it to known hosts.
        let provider_endpoint = req.body.provider_endpoint;
        if (!provider_endpoint) {
            console.warn('Volcengine endpoint not set, use default endpoint instead');
            provider_endpoint = DEFAULT_VOLCENGINE_ENDPOINT;
        }

        const appId = readSecret(req.user.directories, SECRET_KEYS.VOLCENGINE_APP_ID);
        const accessKey = readSecret(req.user.directories, SECRET_KEYS.VOLCENGINE_ACCESS_KEY);

        if (!appId || !accessKey) {
            console.warn('Volcengine generate-voice request missing required parameters appId or accessKey');
            return res.sendStatus(403);
        }

        const resourceId = req.body.resource_id;
        const text = req.body.text;
        const voice_speaker = req.body.voice_speaker;

        if (!resourceId || !text || !voice_speaker) {
            console.warn('Volcengine generate-voice request missing required parameters resourceId or text or voice_speaker');
            return res.sendStatus(400);
        }

        const response = await fetch(provider_endpoint, {
            method: 'POST',
            headers: {
                'X-Api-App-Id': appId,
                'X-Api-Access-Key': accessKey,
                'X-Api-Resource-Id': resourceId,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                'req_params': {
                    'text': text,
                    'speaker': voice_speaker,
                    'audio_params': {
                        'format': 'mp3',
                        'speech_rate': parseVolcengineSpeechRate(req.body.speed),
                    },
                    // `additions` is sent as a JSON-encoded string, not as a nested object.
                    'additions': JSON.stringify({
                        'mute_cut_threshold': '400',
                        'mute_cut_remain_ms': '1',
                        'explicit_language': 'crosslingual',
                        'enable_language_detector': true,
                        'disable_markdown_filter': true,
                        'cache_config': {
                            'use_cache': true,
                            'text_type': 1,
                        },
                    }),
                },
            }),
        });

        if (!response.ok) {
            // The log ID is forwarded so a failed request can be traced upstream.
            const logid = response.headers.get('X-Tt-Logid') || '';
            console.warn('Volcengine Request failed', response.status, response.statusText, logid);
            return res.header('X-Tt-Logid', logid).status(500).send(`TTS Generation Failed: ${response.statusText}`);
        }

        const audioChunks = await collectVolcengineAudio(response.body);
        const finalAudioData = Buffer.concat(audioChunks);

        res.set('Content-Type', 'audio/mpeg');
        res.status(200).send(finalAudioData);
    } catch (error) {
        console.error('Volcengine generate-voice fetch failed', error);
        res.status(500).send(`TTS Generation Failed: ${error}`);
    }
});
|
||||
@@ -48,6 +48,7 @@ import { router as azureRouter } from './endpoints/azure.js';
|
||||
import { router as minimaxRouter } from './endpoints/minimax.js';
|
||||
import { router as dataMaidRouter } from './endpoints/data-maid.js';
|
||||
import { router as backupsRouter } from './endpoints/backups.js';
|
||||
import { router as volcengineRouter } from './endpoints/volcengine.js';
|
||||
|
||||
/**
|
||||
* @typedef {object} ServerStartupResult
|
||||
@@ -174,6 +175,7 @@ export function setupPrivateEndpoints(app) {
|
||||
app.use('/api/backends/chat-completions', chatCompletionsRouter);
|
||||
app.use('/api/speech', speechRouter);
|
||||
app.use('/api/azure', azureRouter);
|
||||
app.use('/api/volcengine', volcengineRouter);
|
||||
app.use('/api/minimax', minimaxRouter);
|
||||
app.use('/api/data-maid', dataMaidRouter);
|
||||
app.use('/api/backups', backupsRouter);
|
||||
|
||||
Reference in New Issue
Block a user