Volcengine tts (#5003)

* feat(tts): Add support for Volcengine TTS provider

* refactor: Remove the redundant comments in the Volcengine TTS-related code.

* fix(volcengine): Fix the audio data processing logic in the voice generation interface

* feat(tts): Enhance Volcengine TTS functionality and improve error handling

- Return more detailed error information when generating voice fails
- Add multiple preset voice options and support custom voice management
- Reconstruct the audio stream processing logic to enhance reliability
- Improve the UI interface, adding a voice selection dropdown menu and operation buttons

* refactor(tts): Optimize the code structure and error handling of the Volcengine TTS provider
- Remove the unused "voices" array and "model" parameter
- Improve the text processing logic, eliminating unnecessary separators
- Standardize the error handling logic, simplifying the status code checks
- Fix the DOM operation method, using "createElement" instead of string concatenation
- Ensure the existence check of the "customVoices" array

* Fix: Change the Content-Type of the audio response to audio/mpeg.

* Clean-up

---------

Co-authored-by: Cohee <18619528+Cohee1207@users.noreply.github.com>
This commit is contained in:
Crush0
2026-01-27 02:45:38 +08:00
committed by GitHub
parent ef25a03650
commit eaa6a00e97
6 changed files with 462 additions and 0 deletions
+2
View File
@@ -36,6 +36,7 @@ import { PollinationsTtsProvider } from './pollinations.js';
import { MiniMaxTtsProvider } from './minimax.js';
import { ElectronHubTtsProvider } from './electronhub.js';
import { ChutesTtsProvider } from './chutes.js';
import { VolcengineTtsProvider } from './volcengine.js';
const UPDATE_INTERVAL = 1000;
const wrapper = new ModuleWorkerWrapper(moduleWorker);
@@ -146,6 +147,7 @@ const ttsProviders = {
'TTS WebUI': TtsWebuiProvider,
VITS: VITSTtsProvider,
XTTSv2: XTTSTtsProvider,
Volcengine: VolcengineTtsProvider,
};
let ttsProvider;
let ttsProviderName;
+316
View File
@@ -0,0 +1,316 @@
import { event_types, eventSource, getRequestHeaders } from '../../../script.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { saveTtsProviderSettings, initVoiceMap } from './index.js';
import { Popup } from '../../popup.js';
export { VolcengineTtsProvider };
/**
 * TTS provider that generates speech through the server-side
 * `/api/volcengine/generate-voice` endpoint.
 *
 * The provider exposes a fixed list of preset voices plus any user-defined
 * ("custom") voice ids stored in the provider settings. Credentials (App ID and
 * Access Key) are managed through the secrets UI; only their presence is
 * reflected here via the `success` class on the key buttons.
 */
class VolcengineTtsProvider {
    /** Lowest speed accepted by the settings UI (matches the `min` of the inputs). */
    static MIN_SPEED = -50;

    /** Highest speed accepted by the settings UI (matches the `max` of the inputs). */
    static MAX_SPEED = 100;

    /**
     * Builds a voice object from a speaker id. The id doubles as the display name.
     * NOTE(review): `lang: 'cl'` is copied from the original list; presumably it
     * stands for "crosslingual" — confirm.
     * @param {string} id Speaker id
     * @returns {{name: string, voice_id: string, lang: string}} Voice object
     */
    static toVoice(id) {
        return { name: id, voice_id: id, lang: 'cl' };
    }

    /**
     * Converts a raw speed value (string from an input, or number from settings)
     * into an integer clamped to [MIN_SPEED, MAX_SPEED].
     * @param {unknown} value Raw speed value
     * @returns {number|null} Clamped integer, or null when the value is not numeric
     * (for example the empty string or a lone "-" while the user is still typing)
     */
    static normalizeSpeed(value) {
        const parsed = Number.parseInt(String(value), 10);
        if (Number.isNaN(parsed)) {
            return null;
        }
        return Math.min(VolcengineTtsProvider.MAX_SPEED, Math.max(VolcengineTtsProvider.MIN_SPEED, parsed));
    }

    /** Preset voices that are always offered, regardless of settings. */
    static voices = [
        'zh_female_xiaohe_uranus_bigtts',
        'zh_female_vv_uranus_bigtts',
        'saturn_zh_female_keainvsheng_tob',
        'saturn_zh_female_tiaopigongzhu_tob',
        'saturn_zh_female_cancan_tob',
        'saturn_zh_male_shuanglangshaonian_tob',
        'saturn_zh_male_tiancaitongzhuo_tob',
        'zh_male_taocheng_uranus_bigtts',
    ].map(id => VolcengineTtsProvider.toVoice(id));

    settings;
    audioElement = document.createElement('audio');
    defaultSettings = {
        voiceMap: {},
        customVoices: [],
        resource_id: '',
        speed: 0,
        provider_endpoint: 'https://openspeech.bytedance.com/api/v3/tts/unidirectional',
    };

    /**
     * Removes every literal "..." sequence from the text before synthesis.
     * @param {string} text Text to process
     * @returns {string} Text without "..." sequences
     */
    processText(text) {
        return text.replaceAll('...', '');
    }

    constructor() {
        // Kept as a stable reference so dispose() can remove exactly this listener.
        this.handler = async (/** @type {string} */ key) => {
            const watchedKeys = [SECRET_KEYS.VOLCENGINE_APP_ID, SECRET_KEYS.VOLCENGINE_ACCESS_KEY];
            if (!watchedKeys.includes(key)) {
                return;
            }
            this.updateSecretIndicators();
            await this.onRefreshClick();
        };
    }

    /** Reflects whether each secret is currently set on its key button. */
    updateSecretIndicators() {
        $('#volcengine-tts-app-id').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_APP_ID]);
        $('#volcengine-tts-access-key').toggleClass('success', !!secret_state[SECRET_KEYS.VOLCENGINE_ACCESS_KEY]);
    }

    /** Unsubscribes the secret-change handler registered by loadSettings(). */
    dispose() {
        for (const event of [event_types.SECRET_WRITTEN, event_types.SECRET_DELETED, event_types.SECRET_ROTATED]) {
            eventSource.removeListener(event, this.handler);
        }
    }

    /**
     * Generates a short sample with the given voice and plays it.
     * @param {string} voice Speaker id passed through to generateTts()
     */
    async previewTtsVoice(voice) {
        const text = 'Hello! Nice to meet you!';
        const audio = await this.generateTts(text, voice);
        const objectUrl = URL.createObjectURL(await audio.blob());
        const audioElement = new Audio(objectUrl);

        // Object URLs keep the blob alive until revoked; release it once playback is over or fails.
        const release = () => URL.revokeObjectURL(objectUrl);
        audioElement.addEventListener('ended', release, { once: true });
        audioElement.addEventListener('error', release, { once: true });

        audioElement.play().catch(e => {
            release();
            console.error('Error playing audio:', e);
        });
    }

    /**
     * @returns {Promise<{name: string, voice_id: string, lang: string}[]>} Preset and custom voices
     */
    async fetchTtsVoiceObjects() {
        return this.getAllVoices();
    }

    /** Settings markup for this provider; element ids are wired up in loadSettings(). */
    get settingsHtml() {
        return `
        <div>Volcengine (Doubao) TTS Configuration.</div>
        <small>Hint: Volcengine (Doubao) TTS configuration items.</small>
        <small>Please refer to the <a href="https://www.volcengine.com/docs/6561/1598757" target="_blank">documentation</a> to obtain the configuration items.</small>
        <div class="flex-container alignItemsCenter">
            <div id="volcengine-tts-app-id" class="menu_button menu_button_icon manage-api-keys" data-key="volcengine_app_id">
                <i class="fa-solid fa-key"></i>
                <span>App ID</span>
            </div>
            <div id="volcengine-tts-access-key" class="menu_button menu_button_icon manage-api-keys" data-key="volcengine_access_key">
                <i class="fa-solid fa-key"></i>
                <span>Access Key</span>
            </div>
        </div>
        <div>
            <label for="volcengine-tts-resource-id">Resource ID:</label>
            <input type="text" class="text_pole" id="volcengine-tts-resource-id">
        </div>
        <label for="volcengine-tts-voice-select">Custom Voice (Speaker):</label>
        <div class="tts_custom_voices">
            <select id="volcengine-tts-voice-select">
            </select>
            <i title="Add" id="volcengine-tts-add-voice" class="tts-button fa-solid fa-plus fa-xl success" tabindex="0" role="button"></i>
            <i title="Delete" id="volcengine-tts-delete-voice" class="tts-button fa-solid fa-xmark fa-xl failure" tabindex="0" role="button"></i>
        </div>
        <div>
            <label for="volcengine-tts-speed">Speed:</label>
            <div class="flex-container">
                <div class="range-block-range">
                    <input type="range" id="volcengine-tts-speed" min="-50" max="100" step="1">
                </div>
                <div class="range-block-counter">
                    <input type="number" min="-50" max="100" step="1" data-for="volcengine-tts-speed" id="volcengine-tts-speed_counter">
                </div>
            </div>
        </div>
        <div>
            <label for="volcengine-tts-provider-endpoint">Provider Endpoint:</label>
            <input type="text" class="text_pole" id="volcengine-tts-provider-endpoint">
        </div>
        `;
    }

    /**
     * Looks up a voice by its name among preset and custom voices.
     * @param {string} voiceName Voice name
     * @returns {Promise<{name: string, voice_id: string, lang: string}|undefined>} Matching voice, if any
     */
    async getVoice(voiceName) {
        return this.getAllVoices().find(voice => voice.name === voiceName);
    }

    /**
     * Returns the preset voices followed by the custom voices from settings.
     * Safe to call before loadSettings(): missing settings yield presets only.
     * @returns {{name: string, voice_id: string, lang: string}[]} All known voices
     */
    getAllVoices() {
        const customVoices = this.settings?.customVoices ?? [];
        return [
            ...VolcengineTtsProvider.voices,
            ...customVoices.map(id => VolcengineTtsProvider.toVoice(id)),
        ];
    }

    /** Rebuilds the custom-voice dropdown from the current settings. */
    populateVoices() {
        const voiceSelect = $('#volcengine-tts-voice-select');
        voiceSelect.empty();
        for (const customVoice of this.settings?.customVoices ?? []) {
            // Built via DOM properties rather than markup so voice names are never parsed as HTML.
            const option = document.createElement('option');
            option.value = customVoice;
            option.textContent = customVoice;
            voiceSelect.append(option);
        }
    }

    async onRefreshClick() {
        return await this.checkReady();
    }

    /** Reads the text/range inputs back into settings, saves them and refreshes the UI. */
    onSettingsChange() {
        this.settings.resource_id = String($('#volcengine-tts-resource-id').val() ?? '').trim();
        // Keep the previous speed if the input currently holds something non-numeric.
        this.settings.speed = VolcengineTtsProvider.normalizeSpeed($('#volcengine-tts-speed').val()) ?? this.settings.speed;
        this.settings.provider_endpoint = String($('#volcengine-tts-provider-endpoint').val() ?? '').trim();
        saveTtsProviderSettings();
        this.changeTTSSettings();
    }

    /** Pushes the stored speed into both the range and the number input. */
    async changeTTSSettings() {
        const speed = this.settings.speed;
        $('#volcengine-tts-speed').val(speed);
        $('#volcengine-tts-speed_counter').val(speed);
    }

    /**
     * Applies saved settings on top of the defaults and wires up the settings UI.
     * @param {object} settings Saved provider settings; only keys present in defaultSettings are accepted
     * @throws {string} When `settings` contains a key that is not in defaultSettings
     */
    async loadSettings(settings) {
        if (Object.keys(settings).length == 0) {
            console.info('Using default TTS Provider settings');
        }

        // Fresh containers so later mutations (e.g. pushing a custom voice) never leak into defaultSettings.
        this.settings = { ...this.defaultSettings, voiceMap: {}, customVoices: [] };

        for (const key in settings) {
            if (key in this.settings) {
                this.settings[key] = settings[key];
            } else {
                // NOTE(review): thrown as a plain string, unchanged from the original, in case callers rely on it.
                throw `Invalid setting passed to TTS Provider: ${key}`;
            }
        }

        if (!Array.isArray(this.settings.customVoices)) {
            this.settings.customVoices = [];
        }
        // Previously saved settings may hold the speed as a string; store it as a clamped integer.
        this.settings.speed = VolcengineTtsProvider.normalizeSpeed(this.settings.speed) ?? this.defaultSettings.speed;

        const onChange = () => this.onSettingsChange();

        $('#volcengine-tts-resource-id').val(this.settings.resource_id).on('change', onChange);
        $('#volcengine-tts-add-voice').on('click', () => this.createNewVoice());
        $('#volcengine-tts-delete-voice').on('click', () => this.deleteSelectedVoice());

        this.populateVoices();

        // Range and number inputs mirror each other; both go through the same handler.
        const onSpeedInput = (e) => {
            const speed = VolcengineTtsProvider.normalizeSpeed($(e.target).val());
            if (speed === null) {
                // Incomplete input such as "" or "-": leave the field alone until it is a number.
                return;
            }
            this.settings.speed = speed;
            saveTtsProviderSettings();
            this.changeTTSSettings();
        };
        $('#volcengine-tts-speed').val(this.settings.speed).on('input change', onSpeedInput);
        $('#volcengine-tts-speed_counter').val(this.settings.speed).on('input change', onSpeedInput);

        $('#volcengine-tts-provider-endpoint').val(this.settings.provider_endpoint).on('change', onChange);

        this.updateSecretIndicators();
        for (const event of [event_types.SECRET_WRITTEN, event_types.SECRET_DELETED, event_types.SECRET_ROTATED]) {
            eventSource.on(event, this.handler);
        }

        await this.checkReady();
        console.info('Volcengine TTS: Settings loaded');
    }

    /** Prompts for a new custom voice id and stores it if it is not already known. */
    async createNewVoice() {
        const input = await Popup.show.input('Voice name: ', null);
        const name = typeof input === 'string' ? input.trim() : '';
        if (!name) {
            return;
        }

        // Check presets as well: a custom voice duplicating a preset would appear twice in the voice list.
        const isTaken = this.getAllVoices().some(voice => voice.name === name);
        if (isTaken) {
            toastr.error('Voice name should be unique.');
            return;
        }

        this.settings.customVoices.push(name);
        this.populateVoices();
        initVoiceMap();
        saveTtsProviderSettings();
    }

    /** Removes the custom voice currently selected in the dropdown, after confirmation. */
    async deleteSelectedVoice() {
        const selectedVoiceName = $('#volcengine-tts-voice-select').val();
        if (!selectedVoiceName) {
            toastr.error('Please select a voice first.');
            return;
        }

        const isConfirmed = await Popup.show.confirm(`Are you sure you want to delete the selected voice ${selectedVoiceName}?`);
        if (!isConfirmed) {
            return;
        }

        const voiceIndex = this.settings.customVoices.indexOf(selectedVoiceName);
        if (voiceIndex !== -1) {
            this.settings.customVoices.splice(voiceIndex, 1);
        }

        this.populateVoices();
        initVoiceMap();
        saveTtsProviderSettings();
    }

    async checkReady() {
        await Promise.allSettled([this.changeTTSSettings()]);
    }

    /**
     * @param {string} text Text to synthesize
     * @param {string} speaker Speaker id
     * @returns {Promise<Response>} Response whose body is the generated audio
     */
    async generateTts(text, speaker) {
        return await this.fetchTtsGeneration(text, speaker);
    }

    /**
     * Requests audio from the server endpoint using the current settings.
     * @param {string} text Text to synthesize
     * @param {string} voice_speaker Speaker id
     * @returns {Promise<Response>} The successful fetch response
     * @throws {Error} When the server answers with a non-OK status (the error is also shown as a toast)
     */
    async fetchTtsGeneration(text, voice_speaker) {
        console.info(`Generating new TTS for voice_id ${voice_speaker}`);
        const response = await fetch('/api/volcengine/generate-voice', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({
                provider_endpoint: this.settings.provider_endpoint,
                resource_id: this.settings.resource_id,
                text,
                voice_speaker,
                speed: this.settings.speed,
            }),
        });

        if (!response.ok) {
            const errorText = await response.text();
            console.error(`HTTP ${response.status}: ${errorText}`);
            toastr.error(errorText);
            throw new Error(`HTTP ${response.status}: ${errorText}`);
        }

        return response;
    }
}
+4
View File
@@ -74,6 +74,8 @@ export const SECRET_KEYS = {
SILICONFLOW: 'api_key_siliconflow',
ELEVENLABS: 'api_key_elevenlabs',
POLLINATIONS: 'api_key_pollinations',
VOLCENGINE_APP_ID: 'volcengine_app_id',
VOLCENGINE_ACCESS_KEY: 'volcengine_access_key',
};
const FRIENDLY_NAMES = {
@@ -136,6 +138,8 @@ const FRIENDLY_NAMES = {
[SECRET_KEYS.SILICONFLOW]: 'SiliconFlow',
[SECRET_KEYS.ELEVENLABS]: 'ElevenLabs TTS',
[SECRET_KEYS.POLLINATIONS]: 'Pollinations',
[SECRET_KEYS.VOLCENGINE_APP_ID]: 'Volcengine App ID',
[SECRET_KEYS.VOLCENGINE_ACCESS_KEY]: 'Volcengine Access Key',
};
const INPUT_MAP = {
+2
View File
@@ -67,6 +67,8 @@ export const SECRET_KEYS = {
SILICONFLOW: 'api_key_siliconflow',
ELEVENLABS: 'api_key_elevenlabs',
POLLINATIONS: 'api_key_pollinations',
VOLCENGINE_APP_ID: 'volcengine_app_id',
VOLCENGINE_ACCESS_KEY: 'volcengine_access_key',
};
/**
+136
View File
@@ -0,0 +1,136 @@
import fetch from 'node-fetch';
import { Router } from 'express';
import { readSecret, SECRET_KEYS } from './secrets.js';
export const router = Router();

/** Endpoint used when the request does not specify one. */
const DEFAULT_PROVIDER_ENDPOINT = 'https://openspeech.bytedance.com/api/v3/tts/unidirectional';

/** Stream line `code` values that are not treated as errors. */
const SUCCESS_CODES = new Set([0, 20000000]);

/** Bounds applied to `speech_rate`; they mirror the limits of the client-side speed inputs. */
const MIN_SPEECH_RATE = -50;
const MAX_SPEECH_RATE = 100;

/**
 * Checks that a value is a string holding an absolute http(s) URL.
 * @param {unknown} value Candidate endpoint
 * @returns {boolean} True when the value parses as an http: or https: URL
 */
function isHttpUrl(value) {
    if (typeof value !== 'string') {
        return false;
    }
    try {
        const { protocol } = new URL(value);
        return protocol === 'http:' || protocol === 'https:';
    } catch {
        return false;
    }
}

/**
 * Converts the requested speed into an integer within the accepted bounds.
 * @param {unknown} speed Raw `speed` field of the request body
 * @returns {number} Clamped integer; 0 when the value is missing or not numeric
 */
function toSpeechRate(speed) {
    const parsed = Number.parseInt(String(speed ?? 0), 10);
    if (Number.isNaN(parsed)) {
        // NaN would be serialized as `null` in the upstream JSON body.
        return 0;
    }
    return Math.min(MAX_SPEECH_RATE, Math.max(MIN_SPEECH_RATE, parsed));
}

/**
 * Parses one JSON line of the upstream stream.
 * @param {string} line A single non-empty line
 * @returns {{failure: Error|null, audio: Buffer|null}} `failure` is set when the line carries a
 * non-success code; `audio` holds the decoded base64 `data` field when present
 * @throws When the line is not valid JSON or cannot be decoded
 */
function readStreamLine(line) {
    const { data, code, message } = JSON.parse(line);
    if (!SUCCESS_CODES.has(code)) {
        return { failure: new Error(`Volcengine TTS stream line code ${code}, ${message}`), audio: null };
    }
    return { failure: null, audio: data ? Buffer.from(data, 'base64') : null };
}

/**
 * Reads the newline-delimited JSON stream and gathers the audio carried by its lines.
 * @param {NodeJS.ReadableStream} body Upstream response body
 * @returns {Promise<Buffer[]>} Decoded audio chunks in arrival order
 */
function collectAudioChunks(body) {
    return new Promise((resolve, reject) => {
        const decoder = new TextDecoder();
        const audioChunks = [];
        let pending = '';

        // Returns false once the promise has been rejected because of an upstream error code.
        const consume = (line) => {
            const { failure, audio } = readStreamLine(line);
            if (failure) {
                reject(failure);
                return false;
            }
            if (audio) {
                audioChunks.push(audio);
            }
            return true;
        };

        body.on('data', (chunk) => {
            pending += decoder.decode(chunk, { stream: true });
            const lines = pending.split('\n');
            // The last element is a (possibly empty) partial line; keep it for the next chunk.
            pending = lines.pop() ?? '';

            for (const line of lines) {
                if (!line.trim()) continue;
                try {
                    if (!consume(line)) return;
                } catch (e) {
                    // Best effort: a malformed line in the middle of the stream is logged and skipped.
                    console.error('Error parsing Volcengine TTS stream line:', e);
                }
            }
        });

        body.on('end', () => {
            // Flush whatever the streaming decoder is still holding back.
            pending += decoder.decode();
            if (pending.trim()) {
                try {
                    if (!consume(pending)) return;
                } catch (e) {
                    reject(new Error(`Error parsing final Volcengine TTS stream line: ${e}`, { cause: e }));
                    return;
                }
            }
            resolve(audioChunks);
        });

        body.on('error', (error) => {
            reject(new Error(`Error reading Volcengine TTS stream: ${error}`, { cause: error }));
        });
    });
}

/**
 * POST /generate-voice
 * Body: `provider_endpoint` (optional), `resource_id`, `text`, `voice_speaker`, `speed` (optional).
 * Responds with the concatenated audio as `audio/mpeg`, 403 when a secret is missing,
 * 400 when a required parameter is missing or the endpoint is not an http(s) URL,
 * and 500 when the upstream request or stream processing fails.
 */
router.post('/generate-voice', async (req, res) => {
    try {
        let provider_endpoint = req.body.provider_endpoint;
        if (!provider_endpoint) {
            console.warn('Volcengine endpoint not set, use default endpoint instead');
            provider_endpoint = DEFAULT_PROVIDER_ENDPOINT;
        }

        const appId = readSecret(req.user.directories, SECRET_KEYS.VOLCENGINE_APP_ID);
        const accessKey = readSecret(req.user.directories, SECRET_KEYS.VOLCENGINE_ACCESS_KEY);

        if (!appId || !accessKey) {
            console.warn('Volcengine generate-voice request missing required parameters appId or accessKey');
            return res.sendStatus(403);
        }

        // The stored credentials are sent to this endpoint, so at least insist on a well-formed http(s) URL.
        if (!isHttpUrl(provider_endpoint)) {
            console.warn('Volcengine generate-voice request has an invalid provider_endpoint');
            return res.sendStatus(400);
        }

        const resourceId = req.body.resource_id;
        const text = req.body.text;
        const voice_speaker = req.body.voice_speaker;

        if (!resourceId || !text || !voice_speaker) {
            console.warn('Volcengine generate-voice request missing required parameters resourceId or text or voice_speaker');
            return res.sendStatus(400);
        }

        const response = await fetch(provider_endpoint, {
            method: 'POST',
            headers: {
                'X-Api-App-Id': appId,
                'X-Api-Access-Key': accessKey,
                'X-Api-Resource-Id': resourceId,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                'req_params': {
                    'text': text,
                    'speaker': voice_speaker,
                    'audio_params': {
                        'format': 'mp3',
                        'speech_rate': toSpeechRate(req.body.speed),
                    },
                    // `additions` is sent as a JSON-encoded string, not as a nested object.
                    'additions': JSON.stringify({
                        'mute_cut_threshold': '400',
                        'mute_cut_remain_ms': '1',
                        'explicit_language': 'crosslingual',
                        'enable_language_detector': true,
                        'disable_markdown_filter': true,
                        'cache_config': {
                            'use_cache': true,
                            'text_type': 1,
                        },
                    }),
                },
            }),
        });

        if (!response.ok) {
            const logid = response.headers.get('X-Tt-Logid') || '';
            console.warn('Volcengine Request failed', response.status, response.statusText, logid);
            return res.header('X-Tt-Logid', logid).status(500).send(`TTS Generation Failed: ${response.statusText}`);
        }

        if (!response.body) {
            throw new Error('Response body is null');
        }

        const audioChunks = await collectAudioChunks(response.body);
        const finalAudioData = Buffer.concat(audioChunks);

        res.set('Content-Type', 'audio/mpeg');
        res.status(200).send(finalAudioData);
    } catch (error) {
        console.error('Volcengine generate-voice fetch failed', error);
        const reason = error instanceof Error ? error.message : String(error);
        res.status(500).send(`TTS Generation Failed: ${reason}`);
    }
});
+2
View File
@@ -48,6 +48,7 @@ import { router as azureRouter } from './endpoints/azure.js';
import { router as minimaxRouter } from './endpoints/minimax.js';
import { router as dataMaidRouter } from './endpoints/data-maid.js';
import { router as backupsRouter } from './endpoints/backups.js';
import { router as volcengineRouter } from './endpoints/volcengine.js';
/**
* @typedef {object} ServerStartupResult
@@ -174,6 +175,7 @@ export function setupPrivateEndpoints(app) {
app.use('/api/backends/chat-completions', chatCompletionsRouter);
app.use('/api/speech', speechRouter);
app.use('/api/azure', azureRouter);
app.use('/api/volcengine', volcengineRouter);
app.use('/api/minimax', minimaxRouter);
app.use('/api/data-maid', dataMaidRouter);
app.use('/api/backups', backupsRouter);