Add captioning for video attachments (#4749)

* Add captioning for video attachments

* Unify error toast titles

* Add MEDIA_SOURCE enum and update media handling to include source information

* Unify attachment handling logic

* Add error handling for auto-captioning failures

* Use string formatting for console error
This commit is contained in:
Cohee
2025-11-08 02:07:28 +02:00
committed by GitHub
parent 4fcdec47f4
commit 38679897c6
8 changed files with 116 additions and 35 deletions
+82 -27
View File
@@ -10,7 +10,7 @@ import { SlashCommand } from '../../slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
import { callGenericPopup, Popup, POPUP_TYPE } from '../../popup.js';
import { MEDIA_DISPLAY, MEDIA_TYPE, SCROLL_BEHAVIOR } from '../../constants.js';
import { MEDIA_DISPLAY, MEDIA_SOURCE, MEDIA_TYPE, SCROLL_BEHAVIOR } from '../../constants.js';
export { MODULE_NAME };
const MODULE_NAME = 'caption';
@@ -135,14 +135,18 @@ async function captionExistingMessage(message, mediaIndex) {
const mediaAttachment = message.extra.media[mediaIndex];
if (!mediaAttachment || !mediaAttachment.url || mediaAttachment.type === MEDIA_TYPE.VIDEO) {
if (!mediaAttachment || !mediaAttachment.url || mediaAttachment.type === MEDIA_TYPE.AUDIO) {
return;
}
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
throw new Error('Captioning videos is not supported for the current source.');
}
const imageData = await fetch(mediaAttachment.url);
const blob = await imageData.blob();
const type = imageData.headers.get('Content-Type');
const file = new File([blob], 'image.png', { type });
const fileName = mediaAttachment.url.split('/').pop().split('?')[0] || 'image.jpg';
const file = new File([blob], fileName, { type: blob.type });
const caption = await getCaptionForFile(file, null, true);
if (!caption) {
@@ -158,11 +162,12 @@ async function captionExistingMessage(message, mediaIndex) {
message.extra.inline_image = false;
message.mes = wrappedCaption;
mediaAttachment.title = wrappedCaption;
}
else {
mediaAttachment.captioned = true;
} else {
message.extra.inline_image = true;
mediaAttachment.append_title = true;
mediaAttachment.title = wrappedCaption;
mediaAttachment.captioned = true;
}
}
@@ -170,8 +175,10 @@ async function captionExistingMessage(message, mediaIndex) {
* Sends a captioned message to the chat.
* @param {string} caption Caption text
* @param {string} image Image URL
* @param {string} mimeType Image MIME type
* @returns {Promise<void>}
*/
async function sendCaptionedMessage(caption, image) {
async function sendCaptionedMessage(caption, image, mimeType) {
const messageText = await wrapCaptionTemplate(caption);
const context = getContext();
@@ -179,8 +186,10 @@ async function sendCaptionedMessage(caption, image) {
/** @type {MediaAttachment} */
const mediaAttachment = {
url: image,
type: MEDIA_TYPE.IMAGE,
type: MEDIA_TYPE.getFromMime(mimeType) || MEDIA_TYPE.IMAGE,
title: messageText,
captioned: true,
source: MEDIA_SOURCE.CAPTIONED,
};
/** @type {ChatMessage} */
const message = {
@@ -351,6 +360,10 @@ async function onSelectImage(e, prompt, quiet) {
*/
async function getCaptionForFile(file, prompt, quiet) {
try {
if (file.type.startsWith('video/') && !isVideoCaptioningAvailable()) {
throw new Error('Video captioning is not available for the current source.');
}
setSpinnerIcon();
const context = getContext();
const fileData = await getBase64Async(await ensureImageFormatSupported(file));
@@ -359,13 +372,13 @@ async function getCaptionForFile(file, prompt, quiet) {
const { caption } = await doCaptionRequest(base64Data, fileData, prompt);
if (!quiet) {
const imagePath = await saveBase64AsFile(base64Data, context.name2, '', extension);
await sendCaptionedMessage(caption, imagePath);
await sendCaptionedMessage(caption, imagePath, file.type);
}
return caption;
}
catch (error) {
const errorMessage = error.message || 'Unknown error';
toastr.error(errorMessage, 'Failed to caption image.');
toastr.error(errorMessage, 'Failed to caption');
console.error(error);
return '';
}
@@ -399,13 +412,18 @@ async function captionCommandCallback(args, prompt) {
toastr.error('The specified message does not contain an image.');
return '';
}
if (mediaAttachment.type === MEDIA_TYPE.VIDEO) {
toastr.error('The specified media is a video. Captioning videos is not supported.');
if (mediaAttachment.type === MEDIA_TYPE.AUDIO) {
toastr.error('The specified media is an audio file. Captioning audio files is not supported.');
return '';
}
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
toastr.error('The specified media is a video. Captioning videos is not supported for the current source.');
return '';
}
const fetchResult = await fetch(mediaAttachment.url);
const blob = await fetchResult.blob();
const file = new File([blob], 'image.jpg', { type: blob.type });
const fileName = mediaAttachment.url.split('/').pop().split('?')[0] || 'image.jpg';
const file = new File([blob], fileName, { type: blob.type });
return await getCaptionForFile(file, prompt, quiet);
} catch (error) {
toastr.error('Failed to get image from the message. Make sure the image is accessible.');
@@ -417,7 +435,7 @@ async function captionCommandCallback(args, prompt) {
return new Promise(resolve => {
const input = document.createElement('input');
input.type = 'file';
input.accept = 'image/*';
input.accept = 'image/*,video/*';
input.onchange = async (e) => {
const caption = await onSelectImage(e, prompt, quiet);
resolve(caption);
@@ -427,6 +445,18 @@ async function captionCommandCallback(args, prompt) {
});
}
/**
* Checks if video captioning is available for the current source.
* @returns {boolean} True if video captioning is supported for the current source.
*/
function isVideoCaptioningAvailable() {
if (extension_settings.caption.source !== 'multimodal') {
return false;
}
return ['google', 'vertexai'].includes(extension_settings.caption.multimodal_api);
}
jQuery(async function () {
function addSendPictureButton() {
const sendButton = $(`
@@ -515,13 +545,17 @@ jQuery(async function () {
});
}
function addPictureSendForm() {
const inputHtml = '<input id="img_file" type="file" hidden accept="image/*">';
const imgInput = document.createElement('input');
imgInput.type = 'file';
imgInput.id = 'img_file';
imgInput.accept = 'image/*,video/*';
imgInput.hidden = true;
imgInput.addEventListener('change', (e) => onSelectImage(e, '', false));
const imgForm = document.createElement('form');
imgForm.id = 'img_form';
$(imgForm).append(inputHtml);
$(imgForm).hide();
imgForm.appendChild(imgInput);
imgForm.hidden = true;
$('#form_sheld').append(imgForm);
$('#img_file').on('change', (e) => onSelectImage(e.originalEvent, '', false));
}
async function switchMultimodalBlocks() {
await addRemoteEndpointModels();
@@ -668,13 +702,33 @@ jQuery(async function () {
saveSettingsDebounced();
});
const onMessageEvent = async (messageId) => {
const onMessageEvent = async (/** @type {number} */ messageId) => {
if (!extension_settings.caption.auto_mode) {
return;
}
const message = getContext().chat[messageId];
await captionExistingMessage(message, 0);
if (Array.isArray(message?.extra?.media) && message.extra.media.length > 0) {
for (let mediaIndex = 0; mediaIndex < message.extra.media.length; mediaIndex++) {
const mediaAttachment = message.extra.media[mediaIndex];
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
continue;
}
if (mediaAttachment.type === MEDIA_TYPE.AUDIO) {
continue;
}
// Skip already captioned images and non-uploaded (generated, etc.) images
if (mediaAttachment.source !== MEDIA_SOURCE.UPLOAD || mediaAttachment.captioned) {
continue;
}
try {
await captionExistingMessage(message, mediaIndex);
} catch (e) {
console.error(`Auto-captioning failed for message ID ${messageId}, media index ${mediaIndex}`, e);
continue;
}
}
}
};
eventSource.on(event_types.MESSAGE_SENT, onMessageEvent);
@@ -683,21 +737,22 @@ jQuery(async function () {
$(document).on('click', '.mes_img_caption', async function () {
const animationClass = 'fa-fade';
const messageBlock = $(this).closest('.mes');
const imageBlock = $(this).closest('.mes_img_container');
const messageImg = imageBlock.find('.mes_img');
if (messageImg.hasClass(animationClass)) return;
messageImg.addClass(animationClass);
const mediaContainer = $(this).closest('.mes_media_container');
const messageMedia = mediaContainer.find('.mes_img, .mes_video');
if (messageMedia.hasClass(animationClass)) return;
messageMedia.addClass(animationClass);
try {
const messageId = Number(messageBlock.attr('mesid'));
const imageIndex = Number(imageBlock.attr('data-index'));
const mediaIndex = Number(mediaContainer.attr('data-index'));
const data = getContext().chat[messageId];
await captionExistingMessage(data, imageIndex);
await captionExistingMessage(data, mediaIndex);
appendMediaToMessage(data, messageBlock, SCROLL_BEHAVIOR.KEEP);
await saveChatConditional();
} catch (e) {
console.error('Message image recaption failed', e);
toastr.error(e.message || 'Unknown error', 'Failed to caption');
} finally {
messageImg.removeClass(animationClass);
messageMedia.removeClass(animationClass);
}
});