Add captioning for video attachments (#4749)
* Add captioning for video attachments * Unify error toast titles * Add MEDIA_SOURCE enum and update media handling to include source information * Unify attachment handling logic * Add error handling for auto-captioning failures * Use string formatting for console error
This commit is contained in:
@@ -10,7 +10,7 @@ import { SlashCommand } from '../../slash-commands/SlashCommand.js';
|
||||
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
|
||||
import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
|
||||
import { callGenericPopup, Popup, POPUP_TYPE } from '../../popup.js';
|
||||
import { MEDIA_DISPLAY, MEDIA_TYPE, SCROLL_BEHAVIOR } from '../../constants.js';
|
||||
import { MEDIA_DISPLAY, MEDIA_SOURCE, MEDIA_TYPE, SCROLL_BEHAVIOR } from '../../constants.js';
|
||||
export { MODULE_NAME };
|
||||
|
||||
const MODULE_NAME = 'caption';
|
||||
@@ -135,14 +135,18 @@ async function captionExistingMessage(message, mediaIndex) {
|
||||
|
||||
const mediaAttachment = message.extra.media[mediaIndex];
|
||||
|
||||
if (!mediaAttachment || !mediaAttachment.url || mediaAttachment.type === MEDIA_TYPE.VIDEO) {
|
||||
if (!mediaAttachment || !mediaAttachment.url || mediaAttachment.type === MEDIA_TYPE.AUDIO) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
|
||||
throw new Error('Captioning videos is not supported for the current source.');
|
||||
}
|
||||
|
||||
const imageData = await fetch(mediaAttachment.url);
|
||||
const blob = await imageData.blob();
|
||||
const type = imageData.headers.get('Content-Type');
|
||||
const file = new File([blob], 'image.png', { type });
|
||||
const fileName = mediaAttachment.url.split('/').pop().split('?')[0] || 'image.jpg';
|
||||
const file = new File([blob], fileName, { type: blob.type });
|
||||
const caption = await getCaptionForFile(file, null, true);
|
||||
|
||||
if (!caption) {
|
||||
@@ -158,11 +162,12 @@ async function captionExistingMessage(message, mediaIndex) {
|
||||
message.extra.inline_image = false;
|
||||
message.mes = wrappedCaption;
|
||||
mediaAttachment.title = wrappedCaption;
|
||||
}
|
||||
else {
|
||||
mediaAttachment.captioned = true;
|
||||
} else {
|
||||
message.extra.inline_image = true;
|
||||
mediaAttachment.append_title = true;
|
||||
mediaAttachment.title = wrappedCaption;
|
||||
mediaAttachment.captioned = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -170,8 +175,10 @@ async function captionExistingMessage(message, mediaIndex) {
|
||||
* Sends a captioned message to the chat.
|
||||
* @param {string} caption Caption text
|
||||
* @param {string} image Image URL
|
||||
* @param {string} mimeType Image MIME type
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async function sendCaptionedMessage(caption, image) {
|
||||
async function sendCaptionedMessage(caption, image, mimeType) {
|
||||
const messageText = await wrapCaptionTemplate(caption);
|
||||
|
||||
const context = getContext();
|
||||
@@ -179,8 +186,10 @@ async function sendCaptionedMessage(caption, image) {
|
||||
/** @type {MediaAttachment} */
|
||||
const mediaAttachment = {
|
||||
url: image,
|
||||
type: MEDIA_TYPE.IMAGE,
|
||||
type: MEDIA_TYPE.getFromMime(mimeType) || MEDIA_TYPE.IMAGE,
|
||||
title: messageText,
|
||||
captioned: true,
|
||||
source: MEDIA_SOURCE.CAPTIONED,
|
||||
};
|
||||
/** @type {ChatMessage} */
|
||||
const message = {
|
||||
@@ -351,6 +360,10 @@ async function onSelectImage(e, prompt, quiet) {
|
||||
*/
|
||||
async function getCaptionForFile(file, prompt, quiet) {
|
||||
try {
|
||||
if (file.type.startsWith('video/') && !isVideoCaptioningAvailable()) {
|
||||
throw new Error('Video captioning is not available for the current source.');
|
||||
}
|
||||
|
||||
setSpinnerIcon();
|
||||
const context = getContext();
|
||||
const fileData = await getBase64Async(await ensureImageFormatSupported(file));
|
||||
@@ -359,13 +372,13 @@ async function getCaptionForFile(file, prompt, quiet) {
|
||||
const { caption } = await doCaptionRequest(base64Data, fileData, prompt);
|
||||
if (!quiet) {
|
||||
const imagePath = await saveBase64AsFile(base64Data, context.name2, '', extension);
|
||||
await sendCaptionedMessage(caption, imagePath);
|
||||
await sendCaptionedMessage(caption, imagePath, file.type);
|
||||
}
|
||||
return caption;
|
||||
}
|
||||
catch (error) {
|
||||
const errorMessage = error.message || 'Unknown error';
|
||||
toastr.error(errorMessage, 'Failed to caption image.');
|
||||
toastr.error(errorMessage, 'Failed to caption');
|
||||
console.error(error);
|
||||
return '';
|
||||
}
|
||||
@@ -399,13 +412,18 @@ async function captionCommandCallback(args, prompt) {
|
||||
toastr.error('The specified message does not contain an image.');
|
||||
return '';
|
||||
}
|
||||
if (mediaAttachment.type === MEDIA_TYPE.VIDEO) {
|
||||
toastr.error('The specified media is a video. Captioning videos is not supported.');
|
||||
if (mediaAttachment.type === MEDIA_TYPE.AUDIO) {
|
||||
toastr.error('The specified media is an audio file. Captioning audio files is not supported.');
|
||||
return '';
|
||||
}
|
||||
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
|
||||
toastr.error('The specified media is a video. Captioning videos is not supported for the current source.');
|
||||
return '';
|
||||
}
|
||||
const fetchResult = await fetch(mediaAttachment.url);
|
||||
const blob = await fetchResult.blob();
|
||||
const file = new File([blob], 'image.jpg', { type: blob.type });
|
||||
const fileName = mediaAttachment.url.split('/').pop().split('?')[0] || 'image.jpg';
|
||||
const file = new File([blob], fileName, { type: blob.type });
|
||||
return await getCaptionForFile(file, prompt, quiet);
|
||||
} catch (error) {
|
||||
toastr.error('Failed to get image from the message. Make sure the image is accessible.');
|
||||
@@ -417,7 +435,7 @@ async function captionCommandCallback(args, prompt) {
|
||||
return new Promise(resolve => {
|
||||
const input = document.createElement('input');
|
||||
input.type = 'file';
|
||||
input.accept = 'image/*';
|
||||
input.accept = 'image/*,video/*';
|
||||
input.onchange = async (e) => {
|
||||
const caption = await onSelectImage(e, prompt, quiet);
|
||||
resolve(caption);
|
||||
@@ -427,6 +445,18 @@ async function captionCommandCallback(args, prompt) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if video captioning is available for the current source.
|
||||
* @returns {boolean} True if video captioning is supported for the current source.
|
||||
*/
|
||||
function isVideoCaptioningAvailable() {
|
||||
if (extension_settings.caption.source !== 'multimodal') {
|
||||
return false;
|
||||
}
|
||||
|
||||
return ['google', 'vertexai'].includes(extension_settings.caption.multimodal_api);
|
||||
}
|
||||
|
||||
jQuery(async function () {
|
||||
function addSendPictureButton() {
|
||||
const sendButton = $(`
|
||||
@@ -515,13 +545,17 @@ jQuery(async function () {
|
||||
});
|
||||
}
|
||||
function addPictureSendForm() {
|
||||
const inputHtml = '<input id="img_file" type="file" hidden accept="image/*">';
|
||||
const imgInput = document.createElement('input');
|
||||
imgInput.type = 'file';
|
||||
imgInput.id = 'img_file';
|
||||
imgInput.accept = 'image/*,video/*';
|
||||
imgInput.hidden = true;
|
||||
imgInput.addEventListener('change', (e) => onSelectImage(e, '', false));
|
||||
const imgForm = document.createElement('form');
|
||||
imgForm.id = 'img_form';
|
||||
$(imgForm).append(inputHtml);
|
||||
$(imgForm).hide();
|
||||
imgForm.appendChild(imgInput);
|
||||
imgForm.hidden = true;
|
||||
$('#form_sheld').append(imgForm);
|
||||
$('#img_file').on('change', (e) => onSelectImage(e.originalEvent, '', false));
|
||||
}
|
||||
async function switchMultimodalBlocks() {
|
||||
await addRemoteEndpointModels();
|
||||
@@ -668,13 +702,33 @@ jQuery(async function () {
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
const onMessageEvent = async (messageId) => {
|
||||
const onMessageEvent = async (/** @type {number} */ messageId) => {
|
||||
if (!extension_settings.caption.auto_mode) {
|
||||
return;
|
||||
}
|
||||
|
||||
const message = getContext().chat[messageId];
|
||||
await captionExistingMessage(message, 0);
|
||||
if (Array.isArray(message?.extra?.media) && message.extra.media.length > 0) {
|
||||
for (let mediaIndex = 0; mediaIndex < message.extra.media.length; mediaIndex++) {
|
||||
const mediaAttachment = message.extra.media[mediaIndex];
|
||||
if (mediaAttachment.type === MEDIA_TYPE.VIDEO && !isVideoCaptioningAvailable()) {
|
||||
continue;
|
||||
}
|
||||
if (mediaAttachment.type === MEDIA_TYPE.AUDIO) {
|
||||
continue;
|
||||
}
|
||||
// Skip already captioned images and non-uploaded (generated, etc.) images
|
||||
if (mediaAttachment.source !== MEDIA_SOURCE.UPLOAD || mediaAttachment.captioned) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
await captionExistingMessage(message, mediaIndex);
|
||||
} catch (e) {
|
||||
console.error(`Auto-captioning failed for message ID ${messageId}, media index ${mediaIndex}`, e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
eventSource.on(event_types.MESSAGE_SENT, onMessageEvent);
|
||||
@@ -683,21 +737,22 @@ jQuery(async function () {
|
||||
$(document).on('click', '.mes_img_caption', async function () {
|
||||
const animationClass = 'fa-fade';
|
||||
const messageBlock = $(this).closest('.mes');
|
||||
const imageBlock = $(this).closest('.mes_img_container');
|
||||
const messageImg = imageBlock.find('.mes_img');
|
||||
if (messageImg.hasClass(animationClass)) return;
|
||||
messageImg.addClass(animationClass);
|
||||
const mediaContainer = $(this).closest('.mes_media_container');
|
||||
const messageMedia = mediaContainer.find('.mes_img, .mes_video');
|
||||
if (messageMedia.hasClass(animationClass)) return;
|
||||
messageMedia.addClass(animationClass);
|
||||
try {
|
||||
const messageId = Number(messageBlock.attr('mesid'));
|
||||
const imageIndex = Number(imageBlock.attr('data-index'));
|
||||
const mediaIndex = Number(mediaContainer.attr('data-index'));
|
||||
const data = getContext().chat[messageId];
|
||||
await captionExistingMessage(data, imageIndex);
|
||||
await captionExistingMessage(data, mediaIndex);
|
||||
appendMediaToMessage(data, messageBlock, SCROLL_BEHAVIOR.KEEP);
|
||||
await saveChatConditional();
|
||||
} catch (e) {
|
||||
console.error('Message image recaption failed', e);
|
||||
toastr.error(e.message || 'Unknown error', 'Failed to caption');
|
||||
} finally {
|
||||
messageImg.removeClass(animationClass);
|
||||
messageMedia.removeClass(animationClass);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user