From a78dfe9172c6442667818076eae45c9d52b9daa6 Mon Sep 17 00:00:00 2001 From: Edward Ly Date: Tue, 16 Jun 2026 18:04:22 -0700 Subject: [PATCH 1/3] feat: implement generate subtitles provider Signed-off-by: Edward Ly --- lib/AppInfo/Application.php | 8 + lib/Service/OpenAiAPIService.php | 37 +++- .../AudioToTextSubtitlesProvider.php | 158 ++++++++++++++++++ 3 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 lib/TaskProcessing/AudioToTextSubtitlesProvider.php diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 80b30dc2..107f9332 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -15,6 +15,7 @@ use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType; use OCA\OpenAi\TaskProcessing\AudioToTextEnhancedProvider; use OCA\OpenAi\TaskProcessing\AudioToTextProvider; +use OCA\OpenAi\TaskProcessing\AudioToTextSubtitlesProvider; use OCA\OpenAi\TaskProcessing\ChangeToneProvider; use OCA\OpenAi\TaskProcessing\ContextWriteProvider; use OCA\OpenAi\TaskProcessing\EmojiProvider; @@ -51,6 +52,10 @@ class Application extends App implements IBootstrap { 'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer', 'verse' ]; + public const DEFAULT_SUBTITLE_FORMAT = 'srt'; + public const DEFAULT_SUBTITLE_FORMATS = [ + 'srt', 'vtt' + ]; public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024'; public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10; public const DEFAULT_CHUNK_SIZE = 10000; @@ -116,6 +121,9 @@ public function register(IRegistrationContext $context): void { } if ($sttProviderEnabled) { $context->registerTaskProcessingProvider(AudioToTextProvider::class); + if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToTextSubtitles')) { + $context->registerTaskProcessingProvider(AudioToTextSubtitlesProvider::class); + } if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextReformatParagraphs')) { $context->registerTaskProcessingProvider(AudioToTextEnhancedProvider::class); } diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php index bc798a68..1cca0094 100644 --- a/lib/Service/OpenAiAPIService.php +++ b/lib/Service/OpenAiAPIService.php @@ -865,6 +865,7 @@ public function transcribeBase64Mp3( * @param bool $translate * @param string $model * @param string $language + * @param string $responseFormat * @return string * @throws Exception */ @@ -874,9 +875,10 @@ public function transcribeFile( bool $translate = false, string $model = Application::DEFAULT_MODEL_ID, string $language = 'default', + string $responseFormat = 'verbose_json', ): string { try { - $transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language); + $transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language, $responseFormat); } catch (NotPermittedException|LockedException|GenericFileException $e) { $this->logger->warning('Could not read audio file: ' . $file->getPath() . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]); throw new Exception($this->l10n->t('Could not read audio file.'), Http::STATUS_INTERNAL_SERVER_ERROR); @@ -891,6 +893,7 @@ public function transcribeFile( * @param bool $translate * @param string $model * @param string $language + * @param string $responseFormat * @return string * @throws Exception */ @@ -900,6 +903,7 @@ public function transcribe( bool $translate = true, string $model = Application::DEFAULT_MODEL_ID, string $language = 'default', + string $responseFormat = 'verbose_json', // Verbose needed for extraction of audio duration ): string { if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TRANSCRIPTION)) { throw new Exception($this->l10n->t('Audio transcription quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS); @@ -912,8 +916,7 @@ public function transcribe( $params = [ 'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_TRANSCRIPTION_MODEL_ID : $model, 'file' => $audioFileContent, - 'response_format' => 'verbose_json', - // Verbose needed for extraction of audio duration + 'response_format' => $responseFormat, ]; // Gets the user's preferred language if it's not the default one if ($language === 'default') { @@ -927,6 +930,34 @@ public function transcribe( $response = $this->request($userId, $endpoint, $params, 'POST', $contentType, serviceType: Application::SERVICE_TYPE_STT); + if (in_array($responseFormat, Application::DEFAULT_SUBTITLE_FORMATS)) { + if (!isset($response['body'])) { + $this->logger->warning('Audio subtitling error: ' . json_encode($response)); + throw new Exception($this->l10n->t('Unknown audio subtitling error'), Http::STATUS_INTERNAL_SERVER_ERROR); + } + + // Extract audio duration from response and store it as quota usage: + $matches = []; + $isMatch = preg_match_all('/(\d\d):(\d\d):(\d\d)[\.,](\d\d\d)/', $response['body'], $matches, PREG_SET_ORDER); + + if ($isMatch !== false && $isMatch > 0) { + $lastTimestamp = end($matches); + $hours = intval($lastTimestamp[1]); + $minutes = intval($lastTimestamp[2]); + $seconds = intval($lastTimestamp[3]); + $millisecondAdjustment = intval(round(floatval($lastTimestamp[4]) / 1000.0)); + $audioDuration = ($hours * 3600) + ($minutes * 60) + $seconds + $millisecondAdjustment; + + try { + $this->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_TRANSCRIPTION, $audioDuration); + } catch (DBException $e) { + $this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_TRANSCRIPTION . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]); + } + } + + return $response['body']; + } + if (!isset($response['text'])) { $this->logger->warning('Audio transcription error: ' . json_encode($response)); throw new Exception($this->l10n->t('Unknown audio trancription error'), Http::STATUS_INTERNAL_SERVER_ERROR); diff --git a/lib/TaskProcessing/AudioToTextSubtitlesProvider.php b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php new file mode 100644 index 00000000..5a21ae23 --- /dev/null +++ b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php @@ -0,0 +1,158 @@ +openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT); + } + + public function getTaskTypeId(): string { + return AudioToTextSubtitles::ID; + } + + public function getExpectedRuntime(): int { + return $this->openAiAPIService->getExpTextProcessingTime(); + } + + public function getInputShapeEnumValues(): array { + return []; + } + + public function getInputShapeDefaults(): array { + return []; + } + + public function getOptionalInputShape(): array { + return [ + 'language' => new ShapeDescriptor( + $this->l->t('Language'), + $this->l->t('The language of the audio file'), + EShapeType::Enum + ), + 'format' => new ShapeDescriptor( + $this->l->t('File format'), + $this->l->t('The format of the subtitles file'), + EShapeType::Enum + ), + ]; + } + + public function getOptionalInputShapeEnumValues(): array { + $languageEnumValues = array_map(static function (array $language) { + return new ShapeEnumValue($language[1], $language[0]); + }, Application::LANGUAGE_CODES_AND_ENDONYMS); + $detectLanguageEnumValue = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language'); + $defaultLanguageEnumValue = new ShapeEnumValue($this->l->t('Default'), 'default'); + return [ + 'language' => array_merge([$detectLanguageEnumValue, $defaultLanguageEnumValue], $languageEnumValues), + 'format' => [ + new ShapeEnumValue($this->l->t('SubRip Text'), 'srt'), + new ShapeEnumValue($this->l->t('WebVTT'), 'vtt'), + ], + ]; + } + + public function getOptionalInputShapeDefaults(): array { + return [ + 'language' => 'default', + 'format' => 'srt', + ]; + } + + public function getOutputShapeEnumValues(): array { + return []; + } + + public function getOptionalOutputShape(): array { + return []; + } + + public function getOptionalOutputShapeEnumValues(): array { + return []; + } + + public function process(?string $userId, array $input, callable $reportProgress): array { + if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) { + throw new ProcessingException('Invalid input file'); + } + + $fileSize = intval($input['input']->getSize()); + // Maximum file size for OpenAI is 25MB. (https://developers.openai.com/api/docs/guides/speech-to-text) + if ($fileSize > 25 * 1000 * 1000) { + throw new ProcessingException('Filesize of input too large. Max is 25MB'); + } + + $fileType = $input['input']->getMimeType(); + if (!str_starts_with($fileType, 'audio/')) { + throw new ProcessingException('Invalid input file type ' . $fileType); + } + if ($this->openAiAPIService->isUsingOpenAi()) { + $validFileTypes = [ + 'audio/mp3', + 'audio/mp4', + 'audio/mpeg', + 'audio/mpga', + 'audio/m4a', + 'audio/wav', + 'audio/webm', + ]; + if (!in_array($fileType, $validFileTypes)) { + throw new ProcessingException('Invalid input file type for OpenAI ' . $fileType); + } + } + + $inputFile = $input['input']; + $format = $input['format']; + $language = $input['language'] ?? 'default'; + if (!is_string($language)) { + throw new ProcessingException('Invalid language'); + } + + $model = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID; + + try { + $transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model, $language, $format); + return ['output' => $transcription]; + } catch (UserFacingProcessingException $e) { + throw $e; + } catch (\Throwable $e) { + $this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new ProcessingException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage()); + } + } +} From 7e06c3b21132d1a2169fb9edb1aa4d905468e115 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Wed, 24 Jun 2026 14:27:12 +0200 Subject: [PATCH 2/3] fix: ignore psalm error about missing AudioToTextSubtitles class, it will get in nextcloud/ocp soon Signed-off-by: Julien Veyssier --- psalm.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/psalm.xml b/psalm.xml index 021f01c9..a3832f2d 100644 --- a/psalm.xml +++ b/psalm.xml @@ -48,6 +48,7 @@ + From 00f8dcc0426bbe351de78cee26fe2e8c240a3152 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 25 Jun 2026 10:56:43 +0200 Subject: [PATCH 3/3] feat: add user facing errors in AudioToTextSubtitles provider Signed-off-by: Julien Veyssier --- .../AudioToTextSubtitlesProvider.php | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/TaskProcessing/AudioToTextSubtitlesProvider.php b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php index 5a21ae23..864e87c7 100644 --- a/lib/TaskProcessing/AudioToTextSubtitlesProvider.php +++ b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php @@ -114,12 +114,22 @@ public function process(?string $userId, array $input, callable $reportProgress) $fileSize = intval($input['input']->getSize()); // Maximum file size for OpenAI is 25MB. (https://developers.openai.com/api/docs/guides/speech-to-text) if ($fileSize > 25 * 1000 * 1000) { - throw new ProcessingException('Filesize of input too large. Max is 25MB'); + throw new UserFacingProcessingException( + 'Filesize of input is too large. Max is 25MB', + 0, + null, + $this->l->t('The input file size is too large. A maximum of 25MB is allowed.'), + ); } $fileType = $input['input']->getMimeType(); if (!str_starts_with($fileType, 'audio/')) { - throw new ProcessingException('Invalid input file type ' . $fileType); + throw new UserFacingProcessingException( + 'Invalid input file type ' . $fileType, + 0, + null, + $this->l->t('The input file type is invalid. Only audio files are allowed.'), + ); } if ($this->openAiAPIService->isUsingOpenAi()) { $validFileTypes = [