nextcloud · edward-ly · Jun 17, 2026 · Jun 24, 2026 · Jun 25, 2026 · kyteinsky
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
@@ -15,6 +15,7 @@
 use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType;
 use OCA\OpenAi\TaskProcessing\AudioToTextEnhancedProvider;
 use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
+use OCA\OpenAi\TaskProcessing\AudioToTextSubtitlesProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
 use OCA\OpenAi\TaskProcessing\ContextWriteProvider;
 use OCA\OpenAi\TaskProcessing\EmojiProvider;
@@ -51,6 +52,10 @@ class Application extends App implements IBootstrap {
 		'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable',
 		'onyx', 'nova', 'sage', 'shimmer', 'verse'
 	];
+	public const DEFAULT_SUBTITLE_FORMAT = 'srt';
+	public const DEFAULT_SUBTITLE_FORMATS = [
+		'srt', 'vtt'
+	];
 	public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024';
 	public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10;
 	public const DEFAULT_CHUNK_SIZE = 10000;
@@ -116,6 +121,9 @@ public function register(IRegistrationContext $context): void {
 		}
 		if ($sttProviderEnabled) {
 			$context->registerTaskProcessingProvider(AudioToTextProvider::class);
+			if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToTextSubtitles')) {
+				$context->registerTaskProcessingProvider(AudioToTextSubtitlesProvider::class);
+			}
 			if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextReformatParagraphs')) {
 				$context->registerTaskProcessingProvider(AudioToTextEnhancedProvider::class);
 			}

diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
@@ -865,6 +865,7 @@ public function transcribeBase64Mp3(
 	 * @param bool $translate
 	 * @param string $model
 	 * @param string $language
+	 * @param string $responseFormat
 	 * @return string
 	 * @throws Exception
 	 */
@@ -874,9 +875,10 @@ public function transcribeFile(
 		bool $translate = false,
 		string $model = Application::DEFAULT_MODEL_ID,
 		string $language = 'default',
+		string $responseFormat = 'verbose_json',
 	): string {
 		try {
-			$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language);
+			$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language, $responseFormat);
 		} catch (NotPermittedException|LockedException|GenericFileException $e) {
 			$this->logger->warning('Could not read audio file: ' . $file->getPath() . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
 			throw new Exception($this->l10n->t('Could not read audio file.'), Http::STATUS_INTERNAL_SERVER_ERROR);
@@ -891,6 +893,7 @@ public function transcribeFile(
 	 * @param bool $translate
 	 * @param string $model
 	 * @param string $language
+	 * @param string $responseFormat
 	 * @return string
 	 * @throws Exception
 	 */
@@ -900,6 +903,7 @@ public function transcribe(
 		bool $translate = true,
 		string $model = Application::DEFAULT_MODEL_ID,
 		string $language = 'default',
+		string $responseFormat = 'verbose_json', // Verbose needed for extraction of audio duration
 	): string {
 		if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TRANSCRIPTION)) {
 			throw new Exception($this->l10n->t('Audio transcription quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -912,8 +916,7 @@ public function transcribe(
 		$params = [
 			'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_TRANSCRIPTION_MODEL_ID : $model,
 			'file' => $audioFileContent,
-			'response_format' => 'verbose_json',
-			// Verbose needed for extraction of audio duration
+			'response_format' => $responseFormat,
 		];
 		// Gets the user's preferred language if it's not the default one
 		if ($language === 'default') {
@@ -927,6 +930,40 @@ public function transcribe(
 
 		$response = $this->request($userId, $endpoint, $params, 'POST', $contentType, serviceType: Application::SERVICE_TYPE_STT);
 
+		if (in_array($responseFormat, Application::DEFAULT_SUBTITLE_FORMATS, true)) {
+			// Subtitle responses are read from 'body' (the transcription path below reads 'text')
+			if (!isset($response['body']) || !is_string($response['body'])) {
+				$this->logger->warning('Audio subtitling error: ' . json_encode($response));
+				throw new Exception($this->l10n->t('Unknown audio subtitling error'), Http::STATUS_INTERNAL_SERVER_ERROR);
+			}
+
+			// Extract audio duration from response and store it as quota usage.
+			// Only cue end times ("--> [hh:]mm:ss.ttt") are matched: WebVTT allows omitting the
+			// hours, and timestamps that merely appear in the spoken text must not count.
+			$matches = [];
+			$matchCount = preg_match_all('/-->[ \t]*(?:(\d{2,}):)?(\d\d):(\d\d)[.,](\d\d\d)/', $response['body'], $matches, PREG_SET_ORDER);
+
+			if ($matchCount !== false && $matchCount > 0) {
+				// Use the latest cue end as the duration, rounded to whole seconds
+				$audioDuration = 0;
+				foreach ($matches as $cueEnd) {
+					$cueEndSeconds = (intval($cueEnd[1]) * 3600) + (intval($cueEnd[2]) * 60) + intval($cueEnd[3])
+						+ intval(round(intval($cueEnd[4]) / 1000));
+					$audioDuration = max($audioDuration, $cueEndSeconds);
+				}
+
+				try {
+					$this->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_TRANSCRIPTION, $audioDuration);
+				} catch (DBException $e) {
+					$this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_TRANSCRIPTION . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
+				}
+			} else {
+				$this->logger->warning('Could not extract audio duration from subtitles, no transcription quota usage recorded', ['app' => Application::APP_ID]);
+			}
+
+			return $response['body'];
+		}
+
 		if (!isset($response['text'])) {
 			$this->logger->warning('Audio transcription error: ' . json_encode($response));
 			throw new Exception($this->l10n->t('Unknown audio trancription error'), Http::STATUS_INTERNAL_SERVER_ERROR);

diff --git a/lib/TaskProcessing/AudioToTextSubtitlesProvider.php b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php
@@ -0,0 +1,200 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+namespace OCA\OpenAi\TaskProcessing;
+
+use OCA\OpenAi\AppInfo\Application;
+use OCA\OpenAi\Service\OpenAiAPIService;
+use OCP\Files\File;
+use OCP\IAppConfig;
+use OCP\IL10N;
+use OCP\TaskProcessing\EShapeType;
+use OCP\TaskProcessing\Exception\ProcessingException;
+use OCP\TaskProcessing\Exception\UserFacingProcessingException;
+use OCP\TaskProcessing\ISynchronousProvider;
+use OCP\TaskProcessing\ShapeDescriptor;
+use OCP\TaskProcessing\ShapeEnumValue;
+use OCP\TaskProcessing\TaskTypes\AudioToTextSubtitles;
+use Psr\Log\LoggerInterface;
+
+/**
+ * Synchronous task processing provider for the AudioToTextSubtitles task type.
+ *
+ * Sends the input audio file to OpenAiAPIService::transcribeFile() and returns
+ * the resulting subtitles text in the requested format ('srt' or 'vtt').
+ */
+class AudioToTextSubtitlesProvider implements ISynchronousProvider {
+
+	public function __construct(
+		private OpenAiAPIService $openAiAPIService,
+		private LoggerInterface $logger,
+		private IAppConfig $appConfig,
+		private IL10N $l,
+	) {
+	}
+
+	/** Provider ID: the app ID suffixed with '-audio2text-subtitles'. */
+	public function getId(): string {
+		return Application::APP_ID . '-audio2text-subtitles';
+	}
+
+	/** Name returned by OpenAiAPIService::getServiceName() for SERVICE_TYPE_STT. */
+	public function getName(): string {
+		return $this->openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT);
+	}
+
+	public function getTaskTypeId(): string {
+		return AudioToTextSubtitles::ID;
+	}
+
+	public function getExpectedRuntime(): int {
+		return $this->openAiAPIService->getExpTextProcessingTime();
+	}
+
+	public function getInputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getInputShapeDefaults(): array {
+		return [];
+	}
+
+	/**
+	 * Optional inputs: the language of the audio and the subtitles file format.
+	 */
+	public function getOptionalInputShape(): array {
+		return [
+			'language' => new ShapeDescriptor(
+				$this->l->t('Language'),
+				$this->l->t('The language of the audio file'),
+				EShapeType::Enum
+			),
+			'format' => new ShapeDescriptor(
+				$this->l->t('File format'),
+				$this->l->t('The format of the subtitles file'),
+				EShapeType::Enum
+			),
+		];
+	}
+
+	/**
+	 * Allowed values for the optional inputs. The 'format' values must stay in sync
+	 * with Application::DEFAULT_SUBTITLE_FORMATS, which process() validates against.
+	 */
+	public function getOptionalInputShapeEnumValues(): array {
+		$languageEnumValues = array_map(static function (array $language) {
+			return new ShapeEnumValue($language[1], $language[0]);
+		}, Application::LANGUAGE_CODES_AND_ENDONYMS);
+		$detectLanguageEnumValue = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language');
+		$defaultLanguageEnumValue = new ShapeEnumValue($this->l->t('Default'), 'default');
+		return [
+			'language' => array_merge([$detectLanguageEnumValue, $defaultLanguageEnumValue], $languageEnumValues),
+			'format' => [
+				new ShapeEnumValue($this->l->t('SubRip Text'), 'srt'),
+				new ShapeEnumValue($this->l->t('WebVTT'), 'vtt'),
+			],
+		];
+	}
+
+	public function getOptionalInputShapeDefaults(): array {
+		return [
+			'language' => 'default',
+			'format' => Application::DEFAULT_SUBTITLE_FORMAT,
+		];
+	}
+
+	public function getOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getOptionalOutputShape(): array {
+		return [];
+	}
+
+	public function getOptionalOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Validate the task input and request the subtitles from the API service.
+	 *
+	 * @param string|null $userId User on whose behalf the request is made
+	 * @param array $input Task input: 'input' (audio File), optional 'language' and 'format'
+	 * @param callable $reportProgress Unused by this provider
+	 * @return array{output: string} The subtitles text
+	 * @throws UserFacingProcessingException If the file is too large or is not an audio file
+	 * @throws ProcessingException If the input is otherwise invalid or the transcription fails
+	 */
+	public function process(?string $userId, array $input, callable $reportProgress): array {
+		if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
+			throw new ProcessingException('Invalid input file');
+		}
+		$inputFile = $input['input'];
+
+		$fileSize = intval($inputFile->getSize());
+		// Maximum file size for OpenAI is 25MB. (https://developers.openai.com/api/docs/guides/speech-to-text)
+		if ($fileSize > 25 * 1000 * 1000) {
+			throw new UserFacingProcessingException(
+				'Filesize of input is too large. Max is 25MB',
+				0,
+				null,
+				$this->l->t('The input file size is too large. A maximum of 25MB is allowed.'),
+			);
+		}
+
+		$fileType = $inputFile->getMimeType();
+		if (!str_starts_with($fileType, 'audio/')) {
+			throw new UserFacingProcessingException(
+				'Invalid input file type ' . $fileType,
+				0,
+				null,
+				$this->l->t('The input file type is invalid. Only audio files are allowed.'),
+			);
+		}
+		if ($this->openAiAPIService->isUsingOpenAi()) {
+			$validFileTypes = [
+				'audio/mp3',
+				'audio/mp4',
+				'audio/mpeg',
+				'audio/mpga',
+				'audio/m4a',
+				'audio/wav',
+				'audio/webm',
+			];
+			if (!in_array($fileType, $validFileTypes, true)) {
+				throw new ProcessingException('Invalid input file type for OpenAI ' . $fileType);
+			}
+		}
+
+		// 'format' is an optional input, so fall back to the default when it is absent and
+		// reject anything that is not a subtitle format: other response formats make
+		// transcribe() take its plain-transcription code path.
+		$format = $input['format'] ?? Application::DEFAULT_SUBTITLE_FORMAT;
+		if (!is_string($format) || !in_array($format, Application::DEFAULT_SUBTITLE_FORMATS, true)) {
+			throw new ProcessingException('Invalid subtitle format');
+		}
+		$language = $input['language'] ?? 'default';
+		if (!is_string($language)) {
+			throw new ProcessingException('Invalid language');
+		}
+
+		$model = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID;
+
+		try {
+			$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model, $language, $format);
+			return ['output' => $transcription];
+		} catch (UserFacingProcessingException $e) {
+			throw $e;
+		} catch (\Throwable $e) {
+			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			// Keep the original exception attached as the cause
+			throw new ProcessingException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), 0, $e);
+		}
+	}
+}
diff --git a/psalm.xml b/psalm.xml
@@ -48,6 +48,7 @@
 				<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextReformatParagraphs" />
 				<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextChatWithTools" />
 				<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToAudioChat" />
+				<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToTextSubtitles" />
 			</errorLevel>
 		</UndefinedClass>
 		<UndefinedDocblockClass>