diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 80b30dc2..107f9332 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -15,6 +15,7 @@
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType;
use OCA\OpenAi\TaskProcessing\AudioToTextEnhancedProvider;
use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
+use OCA\OpenAi\TaskProcessing\AudioToTextSubtitlesProvider;
use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
use OCA\OpenAi\TaskProcessing\ContextWriteProvider;
use OCA\OpenAi\TaskProcessing\EmojiProvider;
@@ -51,6 +52,10 @@ class Application extends App implements IBootstrap {
'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable',
'onyx', 'nova', 'sage', 'shimmer', 'verse'
];
+ public const DEFAULT_SUBTITLE_FORMAT = 'srt';
+ public const DEFAULT_SUBTITLE_FORMATS = [
+ 'srt', 'vtt'
+ ];
public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024';
public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10;
public const DEFAULT_CHUNK_SIZE = 10000;
@@ -116,6 +121,9 @@ public function register(IRegistrationContext $context): void {
}
if ($sttProviderEnabled) {
$context->registerTaskProcessingProvider(AudioToTextProvider::class);
+ if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToTextSubtitles')) {
+ $context->registerTaskProcessingProvider(AudioToTextSubtitlesProvider::class);
+ }
if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextReformatParagraphs')) {
$context->registerTaskProcessingProvider(AudioToTextEnhancedProvider::class);
}
diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
index bc798a68..1cca0094 100644
--- a/lib/Service/OpenAiAPIService.php
+++ b/lib/Service/OpenAiAPIService.php
@@ -865,6 +865,7 @@ public function transcribeBase64Mp3(
* @param bool $translate
* @param string $model
* @param string $language
+ * @param string $responseFormat
* @return string
* @throws Exception
*/
@@ -874,9 +875,10 @@ public function transcribeFile(
bool $translate = false,
string $model = Application::DEFAULT_MODEL_ID,
string $language = 'default',
+ string $responseFormat = 'verbose_json',
): string {
try {
- $transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language);
+ $transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language, $responseFormat);
} catch (NotPermittedException|LockedException|GenericFileException $e) {
$this->logger->warning('Could not read audio file: ' . $file->getPath() . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
throw new Exception($this->l10n->t('Could not read audio file.'), Http::STATUS_INTERNAL_SERVER_ERROR);
@@ -891,6 +893,7 @@ public function transcribeFile(
* @param bool $translate
* @param string $model
* @param string $language
+ * @param string $responseFormat
* @return string
* @throws Exception
*/
@@ -900,6 +903,7 @@ public function transcribe(
bool $translate = true,
string $model = Application::DEFAULT_MODEL_ID,
string $language = 'default',
+ string $responseFormat = 'verbose_json', // Verbose needed for extraction of audio duration
): string {
if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TRANSCRIPTION)) {
throw new Exception($this->l10n->t('Audio transcription quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -912,8 +916,7 @@ public function transcribe(
$params = [
'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_TRANSCRIPTION_MODEL_ID : $model,
'file' => $audioFileContent,
- 'response_format' => 'verbose_json',
- // Verbose needed for extraction of audio duration
+ 'response_format' => $responseFormat,
];
// Gets the user's preferred language if it's not the default one
if ($language === 'default') {
@@ -927,6 +930,34 @@ public function transcribe(
$response = $this->request($userId, $endpoint, $params, 'POST', $contentType, serviceType: Application::SERVICE_TYPE_STT);
+ if (in_array($responseFormat, Application::DEFAULT_SUBTITLE_FORMATS)) {
+ if (!isset($response['body'])) {
+ $this->logger->warning('Audio subtitling error: ' . json_encode($response));
+ throw new Exception($this->l10n->t('Unknown audio subtitling error'), Http::STATUS_INTERNAL_SERVER_ERROR);
+ }
+
+ // Extract audio duration from response and store it as quota usage:
+ $matches = [];
+ $isMatch = preg_match_all('/(\d\d):(\d\d):(\d\d)[\.,](\d\d\d)/', $response['body'], $matches, PREG_SET_ORDER);
+
+ if ($isMatch !== false && $isMatch > 0) {
+ $lastTimestamp = end($matches);
+ $hours = intval($lastTimestamp[1]);
+ $minutes = intval($lastTimestamp[2]);
+ $seconds = intval($lastTimestamp[3]);
+ $millisecondAdjustment = intval(round(floatval($lastTimestamp[4]) / 1000.0));
+ $audioDuration = ($hours * 3600) + ($minutes * 60) + $seconds + $millisecondAdjustment;
+
+ try {
+ $this->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_TRANSCRIPTION, $audioDuration);
+ } catch (DBException $e) {
+ $this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_TRANSCRIPTION . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
+ }
+ }
+
+ return $response['body'];
+ }
+
if (!isset($response['text'])) {
$this->logger->warning('Audio transcription error: ' . json_encode($response));
throw new Exception($this->l10n->t('Unknown audio trancription error'), Http::STATUS_INTERNAL_SERVER_ERROR);
diff --git a/lib/TaskProcessing/AudioToTextSubtitlesProvider.php b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php
new file mode 100644
index 00000000..864e87c7
--- /dev/null
+++ b/lib/TaskProcessing/AudioToTextSubtitlesProvider.php
@@ -0,0 +1,168 @@
+openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT);
+ }
+
+	/**
+	 * Identify the task type this provider handles.
+	 *
+	 * @return string The ID constant of the AudioToTextSubtitles task type
+	 */
+	public function getTaskTypeId(): string {
+		$taskTypeId = AudioToTextSubtitles::ID;
+		return $taskTypeId;
+	}
+
+	/**
+	 * Report the expected runtime of a task handled by this provider.
+	 *
+	 * The value is whatever the API service reports as its expected text
+	 * processing time.
+	 * NOTE(review): the unit is not visible here (presumably seconds) - confirm
+	 * against OpenAiAPIService::getExpTextProcessingTime().
+	 *
+	 * @return int
+	 */
+	public function getExpectedRuntime(): int {
+		$expectedRuntime = $this->openAiAPIService->getExpTextProcessingTime();
+		return $expectedRuntime;
+	}
+
+	/**
+	 * Enum values for the mandatory input shape.
+	 *
+	 * @return array Always empty: this provider declares none
+	 */
+	public function getInputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Default values for the mandatory input shape.
+	 *
+	 * @return array Always empty: this provider declares none
+	 */
+	public function getInputShapeDefaults(): array {
+		return [];
+	}
+
+	/**
+	 * Describe the optional inputs accepted by this provider.
+	 *
+	 * Two enum-typed slots are declared: 'language' (language of the audio)
+	 * and 'format' (format of the generated subtitles file).
+	 *
+	 * @return array<string, ShapeDescriptor> Descriptors keyed by input name
+	 */
+	public function getOptionalInputShape(): array {
+		$languageDescriptor = new ShapeDescriptor(
+			$this->l->t('Language'),
+			$this->l->t('The language of the audio file'),
+			EShapeType::Enum
+		);
+		$formatDescriptor = new ShapeDescriptor(
+			$this->l->t('File format'),
+			$this->l->t('The format of the subtitles file'),
+			EShapeType::Enum
+		);
+
+		return [
+			'language' => $languageDescriptor,
+			'format' => $formatDescriptor,
+		];
+	}
+
+	/**
+	 * List the selectable values for the optional enum inputs.
+	 *
+	 * 'language' offers "Detect language" and "Default" first, followed by one
+	 * entry per item of Application::LANGUAGE_CODES_AND_ENDONYMS (element 1 is
+	 * used as the label, element 0 as the value). 'format' offers SRT and WebVTT.
+	 *
+	 * @return array<string, ShapeEnumValue[]> Enum values keyed by input name
+	 */
+	public function getOptionalInputShapeEnumValues(): array {
+		$knownLanguages = array_map(
+			static fn (array $language) => new ShapeEnumValue($language[1], $language[0]),
+			Application::LANGUAGE_CODES_AND_ENDONYMS
+		);
+		$specialLanguages = [
+			new ShapeEnumValue($this->l->t('Detect language'), 'detect_language'),
+			new ShapeEnumValue($this->l->t('Default'), 'default'),
+		];
+		$subtitleFormats = [
+			new ShapeEnumValue($this->l->t('SubRip Text'), 'srt'),
+			new ShapeEnumValue($this->l->t('WebVTT'), 'vtt'),
+		];
+
+		return [
+			'language' => array_merge($specialLanguages, $knownLanguages),
+			'format' => $subtitleFormats,
+		];
+	}
+
+	/**
+	 * Default values for the optional inputs.
+	 *
+	 * The subtitle format default is taken from
+	 * Application::DEFAULT_SUBTITLE_FORMAT so that it cannot drift from the
+	 * app-wide constant.
+	 *
+	 * @return array<string, string> Defaults keyed by input name
+	 */
+	public function getOptionalInputShapeDefaults(): array {
+		return [
+			'language' => 'default',
+			'format' => Application::DEFAULT_SUBTITLE_FORMAT,
+		];
+	}
+
+	/**
+	 * Enum values for the mandatory output shape.
+	 *
+	 * @return array Always empty: this provider declares none
+	 */
+	public function getOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Optional output slots produced by this provider.
+	 *
+	 * @return array Always empty: this provider declares none
+	 */
+	public function getOptionalOutputShape(): array {
+		return [];
+	}
+
+	/**
+	 * Enum values for the optional output shape.
+	 *
+	 * @return array Always empty: this provider declares none
+	 */
+	public function getOptionalOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Transcribe an audio file into a subtitles document.
+	 *
+	 * Validates the input file (readability, size, MIME type), the optional
+	 * 'language' and 'format' inputs, then delegates to
+	 * OpenAiAPIService::transcribeFile() with the subtitle format as the
+	 * requested response format.
+	 *
+	 * @param string|null $userId User the task runs for; forwarded to the API service
+	 * @param array $input Task input: 'input' (audio File), optional 'language' and 'format'
+	 * @param callable $reportProgress Progress callback (not used by this provider)
+	 * @return array{output: string} The text returned by the transcription call
+	 * @throws UserFacingProcessingException If the file is too large or is not an audio file
+	 * @throws ProcessingException If the input is invalid or the transcription fails
+	 */
+	public function process(?string $userId, array $input, callable $reportProgress): array {
+		$inputFile = $input['input'] ?? null;
+		if (!$inputFile instanceof File || !$inputFile->isReadable()) {
+			throw new ProcessingException('Invalid input file');
+		}
+
+		$fileSize = intval($inputFile->getSize());
+		// Maximum file size for OpenAI is 25MB. (https://developers.openai.com/api/docs/guides/speech-to-text)
+		if ($fileSize > 25 * 1000 * 1000) {
+			throw new UserFacingProcessingException(
+				'Filesize of input is too large. Max is 25MB',
+				0,
+				null,
+				$this->l->t('The input file size is too large. A maximum of 25MB is allowed.'),
+			);
+		}
+
+		$fileType = $inputFile->getMimeType();
+		if (!str_starts_with($fileType, 'audio/')) {
+			throw new UserFacingProcessingException(
+				'Invalid input file type ' . $fileType,
+				0,
+				null,
+				$this->l->t('The input file type is invalid. Only audio files are allowed.'),
+			);
+		}
+		if ($this->openAiAPIService->isUsingOpenAi()) {
+			// Stricter allow-list applied only when the backend is OpenAI itself
+			$validFileTypes = [
+				'audio/mp3',
+				'audio/mp4',
+				'audio/mpeg',
+				'audio/mpga',
+				'audio/m4a',
+				'audio/wav',
+				'audio/webm',
+			];
+			if (!in_array($fileType, $validFileTypes, true)) {
+				throw new ProcessingException('Invalid input file type for OpenAI ' . $fileType);
+			}
+		}
+
+		$language = $input['language'] ?? 'default';
+		if (!is_string($language)) {
+			throw new ProcessingException('Invalid language');
+		}
+
+		// 'format' is an optional input: fall back to the app default when it is
+		// absent and reject anything that is not a known subtitle format, so that
+		// only values handled by the subtitle branch of transcribe() reach the API.
+		$format = $input['format'] ?? Application::DEFAULT_SUBTITLE_FORMAT;
+		if (!is_string($format) || !in_array($format, Application::DEFAULT_SUBTITLE_FORMATS, true)) {
+			throw new ProcessingException('Invalid subtitle format');
+		}
+
+		$model = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID;
+
+		try {
+			$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model, $language, $format);
+			return ['output' => $transcription];
+		} catch (UserFacingProcessingException $e) {
+			// Already carries a user-facing message: let it through untouched
+			throw $e;
+		} catch (\Throwable $e) {
+			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			// Keep the original exception attached as "previous" for debugging
+			throw new ProcessingException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), 0, $e);
+		}
+	}
+}
diff --git a/psalm.xml b/psalm.xml
index 021f01c9..a3832f2d 100644
--- a/psalm.xml
+++ b/psalm.xml
@@ -48,6 +48,7 @@
+