Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType;
use OCA\OpenAi\TaskProcessing\AudioToTextEnhancedProvider;
use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
use OCA\OpenAi\TaskProcessing\AudioToTextSubtitlesProvider;
use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
use OCA\OpenAi\TaskProcessing\ContextWriteProvider;
use OCA\OpenAi\TaskProcessing\EmojiProvider;
Expand Down Expand Up @@ -51,6 +52,10 @@ class Application extends App implements IBootstrap {
'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable',
'onyx', 'nova', 'sage', 'shimmer', 'verse'
];
public const DEFAULT_SUBTITLE_FORMAT = 'srt';

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`DEFAULT_SUBTITLE_FORMAT` seems to be unused: nothing in this change references it.

public const DEFAULT_SUBTITLE_FORMATS = [
'srt', 'vtt'
];
Comment on lines +56 to +58

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: how does SUPPORTED_SUBTITLE_FORMATS sound? Default would imply it can be changed.

public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024';
public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10;
public const DEFAULT_CHUNK_SIZE = 10000;
Expand Down Expand Up @@ -116,6 +121,9 @@ public function register(IRegistrationContext $context): void {
}
if ($sttProviderEnabled) {
$context->registerTaskProcessingProvider(AudioToTextProvider::class);
if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToTextSubtitles')) {
Comment thread
julien-nc marked this conversation as resolved.
$context->registerTaskProcessingProvider(AudioToTextSubtitlesProvider::class);
}
if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextReformatParagraphs')) {
$context->registerTaskProcessingProvider(AudioToTextEnhancedProvider::class);
}
Expand Down
37 changes: 34 additions & 3 deletions lib/Service/OpenAiAPIService.php
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,7 @@ public function transcribeBase64Mp3(
* @param bool $translate
* @param string $model
* @param string $language
* @param string $responseFormat
* @return string
* @throws Exception
*/
Expand All @@ -874,9 +875,10 @@ public function transcribeFile(
bool $translate = false,
string $model = Application::DEFAULT_MODEL_ID,
string $language = 'default',
string $responseFormat = 'verbose_json',
): string {
try {
$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language);
$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language, $responseFormat);
} catch (NotPermittedException|LockedException|GenericFileException $e) {
$this->logger->warning('Could not read audio file: ' . $file->getPath() . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
throw new Exception($this->l10n->t('Could not read audio file.'), Http::STATUS_INTERNAL_SERVER_ERROR);
Expand All @@ -891,6 +893,7 @@ public function transcribeFile(
* @param bool $translate
* @param string $model
* @param string $language
* @param string $responseFormat
* @return string
* @throws Exception
*/
Expand All @@ -900,6 +903,7 @@ public function transcribe(
bool $translate = true,
string $model = Application::DEFAULT_MODEL_ID,
string $language = 'default',
string $responseFormat = 'verbose_json', // Verbose needed for extraction of audio duration
): string {
if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TRANSCRIPTION)) {
throw new Exception($this->l10n->t('Audio transcription quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
Expand All @@ -912,8 +916,7 @@ public function transcribe(
$params = [
'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_TRANSCRIPTION_MODEL_ID : $model,
'file' => $audioFileContent,
'response_format' => 'verbose_json',
// Verbose needed for extraction of audio duration
'response_format' => $responseFormat,
];
// Gets the user's preferred language if it's not the default one
if ($language === 'default') {
Expand All @@ -927,6 +930,34 @@ public function transcribe(

$response = $this->request($userId, $endpoint, $params, 'POST', $contentType, serviceType: Application::SERVICE_TYPE_STT);

if (in_array($responseFormat, Application::DEFAULT_SUBTITLE_FORMATS)) {
if (!isset($response['body'])) {
$this->logger->warning('Audio subtitling error: ' . json_encode($response));
throw new Exception($this->l10n->t('Unknown audio subtitling error'), Http::STATUS_INTERNAL_SERVER_ERROR);
}

// Extract audio duration from response and store it as quota usage:
$matches = [];
$isMatch = preg_match_all('/(\d\d):(\d\d):(\d\d)[\.,](\d\d\d)/', $response['body'], $matches, PREG_SET_ORDER);

if ($isMatch !== false && $isMatch > 0) {
$lastTimestamp = end($matches);
$hours = intval($lastTimestamp[1]);
$minutes = intval($lastTimestamp[2]);
$seconds = intval($lastTimestamp[3]);
$millisecondAdjustment = intval(round(floatval($lastTimestamp[4]) / 1000.0));
$audioDuration = ($hours * 3600) + ($minutes * 60) + $seconds + $millisecondAdjustment;

try {
$this->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_TRANSCRIPTION, $audioDuration);
} catch (DBException $e) {
$this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_TRANSCRIPTION . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
}
}

return $response['body'];
}

if (!isset($response['text'])) {
$this->logger->warning('Audio transcription error: ' . json_encode($response));
throw new Exception($this->l10n->t('Unknown audio trancription error'), Http::STATUS_INTERNAL_SERVER_ERROR);
Expand Down
168 changes: 168 additions & 0 deletions lib/TaskProcessing/AudioToTextSubtitlesProvider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\OpenAi\TaskProcessing;

use OCA\OpenAi\AppInfo\Application;
use OCA\OpenAi\Service\OpenAiAPIService;
use OCP\Files\File;
use OCP\IAppConfig;
use OCP\IL10N;
use OCP\TaskProcessing\EShapeType;
use OCP\TaskProcessing\Exception\ProcessingException;
use OCP\TaskProcessing\Exception\UserFacingProcessingException;
use OCP\TaskProcessing\ISynchronousProvider;
use OCP\TaskProcessing\ShapeDescriptor;
use OCP\TaskProcessing\ShapeEnumValue;
use OCP\TaskProcessing\TaskTypes\AudioToTextSubtitles;
use Psr\Log\LoggerInterface;

/**
 * Synchronous task processing provider for the AudioToTextSubtitles task type.
 *
 * It validates the incoming audio file and the optional "language" and "format"
 * inputs, then delegates to OpenAiAPIService::transcribeFile() and returns the
 * resulting subtitles text as the task output.
 */
class AudioToTextSubtitlesProvider implements ISynchronousProvider {

	/** Largest accepted input file in bytes (25 MB, decimal), checked before calling the service. */
	private const MAX_FILE_SIZE = 25 * 1000 * 1000;

	/** MIME types accepted when OpenAiAPIService::isUsingOpenAi() reports true. */
	private const OPENAI_MIME_TYPES = [
		'audio/mp3',
		'audio/mp4',
		'audio/mpeg',
		'audio/mpga',
		'audio/m4a',
		'audio/wav',
		'audio/webm',
	];

	public function __construct(
		private OpenAiAPIService $openAiAPIService,
		private LoggerInterface $logger,
		private IAppConfig $appConfig,
		private IL10N $l,
	) {
	}

	/**
	 * @return string Provider ID: the app ID suffixed with "-audio2text-subtitles"
	 */
	public function getId(): string {
		return Application::APP_ID . '-audio2text-subtitles';
	}

	/**
	 * @return string Name reported by the API service for the speech-to-text service type
	 */
	public function getName(): string {
		return $this->openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT);
	}

	/**
	 * @return string ID of the task type handled by this provider
	 */
	public function getTaskTypeId(): string {
		return AudioToTextSubtitles::ID;
	}

	/**
	 * @return int Value of OpenAiAPIService::getExpTextProcessingTime()
	 */
	public function getExpectedRuntime(): int {
		// NOTE(review): this reuses the text processing estimate; confirm it is the intended figure for audio
		return $this->openAiAPIService->getExpTextProcessingTime();
	}

	public function getInputShapeEnumValues(): array {
		return [];
	}

	public function getInputShapeDefaults(): array {
		return [];
	}

	/**
	 * Declares the two optional inputs: the audio language and the subtitles file format.
	 *
	 * @return array<string, ShapeDescriptor>
	 */
	public function getOptionalInputShape(): array {
		return [
			'language' => new ShapeDescriptor(
				$this->l->t('Language'),
				$this->l->t('The language of the audio file'),
				EShapeType::Enum
			),
			'format' => new ShapeDescriptor(
				$this->l->t('File format'),
				$this->l->t('The format of the subtitles file'),
				EShapeType::Enum
			),
		];
	}

	/**
	 * Lists the selectable values for the optional inputs.
	 *
	 * "language" offers "detect_language" and "default" followed by every entry of
	 * Application::LANGUAGE_CODES_AND_ENDONYMS; "format" offers "srt" and "vtt".
	 *
	 * @return array<string, list<ShapeEnumValue>>
	 */
	public function getOptionalInputShapeEnumValues(): array {
		$languageEnumValues = array_map(static function (array $language) {
			return new ShapeEnumValue($language[1], $language[0]);
		}, Application::LANGUAGE_CODES_AND_ENDONYMS);
		$detectLanguageEnumValue = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language');
		$defaultLanguageEnumValue = new ShapeEnumValue($this->l->t('Default'), 'default');
		return [
			'language' => array_merge([$detectLanguageEnumValue, $defaultLanguageEnumValue], $languageEnumValues),
			'format' => [
				new ShapeEnumValue($this->l->t('SubRip Text'), 'srt'),
				new ShapeEnumValue($this->l->t('WebVTT'), 'vtt'),
			],
		];
	}

	/**
	 * @return array<string, string> Default value for each optional input
	 */
	public function getOptionalInputShapeDefaults(): array {
		return [
			'language' => 'default',
			'format' => Application::DEFAULT_SUBTITLE_FORMAT,
		];
	}

	public function getOutputShapeEnumValues(): array {
		return [];
	}

	public function getOptionalOutputShape(): array {
		return [];
	}

	public function getOptionalOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * Generates subtitles for the audio file given in $input['input'].
	 *
	 * @param string|null $userId User the request is made for, passed through to the API service
	 * @param array $input Task input: "input" (readable File), optional "language" and "format"
	 * @param callable $reportProgress Progress callback; not used by this provider
	 * @return array{output: string} The subtitles text returned by the API service
	 * @throws UserFacingProcessingException If the file is too large or is not an audio file
	 * @throws ProcessingException If the input is invalid or the transcription fails
	 */
	public function process(?string $userId, array $input, callable $reportProgress): array {
		$inputFile = $input['input'] ?? null;
		if (!$inputFile instanceof File || !$inputFile->isReadable()) {
			throw new ProcessingException('Invalid input file');
		}
		$this->assertFileIsAcceptable($inputFile);

		// "format" is an optional input, so it may be absent: fall back to the default instead of
		// reading an undefined key, and reject values the subtitles code path cannot handle.
		$format = $input['format'] ?? Application::DEFAULT_SUBTITLE_FORMAT;
		if (!is_string($format) || !in_array($format, Application::DEFAULT_SUBTITLE_FORMATS, true)) {
			throw new ProcessingException('Invalid subtitle format');
		}

		$language = $input['language'] ?? 'default';
		if (!is_string($language)) {
			throw new ProcessingException('Invalid language');
		}

		// An empty configured value falls back to the default model ID
		$model = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID;

		try {
			$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model, $language, $format);
			return ['output' => $transcription];
		} catch (UserFacingProcessingException $e) {
			throw $e;
		} catch (\Throwable $e) {
			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
			// Keep the original exception attached so the root cause is not lost
			throw new ProcessingException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), 0, $e);
		}
	}

	/**
	 * Checks the size and MIME type of the input file.
	 *
	 * @param File $inputFile The file to check
	 * @throws UserFacingProcessingException If the file exceeds MAX_FILE_SIZE or is not of an "audio/" type
	 * @throws ProcessingException If OpenAI is in use and the MIME type is not in OPENAI_MIME_TYPES
	 */
	private function assertFileIsAcceptable(File $inputFile): void {
		$fileSize = intval($inputFile->getSize());
		// Maximum file size for OpenAI is 25MB. (https://developers.openai.com/api/docs/guides/speech-to-text)
		if ($fileSize > self::MAX_FILE_SIZE) {
			throw new UserFacingProcessingException(
				'Filesize of input is too large. Max is 25MB',
				0,
				null,
				$this->l->t('The input file size is too large. A maximum of 25MB is allowed.'),
			);
		}

		$fileType = $inputFile->getMimeType();
		if (!str_starts_with($fileType, 'audio/')) {
			throw new UserFacingProcessingException(
				'Invalid input file type ' . $fileType,
				0,
				null,
				$this->l->t('The input file type is invalid. Only audio files are allowed.'),
			);
		}

		// The stricter allow-list only applies when talking to OpenAI itself
		if ($this->openAiAPIService->isUsingOpenAi() && !in_array($fileType, self::OPENAI_MIME_TYPES, true)) {
			throw new ProcessingException('Invalid input file type for OpenAI ' . $fileType);
		}
	}
}
1 change: 1 addition & 0 deletions psalm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextReformatParagraphs" />
<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextChatWithTools" />
<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToAudioChat" />
<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToTextSubtitles" />
</errorLevel>
</UndefinedClass>
<UndefinedDocblockClass>
Expand Down
Loading