Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
35b0977
reduce complexity of un_on_flow func
Omswastik-11 Jan 4, 2026
5a4a089
Merge branch 'main' into issue-1580
Omswastik-11 Jan 5, 2026
1a006fb
Merge branch 'main' into pr/1596
fkiraly Jan 7, 2026
93aa877
refactor the helping functions for un_on_flow func
Omswastik-11 Jan 13, 2026
331b4be
Merge branch 'issue-1580' of https://github.com/Omswastik-11/openml-p…
Omswastik-11 Jan 13, 2026
6771fb4
remove redundant checks
Omswastik-11 Jan 13, 2026
22f52a8
Merge branch 'main' into issue-1580
Omswastik-11 Jan 14, 2026
04a6e0f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 14, 2026
d3460d0
Merge branch 'main' into issue-1580
fkiraly Jan 14, 2026
1ec0302
Merge branch 'main' into issue-1580
Omswastik-11 Jan 15, 2026
df9a36a
Merge branch 'main' into issue-1580
Omswastik-11 Jan 26, 2026
cefae00
Merge branch 'main' into issue-1580
Omswastik-11 Feb 4, 2026
8d05331
added the tests
Omswastik-11 Feb 12, 2026
5a683da
correct the tests
Omswastik-11 Feb 12, 2026
c56e49d
Merge branch 'main' into issue-1580
Omswastik-11 Feb 16, 2026
111c134
Merge branch 'main' into issue-1580
Omswastik-11 Feb 17, 2026
c80b6c2
improve error messages and simplify the control flow
Omswastik-11 Feb 17, 2026
cfe048c
merge
Omswastik-11 Feb 17, 2026
68f9d0d
Merge branch 'main' into issue-1580
Omswastik-11 Feb 17, 2026
6d8c129
Merge branch 'main' into issue-1580
Omswastik-11 Feb 25, 2026
6055521
Merge branch 'main' into issue-1580
fkiraly Mar 6, 2026
6c7a996
Merge branch 'main' into issue-1580
Omswastik-11 Apr 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
343 changes: 259 additions & 84 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import OrderedDict
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -53,6 +53,236 @@
ERROR_CODE = 512


def _validate_flow_and_task_inputs(
flow: OpenMLFlow | OpenMLTask,
task: OpenMLTask | OpenMLFlow,
flow_tags: list[str] | None,
) -> tuple[OpenMLFlow, OpenMLTask]:
"""Validate and normalize inputs for flow and task execution.

Parameters
----------
flow : OpenMLFlow or OpenMLTask
The flow object (may be swapped with task for backward compatibility).
task : OpenMLTask or OpenMLFlow
The task object (may be swapped with flow for backward compatibility).
flow_tags : List[str] or None
A list of tags that the flow should have at creation.

Returns
-------
Tuple[OpenMLFlow, OpenMLTask]
The validated flow and task.

Raises
------
ValueError
If flow_tags is not a list or task is not published.
"""
if flow_tags is not None and not isinstance(flow_tags, list):
raise ValueError("flow_tags should be a list")

# TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
# Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
# We want to allow either order of argument (to avoid confusion).
warnings.warn(
"run_flow_on_task: the old argument order (task, flow) is deprecated and "
"will not be supported in the future. Please use the "
"order (flow, task).",
DeprecationWarning,
stacklevel=3,
)
task, flow = flow, task

if not isinstance(flow, OpenMLFlow):
raise TypeError(
f"run_flow_on_task: expected argument 'flow' to be OpenMLFlow, "
f"got {type(flow).__name__}",
)

if not isinstance(task, OpenMLTask):
raise TypeError(
f"run_flow_on_task: expected argument 'task' to be OpenMLTask, "
f"got {type(task).__name__}",
)

if task.task_id is None:
raise ValueError(
"run_flow_on_task: argument 'task.task_id' is None; task must be published on OpenML"
)

return flow, task


def _sync_flow_with_server(
flow: OpenMLFlow,
task: OpenMLTask,
*,
upload_flow: bool,
avoid_duplicate_runs: bool,
) -> int | None:
"""Synchronize flow with server and check if setup/task combination is already present.

Parameters
----------
flow : OpenMLFlow
The flow to synchronize.
task : OpenMLTask
The task to check for duplicate runs.
upload_flow : bool
Whether to upload the flow if it doesn't exist.
avoid_duplicate_runs : bool
Whether to check for duplicate runs.
Comment thread
Omswastik-11 marked this conversation as resolved.

Returns
-------
int or None
The flow_id if synced with server, None otherwise.

Raises
------
PyOpenMLError
If flow_id mismatch or flow doesn't exist when expected.
OpenMLRunsExistError
If duplicate runs exist and avoid_duplicate_runs is True.
"""
# We only need to sync with the server right now if we want to upload the flow,
# or ensure no duplicate runs exist. Otherwise it can be synced at upload time.
flow_id = None
Comment thread
Omswastik-11 marked this conversation as resolved.
if not upload_flow and not avoid_duplicate_runs:
return flow_id

flow_id = flow_exists(flow.name, flow.external_version)
if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
if flow_id is not False:
raise PyOpenMLError(
f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'",
)
raise PyOpenMLError("Flow does not exist on the server, but 'flow.flow_id' is not None.")

if upload_flow and flow_id is False:
flow.publish()
return flow.flow_id

if flow_id:
flow_from_server = get_flow(flow_id)
_copy_server_fields(flow_from_server, flow)
if avoid_duplicate_runs:
flow_from_server.model = flow.model
setup_id = setup_exists(flow_from_server)
ids = run_exists(cast("int", task.task_id), setup_id)
if ids:
error_message = "One or more runs of this setup were already performed on the task."
raise OpenMLRunsExistError(ids, error_message)
return flow_id

# Flow does not exist on server and we do not want to upload it.
# No sync with the server happens.
return None


def _prepare_run_environment(flow: OpenMLFlow) -> tuple[list[str], list[str]]:
"""Prepare run environment information and tags.

Parameters
----------
flow : OpenMLFlow
The flow to get version information from.

Returns
-------
Tuple[List[str], List[str]]
A tuple of (tags, run_environment).
"""
run_environment = flow.extension.get_version_information()
tags = ["openml-python", run_environment[1]]
return tags, run_environment


def _create_run_from_results(  # noqa: PLR0913
    task: OpenMLTask,
    flow: OpenMLFlow,
    flow_id: int | None,
    data_content: list[list],
    trace: OpenMLRunTrace | None,
    fold_evaluations: OrderedDict[str, OrderedDict],
    sample_evaluations: OrderedDict[str, OrderedDict],
    tags: list[str],
    run_environment: list[str],
    upload_flow: bool,
    avoid_duplicate_runs: bool,
) -> OpenMLRun:
    """Assemble an OpenMLRun from the outputs of executing a flow on a task.

    Parameters
    ----------
    task : OpenMLTask
        The task that was executed.
    flow : OpenMLFlow
        The flow that was executed.
    flow_id : int or None
        The flow ID if synced with server.
    data_content : List[List]
        The prediction data content.
    trace : OpenMLRunTrace or None
        The execution trace if available.
    fold_evaluations : OrderedDict
        The fold-based evaluation measures.
    sample_evaluations : OrderedDict
        The sample-based evaluation measures.
    tags : List[str]
        Tags to attach to the run.
    run_environment : List[str]
        Environment information.
    upload_flow : bool
        Whether the flow was uploaded.
    avoid_duplicate_runs : bool
        Whether duplicate runs were checked.

    Returns
    -------
    OpenMLRun
        The created run object.
    """
    description_lines = [
        *run_environment,
        time.strftime("%c"),
        "Created by run_flow_on_task",
    ]

    run = OpenMLRun(
        task_id=cast("int", task.task_id),
        flow_id=flow_id,
        dataset_id=task.get_dataset().dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
        flow=flow,
        setup_string=flow.extension.create_setup_string(flow.model),
        description_text="\n".join(description_lines),
    )

    # Parameter settings can only be extracted after a sync with the server
    # (flow upload or duplicate check); otherwise they are derived at upload time.
    if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
        run.parameter_settings = flow.extension.obtain_parameter_values(flow)

    # Learning-curve tasks carry per-sample evaluations; all others per-fold.
    if task.task_type_id == TaskType.LEARNING_CURVE:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    return run


# TODO(eddiebergman): Could potentially overload this but
# it seems very big to do so
def run_model_on_task( # noqa: PLR0913
Expand Down Expand Up @@ -174,7 +404,7 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask:
return run


def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
def run_flow_on_task( # noqa: PLR0913
flow: OpenMLFlow,
task: OpenMLTask,
avoid_duplicate_runs: bool | None = None,
Expand Down Expand Up @@ -221,116 +451,61 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
run : OpenMLRun
Result of the run.
"""
if flow_tags is not None and not isinstance(flow_tags, list):
raise ValueError("flow_tags should be a list")

if avoid_duplicate_runs is None:
avoid_duplicate_runs = openml.config.avoid_duplicate_runs

# TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
# Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
# We want to allow either order of argument (to avoid confusion).
warnings.warn(
"The old argument order (Flow, model) is deprecated and "
"will not be supported in the future. Please use the "
"order (model, Flow).",
DeprecationWarning,
stacklevel=2,
)
task, flow = flow, task

if task.task_id is None:
raise ValueError("The task should be published at OpenML")
# 1. Validate inputs
flow, task = _validate_flow_and_task_inputs(flow, task, flow_tags)

# 2. Prepare the model
if flow.model is None:
flow.model = flow.extension.flow_to_model(flow)

flow.model = flow.extension.seed_model(flow.model, seed=seed)

# We only need to sync with the server right now if we want to upload the flow,
# or ensure no duplicate runs exist. Otherwise it can be synced at upload time.
flow_id = None
if upload_flow or avoid_duplicate_runs:
flow_id = flow_exists(flow.name, flow.external_version)
if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
if flow_id is not False:
raise PyOpenMLError(
f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'",
)
raise PyOpenMLError(
"Flow does not exist on the server, but 'flow.flow_id' is not None."
)
if upload_flow and flow_id is False:
flow.publish()
flow_id = flow.flow_id
elif flow_id:
flow_from_server = get_flow(flow_id)
_copy_server_fields(flow_from_server, flow)
if avoid_duplicate_runs:
flow_from_server.model = flow.model
setup_id = setup_exists(flow_from_server)
ids = run_exists(task.task_id, setup_id)
if ids:
error_message = (
"One or more runs of this setup were already performed on the task."
)
raise OpenMLRunsExistError(ids, error_message)
else:
# Flow does not exist on server and we do not want to upload it.
# No sync with the server happens.
flow_id = None

dataset = task.get_dataset()
# 3. Sync with server and check for duplicates
flow_id = _sync_flow_with_server(
flow,
task,
upload_flow=upload_flow,
avoid_duplicate_runs=avoid_duplicate_runs,
)

run_environment = flow.extension.get_version_information()
tags = ["openml-python", run_environment[1]]
# 4. Prepare run environment
tags, run_environment = _prepare_run_environment(flow)

# 5. Check if model is already fitted
if flow.extension.check_if_model_fitted(flow.model):
warnings.warn(
"The model is already fitted! This might cause inconsistency in comparison of results.",
RuntimeWarning,
stacklevel=2,
)

# execute the run
res = _run_task_get_arffcontent(
# 6. Execute the run (parallel processing happens here)
data_content, trace, fold_evaluations, sample_evaluations = _run_task_get_arffcontent(
model=flow.model,
task=task,
extension=flow.extension,
add_local_measures=add_local_measures,
n_jobs=n_jobs,
)
Comment on lines +485 to 491
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description mentions introducing a _RunResults NamedTuple to bundle execution outputs and reduce long parameter lists, but this NamedTuple is not present in the actual implementation. The function _run_task_get_arffcontent still returns a tuple that is unpacked directly in line 486. If the NamedTuple was intended but not implemented, consider either updating the PR description to match the implementation or implementing the NamedTuple as described.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Omswastik-11 update the PR description to remove this part.


data_content, trace, fold_evaluations, sample_evaluations = res
fields = [*run_environment, time.strftime("%c"), "Created by run_flow_on_task"]
generated_description = "\n".join(fields)
run = OpenMLRun(
task_id=task.task_id,
# 7. Create run from results
run = _create_run_from_results(
task=task,
flow=flow,
flow_id=flow_id,
dataset_id=dataset.dataset_id,
Comment thread
Omswastik-11 marked this conversation as resolved.
model=flow.model,
flow_name=flow.name,
tags=tags,
trace=trace,
data_content=data_content,
flow=flow,
setup_string=flow.extension.create_setup_string(flow.model),
description_text=generated_description,
trace=trace,
fold_evaluations=fold_evaluations,
sample_evaluations=sample_evaluations,
tags=tags,
run_environment=run_environment,
upload_flow=upload_flow,
avoid_duplicate_runs=avoid_duplicate_runs,
)

if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
# We only extract the parameter settings if a sync happened with the server.
# I.e. when the flow was uploaded or we found it in the avoid_duplicate check.
# Otherwise, we will do this at upload time.
run.parameter_settings = flow.extension.obtain_parameter_values(flow)

# now we need to attach the detailed evaluations
if task.task_type_id == TaskType.LEARNING_CURVE:
run.sample_evaluations = sample_evaluations
else:
run.fold_evaluations = fold_evaluations

# 8. Log completion message
if flow_id:
message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}"
else:
Comment thread
Omswastik-11 marked this conversation as resolved.
Expand Down
Loading
Loading