Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions docs/portfolio/agent_runtime_reliability_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ Future hardening, not current completion:
AIGuard `guard_analysis` also includes `sustained_overload_risk`, which Lab
preserves as report evidence and reflects in the agent deployment decision
context.
Newer AIGuard guard analysis can also include `stale_frame_risk` or preserved
`edgeenv_orchestrator_stale_drop_summary` evidence. Lab surfaces the stale-drop
count, stale-drop rate, affected tasks, reason counts, and reason classes in
the AIGuard Orchestrator Operation Evidence section as deployment review
context.

The report also preserves the Orchestrator operation-health fields added for
runtime operation review:
Expand Down Expand Up @@ -151,8 +156,9 @@ runtime operation review:
- AIGuard Orchestrator operation evidence, including
`worker_health_degradation` and `scheduler_delay_pattern` when Orchestrator
worker health or runtime event telemetry is analyzed by AIGuard.
Lab preserves health reasons, policy/drop reason counts, and scheduler delay
counts as deployment context without making AIGuard the final decision owner.
Lab preserves health reasons, policy/drop/stale-drop reason counts, scheduler
delay counts, stale-drop affected tasks, and stale-drop boundary markers as
deployment context without making AIGuard the final decision owner.

These fields make the report path explicit:

Expand Down Expand Up @@ -204,8 +210,10 @@ The report also surfaces Orchestrator operation guard evidence as context. For
example, `worker_health_degradation` shows degraded/constrained worker reasons
such as fallback policy use or dropped frames, while `scheduler_delay_pattern`
shows scheduler delay counts and related policy/drop reasons. These evidence
items contribute through AIGuard's overall guard verdict and remain separate
from Lab's final policy ownership.
shows scheduler delay counts and related policy/drop reasons.
`stale_frame_risk` shows which tasks had stale/backlog drops and why. These
evidence items contribute through AIGuard's overall guard verdict and remain
separate from Lab's final policy ownership.

## Boundary

Expand Down
55 changes: 52 additions & 3 deletions examples/agent_runtime/aiguard_runtime_guard_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,21 +55,70 @@
"queue_depth_sample_count": 1,
"latency_sample_count": 1
}
},
{
"type": "stale_frame_risk",
"metric_name": "stale_drop_rate",
"observed_value": 0.714,
"baseline_value": 0,
"threshold": 0.2,
"delta": null,
"delta_pct": null,
"increase_factor": null,
"severity": "high",
"status": "failed",
"explanation": "Orchestrator reported 5 stale/backlog drop events.",
"why_it_matters": "Stale frame or backlog drops can protect high-priority work, but they also show that lower-priority Vision or command workloads may lose fresh inputs under sustained multi-agent load.",
"suspected_causes": [
"load_shedding_context",
"stale_queue_overflow"
],
"recommendation": "Review tasks_with_stale_drop, stale_drop_reasons, queue depth, producer rate, and fallback policy in Lab before treating the operation profile as stable.",
"raw_context": {
"stale_drop_count": 5,
"total_drop_count": 7,
"stale_drop_rate": 0.714,
"stale_drop_reasons": {
"load_shedding_backlog_threshold_exceeded": 3,
"queue_overflow_drop_oldest": 2
},
"stale_drop_reason_classes": [
"load_shedding_stale_backlog",
"stale_queue_overflow"
],
"tasks_with_stale_drop": [
"vision_agent",
"voice_command_agent"
],
"latest_stale_drop_event": {
"task": "voice_command_agent",
"agent_id": "voice_command_agent",
"reason": "queue_overflow_drop_oldest",
"stale_drop_class": "stale_queue_overflow"
},
"decision_owner": "lab",
"scheduler_owner": "orchestrator",
"not_a_deployment_decision": true
}
}
],
"suspected_causes": [
"queue_backlog",
"overload_load_shedding",
"producer_rate_exceeds_runtime_capacity",
"sustained_multi_agent_overload"
"sustained_multi_agent_overload",
"stale_queue_overflow"
],
"recommendations": [
"Tune target FPS, queue size, drop policy, or fallback policy for affected agents.",
"Lower producer rate, tighten stale-frame drop policy, or move lower priority work behind a fallback path before deployment."
"Lower producer rate, tighten stale-frame drop policy, or move lower priority work behind a fallback path before deployment.",
"Review stale drop reasons and affected agent workloads in Lab."
],
"thresholds": {
"drop_rate_review": 0.2,
"drop_rate_blocked": 0.5
"drop_rate_blocked": 0.5,
"stale_drop_rate_review": 0.2,
"stale_drop_rate_blocked": 0.5
},
"baseline_summary": {},
"candidate_summary": {
Expand Down
4 changes: 4 additions & 0 deletions inferedgelab/commands/agent_runtime_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def _text_summary(report: dict) -> str:
remote_runtime_event_summary = (
remote_context.get("remote_runtime_event_summary") or {}
)
orchestrator_guard = report.get("orchestrator_operation_guard_summary") or {}
edgeenv_context = (
report["agent_runtime_summary"].get("edgeenv_preservation_context") or {}
)
Expand Down Expand Up @@ -113,6 +114,9 @@ def _text_summary(report: dict) -> str:
f"remote_runtime_event_count: {remote_runtime_event_summary.get('runtime_event_count')}",
f"remote_runtime_event_final_status: {remote_runtime_event_summary.get('final_status')}",
f"remote_runtime_summary_boundary: {remote_runtime_event_summary.get('operation_boundary')}",
f"stale_drop_count: {orchestrator_guard.get('stale_drop_count')}",
f"stale_drop_rate: {orchestrator_guard.get('stale_drop_rate')}",
f"tasks_with_stale_drop: {', '.join(orchestrator_guard.get('tasks_with_stale_drop') or [])}",
f"edgeenv_run_id: {edgeenv_context.get('run_id')}",
f"edgeenv_runtime_operation_health: {edgeenv_context.get('runtime_operation_health_reason')}",
f"edgeenv_runtime_operation_action: {edgeenv_context.get('runtime_operation_recommended_action')}",
Expand Down
97 changes: 97 additions & 0 deletions inferedgelab/services/agent_runtime_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
# Evidence ``type`` values treated as Orchestrator operation guard evidence.
# NOTE(review): presumably used to pick matching items out of the AIGuard
# guard-analysis evidence list -- confirm against the call sites.
ORCHESTRATOR_OPERATION_GUARD_EVIDENCE_TYPES = {
    # Evidence types named directly.
    "worker_health_degradation",
    "scheduler_delay_pattern",
    "operation_timeline_summary",
    "stale_frame_risk",
    # ``edgeenv_``-prefixed counterparts.
    "edgeenv_orchestrator_operation_timeline_summary",
    "edgeenv_orchestrator_stale_drop_summary",
}

DEFAULT_AGENT_RUNTIME_THRESHOLDS = {
Expand Down Expand Up @@ -877,6 +881,11 @@ def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str:
f"| policy_decision_reasons | {_fmt_mapping(orchestrator_guard.get('policy_decision_reason_counts'))} |",
f"| drop_reasons | {_fmt_mapping(orchestrator_guard.get('drop_reason_counts'))} |",
f"| scheduler_delay_event_count | {_fmt_number(orchestrator_guard.get('scheduler_delay_event_count'))} |",
f"| stale_drop_count | {_fmt_number(orchestrator_guard.get('stale_drop_count'))} |",
f"| stale_drop_rate | {_fmt_number(orchestrator_guard.get('stale_drop_rate'))} |",
f"| tasks_with_stale_drop | {', '.join(orchestrator_guard.get('tasks_with_stale_drop') or []) or '-'} |",
f"| stale_drop_reasons | {_fmt_mapping(orchestrator_guard.get('stale_drop_reasons'))} |",
f"| stale_drop_reason_classes | {', '.join(orchestrator_guard.get('stale_drop_reason_classes') or []) or '-'} |",
"",
"Orchestrator operation guard evidence:",
"",
Expand Down Expand Up @@ -1494,7 +1503,12 @@ def _orchestrator_operation_guard_summary(
health_reason_counts: dict[str, int] = {}
policy_decision_reason_counts: dict[str, int] = {}
drop_reason_counts: dict[str, int] = {}
stale_drop_reason_counts: dict[str, int] = {}
stale_drop_reason_classes: list[str] = []
tasks_with_stale_drop: list[str] = []
scheduler_delay_event_count = 0.0
stale_drop_count = 0.0
stale_drop_rate = 0.0
for item in evidence:
detail = _orchestrator_operation_guard_detail(item)
health_reason_counts = _merge_count_maps(
Expand All @@ -1509,10 +1523,34 @@ def _orchestrator_operation_guard_summary(
drop_reason_counts,
detail.get("drop_reason_counts"),
)
stale_drop_reason_counts = _merge_count_maps(
stale_drop_reason_counts,
detail.get("stale_drop_reasons"),
)
stale_drop_reason_classes = _unique_strings(
[
*stale_drop_reason_classes,
*_string_list(detail.get("stale_drop_reason_classes")),
]
)
tasks_with_stale_drop = _unique_strings(
[
*tasks_with_stale_drop,
*_string_list(detail.get("tasks_with_stale_drop")),
]
)
scheduler_delay_event_count = max(
scheduler_delay_event_count,
_non_negative_number(detail.get("scheduler_delay_event_count")),
)
stale_drop_count = max(
stale_drop_count,
_non_negative_number(detail.get("stale_drop_count")),
)
stale_drop_rate = max(
stale_drop_rate,
_non_negative_number(detail.get("stale_drop_rate")),
)
metric_context = metrics if isinstance(metrics, dict) else {}
if not policy_decision_reason_counts:
policy_decision_reason_counts = _count_mapping(
Expand Down Expand Up @@ -1547,6 +1585,11 @@ def _orchestrator_operation_guard_summary(
"policy_decision_reason_counts": policy_decision_reason_counts,
"drop_reason_counts": drop_reason_counts,
"scheduler_delay_event_count": scheduler_delay_event_count,
"stale_drop_count": stale_drop_count,
"stale_drop_rate": stale_drop_rate,
"stale_drop_reasons": stale_drop_reason_counts,
"stale_drop_reason_classes": stale_drop_reason_classes,
"tasks_with_stale_drop": tasks_with_stale_drop,
"evidence": [
{
"type": item.get("type"),
Expand Down Expand Up @@ -1575,6 +1618,7 @@ def _orchestrator_operation_guard_detail(
health_reason_counts = {}
if isinstance(worker_health, dict):
health_reason_counts = _count_mapping(worker_health.get("health_reason_counts"))
stale_drop = _stale_drop_guard_context(raw_context)
return {
"health_reason_counts": health_reason_counts,
"policy_decision_reason_counts": _count_mapping(
Expand All @@ -1587,6 +1631,59 @@ def _orchestrator_operation_guard_detail(
"scheduler_delay_event_count": _non_negative_number(
raw_context.get("scheduler_delay_event_count")
),
"stale_drop_count": stale_drop.get("stale_drop_count", 0.0),
"total_drop_count": stale_drop.get("total_drop_count", 0.0),
"stale_drop_rate": stale_drop.get("stale_drop_rate", 0.0),
"stale_drop_reasons": stale_drop.get("stale_drop_reasons", {}),
"stale_drop_reason_classes": stale_drop.get(
"stale_drop_reason_classes", []
),
"tasks_with_stale_drop": stale_drop.get("tasks_with_stale_drop", []),
"latest_stale_drop_event": stale_drop.get("latest_stale_drop_event", {}),
"stale_drop_boundary_markers_valid": stale_drop.get(
"boundary_markers_valid"
),
"stale_drop_decision_owner": stale_drop.get("decision_owner"),
"stale_drop_scheduler_owner": stale_drop.get("scheduler_owner"),
"stale_drop_not_a_deployment_decision": stale_drop.get(
"not_a_deployment_decision"
),
}


def _stale_drop_guard_context(raw_context: dict[str, Any]) -> dict[str, Any]:
    """Normalize the stale-drop fields found in an evidence raw context.

    Fields are read from a flattened mapping built as follows:

    * When ``raw_context["stale_drop_summary"]`` is a dict that itself holds
      a dict under ``"summary"``, the nested summary is overlaid with the
      wrapper's own keys (wrapper wins) and the ``"summary"`` key is dropped.
      Top-level ``raw_context`` keys are not consulted in this case.
    * Otherwise the ``stale_drop_summary`` dict (or an empty dict when it is
      missing or not a dict) is overlaid with ``raw_context`` itself, so
      top-level keys win.

    Args:
        raw_context: Raw context mapping of a guard evidence item.

    Returns:
        A dict with ``stale_drop_count``, ``total_drop_count``,
        ``stale_drop_rate``, ``stale_drop_reasons``,
        ``stale_drop_reason_classes``, ``tasks_with_stale_drop``,
        ``latest_stale_drop_event`` (a shallow copy, or ``{}`` when the
        value is not a dict) and the ``boundary_markers_valid``,
        ``decision_owner``, ``scheduler_owner`` and
        ``not_a_deployment_decision`` markers passed through unchanged.
    """
    wrapper = raw_context.get("stale_drop_summary")
    if not isinstance(wrapper, dict):
        wrapper = {}

    nested = wrapper.get("summary")
    if isinstance(nested, dict):
        # Nested summary first, then wrapper keys on top of it.
        fields = dict(nested)
        fields.update(wrapper)
        fields.pop("summary", None)
    else:
        # No nested summary: wrapper first, then top-level keys on top.
        fields = dict(wrapper)
        fields.update(raw_context)

    total = _non_negative_number(fields.get("total_drop_count"))
    stale = _non_negative_number(fields.get("stale_drop_count"))
    rate = _non_negative_number(fields.get("stale_drop_rate"))
    # Derive the rate from the counts only when no positive rate was
    # supplied and the denominator is non-zero.
    if rate <= 0 and total > 0:
        rate = stale / total

    reasons = _count_mapping(fields.get("stale_drop_reasons"))
    reason_classes = _string_list(fields.get("stale_drop_reason_classes"))
    affected_tasks = _string_list(fields.get("tasks_with_stale_drop"))

    latest = fields.get("latest_stale_drop_event")
    latest_event = dict(latest) if isinstance(latest, dict) else {}

    return {
        "stale_drop_count": stale,
        "total_drop_count": total,
        "stale_drop_rate": rate,
        "stale_drop_reasons": reasons,
        "stale_drop_reason_classes": reason_classes,
        "tasks_with_stale_drop": affected_tasks,
        "latest_stale_drop_event": latest_event,
        "boundary_markers_valid": fields.get("boundary_markers_valid"),
        "decision_owner": fields.get("decision_owner"),
        "scheduler_owner": fields.get("scheduler_owner"),
        "not_a_deployment_decision": fields.get("not_a_deployment_decision"),
    }


Expand Down
Loading
Loading