diff --git a/bugzooka/analysis/pr_analyzer.py b/bugzooka/analysis/pr_analyzer.py index 7d206b7..79d647c 100644 --- a/bugzooka/analysis/pr_analyzer.py +++ b/bugzooka/analysis/pr_analyzer.py @@ -49,37 +49,41 @@ def _sanitize_gemini_output(result: str) -> str: return sanitized -def _parse_pr_request(text: str) -> Optional[Tuple[str, str, str, str]]: +def _parse_pr_request(text: str) -> Optional[Tuple[str, str, list, str]]: """ Parse PR analysis request from text. - Expected format: "analyze pr: {github_url}, compare with {version}" + Expected formats: + Single PR: "analyze pr: {github_url}, compare with {version}" + Multi PR: "analyze pr: {url1} {url2}, compare with {version}" - Both PR URL and version are required. + All PRs must be from the same org/repo. Both PR URL(s) and version are required. :param text: Message text to parse - :return: Tuple of (organization, repository, pr_number, version) or None if invalid + :return: Tuple of (organization, repository, pr_numbers, version) or None if invalid """ - # Match GitHub PR URLs like https://github.com/org/repo/pull/123 pr_pattern = r"https?://github\.com/([^/]+)/([^/]+)/pull/(\d+)" - pr_match = re.search(pr_pattern, text) + pr_matches = re.findall(pr_pattern, text) - if not pr_match: + if not pr_matches: return None - org, repo, pr_number = pr_match.groups() + org, repo, _ = pr_matches[0] + for match_org, match_repo, _ in pr_matches[1:]: + if match_org != org or match_repo != repo: + return None + + pr_numbers = [m[2] for m in pr_matches] - # Extract version from "compare with X.XX" pattern - REQUIRED version_pattern = r"compare\s+with\s+(\d+\.\d+)" version_match = re.search(version_pattern, text, re.IGNORECASE) if not version_match: - # Version is required - return None to trigger validation error return None version = version_match.group(1) - return org, repo, pr_number, version + return org, repo, pr_numbers, version async def analyze_pr_with_gemini(text: str, channel_id: str = None) -> dict: @@ -117,9 +121,11 @@ async def analyze_pr_with_gemini(text: str, channel_id: str = None) -> dict: ), ) - org, repo, pr_number, version = parsed + org, repo, pr_numbers, version = parsed + pr_display = ", ".join(f"#{pr}" for pr in pr_numbers) logger.info( - f"🔍 PR analysis requested for {org}/{repo}/pull/{pr_number} (OpenShift {version})" + "PR analysis requested for %s/%s %s (OpenShift %s)", + org, repo, pr_display, version, ) # Ensure MCP client is initialized @@ -132,19 +138,23 @@ async def analyze_pr_with_gemini(text: str, channel_id: str = None) -> dict: try: logger.info( - "Starting PR performance analysis: %s/%s#%s (OpenShift %s)", + "Starting PR performance analysis: %s/%s %s (OpenShift %s)", org, repo, - pr_number, + pr_display, version, ) # Create prompt for PR analysis using centralized prompts - pr_url = f"https://github.com/{org}/{repo}/pull/{pr_number}" + pr_urls = ", ".join( + f"https://github.com/{org}/{repo}/pull/{pr}" for pr in pr_numbers + ) + pr_numbers_str = ",".join(pr_numbers) system_prompt = PR_PERFORMANCE_ANALYSIS_PROMPT["system"] user_prompt = PR_PERFORMANCE_ANALYSIS_PROMPT["user"].format( - org=org, repo=repo, pr_number=pr_number, pr_url=pr_url, version=version + org=org, repo=repo, pr_numbers=pr_numbers_str, + pr_urls=pr_urls, version=version, ) assistant_prompt = PR_PERFORMANCE_ANALYSIS_PROMPT["assistant"] @@ -184,22 +194,22 @@ async def analyze_pr_with_gemini(text: str, channel_id: str = None) -> dict: # Also check if the result is very short (likely just a "no data" message) if len(result.strip()) < 200: logger.info( - "No performance test data found for PR %s/%s#%s", + "No performance test data found for PR %s/%s %s", org, repo, - pr_number, + pr_display, ) return make_response( success=True, message=( - f"No performance test results found for PR #{pr_number}\n\n" + f"No performance test results found for PR {pr_display}\n\n" f"This could mean:\n" f"- Performance tests haven't run yet for this PR\n" f"- The PR doesn't trigger performance test jobs\n" f"- Test results are not yet available in the Orion database\n\n" f"Check back later or verify that performance tests are configured for this repository." ), - pr_info=(org, repo, pr_number, version), + pr_info=(org, repo, pr_numbers, version), ) logger.info("PR analysis completed successfully (%d chars)", len(result)) @@ -210,7 +220,7 @@ async def analyze_pr_with_gemini(text: str, channel_id: str = None) -> dict: return make_response( success=True, message=sanitized_result, - pr_info=(org, repo, pr_number, version), + pr_info=(org, repo, pr_numbers, version), ) except Exception as e: diff --git a/bugzooka/analysis/prompts.py b/bugzooka/analysis/prompts.py index 7848b2d..d37c4ef 100644 --- a/bugzooka/analysis/prompts.py +++ b/bugzooka/analysis/prompts.py @@ -75,29 +75,39 @@ 5. **Sort All Metrics**: ALWAYS sort metrics by absolute percentage change (highest to lowest) in all tables and lists. 6. **Format Output**: Use Slack-friendly formatting as specified in user instructions. """, - "user": """Please analyze the performance of this pull request: + "user": """Please analyze the performance of the following pull request(s): - Organization: {org} - Repository: {repo} -- Pull Request Number: {pr_number} -- PR URL: {pr_url} +- Pull Request Number(s): {pr_numbers} +- PR URL(s): {pr_urls} - OpenShift Version: {version} +**Tool Calling Instructions:** +- If analyzing a SINGLE PR, call `openshift_report_on_pr` with the `pull_request` parameter set to the PR number. +- If analyzing MULTIPLE PRs, call `openshift_report_on_pr` with the `pull_requests` parameter set to a comma-separated string of PR numbers (e.g. "3169,3170"). The tool returns results for each PR under the `pulls` key. + **Required Output Structure:** Output ONLY the sections below with ABSOLUTELY NO additional commentary, thinking process, or meta-commentary. *Performance Impact Assessment* -- Overall Impact: State EXACTLY one of: ":exclamation: *Regression* :exclamation:" (only if 1 or more significant regression found), ":rocket: *Improvement* :rocket:" (only if 1 or more significant improvement found), ":arrow_right: *Neutral* :arrow_right:" (no significant changes) -- Significant regressions (≥10%): List with 🛑 emoji, metric name and short config name, grouped by config. ONLY include if |change| >= 10% AND classified as regression. Do not use bold font, omit section entirely if none found. -- Significant improvements (≥10%): List with 🚀 emoji, metric name and short config name, grouped by config. ONLY include if |change| >= 10% AND classified as improvement. Do not use bold font, omit section entirely if none found. -- Moderate regressions (5-10%): List with ⚠️ emoji, metric name and short config name, grouped by config. ONLY include if 5% <= |change| < 10% AND classified as regression. Do not use bold font, omit section entirely if none found. -- Moderate improvements (5-10%): List with ✅ emoji, metric name and short config name, grouped by config. ONLY include if 5% <= |change| < 10% AND classified as improvement. Do not use bold font, omit section entirely if none found. +- For SINGLE PR: show one assessment section. +- For MULTIPLE PRs: show a SEPARATE assessment section per PR (e.g. "*PR #3169*" then "*PR #3170*"). Each PR is independently compared against the periodic baseline — do NOT compare PRs against each other. +- Per-PR assessment structure: + - Overall Impact: State EXACTLY one of: ":exclamation: *Regression* :exclamation:" (only if 1 or more significant regression found), ":rocket: *Improvement* :rocket:" (only if 1 or more significant improvement found), ":arrow_right: *Neutral* :arrow_right:" (no significant changes) + - Significant regressions (≥10%): List with 🛑 emoji, metric name, grouped by config. ONLY include if |change| >= 10% AND classified as regression. Do not use bold font, omit section entirely if none found. + - Significant improvements (≥10%): List with 🚀 emoji, metric name, grouped by config. ONLY include if |change| >= 10% AND classified as improvement. Do not use bold font, omit section entirely if none found. + - Moderate regressions (5-10%): List with ⚠️ emoji, metric name, grouped by config. ONLY include if 5% <= |change| < 10% AND classified as regression. Do not use bold font, omit section entirely if none found. + - Moderate improvements (5-10%): List with ✅ emoji, metric name, grouped by config. ONLY include if 5% <= |change| < 10% AND classified as improvement. Do not use bold font, omit section entirely if none found. - End this section with a line of 80 equals signs. *ONLY IF SIGNIFICANT REGRESSION IS FOUND, INCLUDE THE FOLLOWING SECTION* *Regression Analysis*: -1. Root Cause: Identify the most likely cause of the significant regression. Be as specific as possible. -2. Impact: Describe the impact of the significant regression on the system. -3. Recommendations: Suggest corrective actions to address the significant regression. +- For SINGLE PR: one regression analysis section. +- For MULTIPLE PRs: a SEPARATE regression analysis per PR that has significant regressions. Label each with the PR number (e.g. "*Regression Analysis (PR #3169)*"). +- Per-PR regression analysis structure: + 1. Root Cause: Identify the most likely cause of the significant regression. Be as specific as possible. + 2. Impact: Describe the impact of the significant regression on the system. + 3. Recommendations: Suggest corrective actions to address the significant regression. End this section with a line of 80 equals signs. *Most Impacted Metrics* @@ -105,7 +115,10 @@ - Transform config name to readable format: "/orion/examples/trt-external-payload-cluster-density.yaml" → "cluster-density" - Table header: e.g. *Config: cluster-density* - MANDATORY: Include ONLY top 10 metrics sorted by absolute percentage change (highest impact first) -- Columns: Metric | Baseline | PR Value | Change (%) +- For SINGLE PR — Columns: Metric | Baseline | PR Value | Change (%) +- For MULTIPLE PRs — use a single combined table with all PRs side by side: + Columns: Metric | Baseline | PR #X Value | PR #X Change(%) | PR #Y Value | PR #Y Change(%) | ... + This lets the user compare PRs at a glance without scrolling between separate tables. - Format tables with `code` blocks, adjust column widths to fit data - No emojis in tables - Separate each config section with 80 equals signs. diff --git a/bugzooka/integrations/slack_socket_listener.py b/bugzooka/integrations/slack_socket_listener.py index 9fc7519..c2d449f 100644 --- a/bugzooka/integrations/slack_socket_listener.py +++ b/bugzooka/integrations/slack_socket_listener.py @@ -168,10 +168,12 @@ def _process_mention(self, event: Dict[str, Any]) -> None: ) if analysis_result["success"]: - org, repo, pr_number, version = analysis_result["pr_info"] + org, repo, pr_numbers, version = analysis_result["pr_info"] _pr_repo = f"{org}/{repo}" + pr_display = ", ".join(f"#{pr}" for pr in pr_numbers) self.logger.info( - f"✅ Sent PR analysis for {org}/{repo}#{pr_number} (OpenShift {version}) to {user}" + "Sent PR analysis for %s/%s %s (OpenShift %s) to %s", + org, repo, pr_display, version, user, ) else: self.logger.warning( diff --git a/tests/test_pr_analyzer.py b/tests/test_pr_analyzer.py new file mode 100644 index 0000000..655f8f8 --- /dev/null +++ b/tests/test_pr_analyzer.py @@ -0,0 +1,79 @@ +""" +Tests for PR analysis request parsing and output sanitization. +""" +from bugzooka.analysis.pr_analyzer import _parse_pr_request, _sanitize_gemini_output + + +class TestParsePrRequest: + """Tests for _parse_pr_request() — single and multi-PR parsing.""" + + def test_single_pr(self): + text = "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169, compare with 5.0" + result = _parse_pr_request(text) + assert result == ("openshift", "ovn-kubernetes", ["3169"], "5.0") + + def test_multi_pr_space_separated(self): + text = ( + "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169 " + "https://github.com/openshift/ovn-kubernetes/pull/3170, compare with 5.0" + ) + result = _parse_pr_request(text) + assert result == ("openshift", "ovn-kubernetes", ["3169", "3170"], "5.0") + + def test_multi_pr_three_prs(self): + text = ( + "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/100 " + "https://github.com/openshift/ovn-kubernetes/pull/200 " + "https://github.com/openshift/ovn-kubernetes/pull/300, compare with 4.19" + ) + result = _parse_pr_request(text) + assert result == ("openshift", "ovn-kubernetes", ["100", "200", "300"], "4.19") + + def test_mixed_repos_returns_none(self): + text = ( + "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169 " + "https://github.com/openshift/installer/pull/100, compare with 5.0" + ) + assert _parse_pr_request(text) is None + + def test_mixed_orgs_returns_none(self): + text = ( + "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169 " + "https://github.com/other-org/ovn-kubernetes/pull/100, compare with 5.0" + ) + assert _parse_pr_request(text) is None + + def test_no_version_returns_none(self): + text = "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169" + assert _parse_pr_request(text) is None + + def test_no_url_returns_none(self): + text = "analyze pr: some random text, compare with 5.0" + assert _parse_pr_request(text) is None + + def test_version_case_insensitive(self): + text = "analyze pr: https://github.com/openshift/ovn-kubernetes/pull/3169, Compare With 4.19" + result = _parse_pr_request(text) + assert result == ("openshift", "ovn-kubernetes", ["3169"], "4.19") + + def test_http_url(self): + text = "analyze pr: http://github.com/openshift/ovn-kubernetes/pull/3169, compare with 5.0" + result = _parse_pr_request(text) + assert result == ("openshift", "ovn-kubernetes", ["3169"], "5.0") + + +class TestSanitizeGeminiOutput: + """Tests for _sanitize_gemini_output().""" + + def test_removes_thinking_before_marker(self): + result = "Let me think about this...\n*Performance Impact Assessment*\nActual content" + sanitized = _sanitize_gemini_output(result) + assert sanitized == "*Performance Impact Assessment*\nActual content" + + def test_no_marker_returns_original(self): + result = "Some output without the marker" + assert _sanitize_gemini_output(result) == result + + def test_marker_at_start_returns_unchanged(self): + result = "*Performance Impact Assessment*\nContent here" + assert _sanitize_gemini_output(result) == result