diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 864d864..c2141c0 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -108,3 +108,4 @@ user-study/images @StevenSong
user-study/metadata.csv @StevenSong
user-study/prepare_samples.ipynb @StevenSong
user-study/results.csv @StevenSong
+user-study/user_study_analysis.ipynb @sahilsethi0105
diff --git a/user-study/user_study_analysis.ipynb b/user-study/user_study_analysis.ipynb
new file mode 100644
index 0000000..21d249f
--- /dev/null
+++ b/user-study/user_study_analysis.ipynb
@@ -0,0 +1,2176 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a8401e54",
+ "metadata": {},
+ "source": [
+ "# ProtoSSL User study analysis\n",
+ "\n",
+ "This notebook conducts:\n",
+ "\n",
+ "- **Primary analysis:** participant-level paired comparison of the proportion of responses rated as good for ProtoSSL vs ProtoECGNet, done separately for the two tasks.\n",
+ "- **Primary test:** two-sided **Wilcoxon signed-rank test** across participants.\n",
+ "- **Comparative A/B/Both/Neither question:** descriptive summaries of the stated preferences, decoded back to the underlying models.\n",
+ "- **Inter-rater agreement:** **Fleiss' kappa** for the binary yes/no ratings, reported overall and by label.\n",
+ "\n",
+ "The **participants** are the primary unit of inference.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "73ba5be4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from scipy.stats import wilcoxon, ttest_rel, binomtest, t as tdist\n",
+ "from statsmodels.stats.inter_rater import fleiss_kappa\n",
+ "\n",
+ "# Display settings: show every column, use a 200-character console width, print floats to 4 decimals.\n",
+ "for option_name, option_value in (\n",
+ "    (\"display.max_columns\", None),\n",
+ "    (\"display.width\", 200),\n",
+ "    (\"display.float_format\", lambda x: f\"{x:.4f}\"),\n",
+ "):\n",
+ "    pd.set_option(option_name, option_value)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "d094a664",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results shape: (7, 131)\n",
+ "metadata shape: (20, 15)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " record_id | \n",
+ " redcap_survey_identifier | \n",
+ " user_study_form_timestamp | \n",
+ " consent | \n",
+ " prototypes_quality_choices | \n",
+ " prototypea_quality | \n",
+ " prototypeb_quality | \n",
+ " prototypes_quality_choices_2 | \n",
+ " explanation_a | \n",
+ " explanation_b | \n",
+ " case1_prototypes_quality_choices | \n",
+ " case1_prototypea_quality | \n",
+ " case1_prototypeb_quality | \n",
+ " case1_prototypes_quality_choices_2 | \n",
+ " case1_explanation_a | \n",
+ " case1_explanation_b | \n",
+ " case2_prototypes_quality_choices | \n",
+ " case2_prototypea_quality | \n",
+ " case2_prototypeb_quality | \n",
+ " case2_prototypes_quality_choices_2 | \n",
+ " case2_explanation_a | \n",
+ " case2_explanation_b | \n",
+ " case3_prototypes_quality_choices | \n",
+ " case3_prototypea_quality | \n",
+ " case3_prototypeb_quality | \n",
+ " case3_prototypes_quality_choices_2 | \n",
+ " case3_explanation_a | \n",
+ " case3_explanation_b | \n",
+ " case4_prototypes_quality_choices | \n",
+ " case4_prototypea_quality | \n",
+ " case4_prototypeb_quality | \n",
+ " case4_prototypes_quality_choices_2 | \n",
+ " case4_explanation_a | \n",
+ " case4_explanation_b | \n",
+ " case5_prototypes_quality_choices | \n",
+ " case5_prototypea_quality | \n",
+ " case5_prototypeb_quality | \n",
+ " case5_prototypes_quality_choices_2 | \n",
+ " case5_explanation_a | \n",
+ " case5_explanation_b | \n",
+ " case6_prototypes_quality_choices | \n",
+ " case6_prototypea_quality | \n",
+ " case6_prototypeb_quality | \n",
+ " case6_prototypes_quality_choices_2 | \n",
+ " case6_explanation_a | \n",
+ " case6_explanation_b | \n",
+ " case7_prototypes_quality_choices | \n",
+ " case7_prototypea_quality | \n",
+ " case7_prototypeb_quality | \n",
+ " case7_prototypes_quality_choices_2 | \n",
+ " case7_explanation_a | \n",
+ " case7_explanation_b | \n",
+ " case8_prototypes_quality_choices | \n",
+ " case8_prototypea_quality | \n",
+ " case8_prototypeb_quality | \n",
+ " case8_prototypes_quality_choices_2 | \n",
+ " case8_explanation_a | \n",
+ " case8_explanation_b | \n",
+ " case9_prototypes_quality_choices | \n",
+ " case9_prototypea_quality | \n",
+ " case9_prototypeb_quality | \n",
+ " case9_prototypes_quality_choices_2 | \n",
+ " case9_explanation_a | \n",
+ " case9_explanation_b | \n",
+ " case10_prototypes_quality_choices | \n",
+ " case10_prototypea_quality | \n",
+ " case10_prototypeb_quality | \n",
+ " case10_prototypes_quality_choices_2 | \n",
+ " case10_explanation_a | \n",
+ " case10_explanation_b | \n",
+ " case11_prototypes_quality_choices | \n",
+ " case11_prototypea_quality | \n",
+ " case11_prototypeb_quality | \n",
+ " case11_prototypes_quality_choices_2 | \n",
+ " case11_explanation_a | \n",
+ " case11_explanation_b | \n",
+ " case12_prototypes_quality_choices | \n",
+ " case12_prototypea_quality | \n",
+ " case12_prototypeb_quality | \n",
+ " case12_prototypes_quality_choices_2 | \n",
+ " case12_explanation_a | \n",
+ " case12_explanation_b | \n",
+ " case13_prototypes_quality_choices | \n",
+ " case13_prototypea_quality | \n",
+ " case13_prototypeb_quality | \n",
+ " case13_prototypes_quality_choices_2 | \n",
+ " case13_explanation_a | \n",
+ " case13_explanation_b | \n",
+ " case14_prototypes_quality_choices | \n",
+ " case14_prototypea_quality | \n",
+ " case14_prototypeb_quality | \n",
+ " case14_prototypes_quality_choices_2 | \n",
+ " case14_explanation_a | \n",
+ " case14_explanation_b | \n",
+ " case15_prototypes_quality_choices | \n",
+ " case15_prototypea_quality | \n",
+ " case15_prototypeb_quality | \n",
+ " case15_prototypes_quality_choices_2 | \n",
+ " case15_explanation_a | \n",
+ " case15_explanation_b | \n",
+ " case16_prototypes_quality_choices | \n",
+ " case16_prototypea_quality | \n",
+ " case16_prototypeb_quality | \n",
+ " case16_prototypes_quality_choices_2 | \n",
+ " case16_explanation_a | \n",
+ " case16_explanation_b | \n",
+ " case17_prototypes_quality_choices | \n",
+ " case17_prototypea_quality | \n",
+ " case17_prototypeb_quality | \n",
+ " case17_prototypes_quality_choices_2 | \n",
+ " case17_explanation_a | \n",
+ " case17_explanation_b | \n",
+ " case18_prototypes_quality_choices | \n",
+ " case18_prototypea_quality | \n",
+ " case18_prototypeb_quality | \n",
+ " case18_prototypes_quality_choices_2 | \n",
+ " case18_explanation_a | \n",
+ " case18_explanation_b | \n",
+ " case19_prototypes_quality_choices | \n",
+ " case19_prototypea_quality | \n",
+ " case19_prototypeb_quality | \n",
+ " case19_prototypes_quality_choices_2 | \n",
+ " case19_explanation_a | \n",
+ " case19_explanation_b | \n",
+ " case20_prototypes_quality_choices | \n",
+ " case20_prototypea_quality | \n",
+ " case20_prototypeb_quality | \n",
+ " case20_prototypes_quality_choices_2 | \n",
+ " case20_explanation_a | \n",
+ " case20_explanation_b | \n",
+ " user_study_form_complete | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 5 | \n",
+ " NaN | \n",
+ " 2026-04-06 18:18:56 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 7 | \n",
+ " NaN | \n",
+ " 2026-04-09 11:26:53 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " record_id redcap_survey_identifier user_study_form_timestamp consent prototypes_quality_choices prototypea_quality prototypeb_quality prototypes_quality_choices_2 explanation_a \\\n",
+ "0 5 NaN 2026-04-06 18:18:56 1 2 0 1 2 0 \n",
+ "1 7 NaN 2026-04-09 11:26:53 1 2 0 1 2 0 \n",
+ "\n",
+ " explanation_b case1_prototypes_quality_choices case1_prototypea_quality case1_prototypeb_quality case1_prototypes_quality_choices_2 case1_explanation_a case1_explanation_b \\\n",
+ "0 1 3 1 1 3 1 1 \n",
+ "1 1 3 1 1 1 1 1 \n",
+ "\n",
+ " case2_prototypes_quality_choices case2_prototypea_quality case2_prototypeb_quality case2_prototypes_quality_choices_2 case2_explanation_a case2_explanation_b \\\n",
+ "0 1 1 1 1 1 1 \n",
+ "1 3 1 1 3 1 1 \n",
+ "\n",
+ " case3_prototypes_quality_choices case3_prototypea_quality case3_prototypeb_quality case3_prototypes_quality_choices_2 case3_explanation_a case3_explanation_b \\\n",
+ "0 2 1 1 2 1 1 \n",
+ "1 3 0 0 2 0 1 \n",
+ "\n",
+ " case4_prototypes_quality_choices case4_prototypea_quality case4_prototypeb_quality case4_prototypes_quality_choices_2 case4_explanation_a case4_explanation_b \\\n",
+ "0 3 1 1 1 1 1 \n",
+ "1 1 1 0 1 1 0 \n",
+ "\n",
+ " case5_prototypes_quality_choices case5_prototypea_quality case5_prototypeb_quality case5_prototypes_quality_choices_2 case5_explanation_a case5_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 3 1 1 3 1 1 \n",
+ "\n",
+ " case6_prototypes_quality_choices case6_prototypea_quality case6_prototypeb_quality case6_prototypes_quality_choices_2 case6_explanation_a case6_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 1 1 0 1 1 0 \n",
+ "\n",
+ " case7_prototypes_quality_choices case7_prototypea_quality case7_prototypeb_quality case7_prototypes_quality_choices_2 case7_explanation_a case7_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 1 1 1 1 1 0 \n",
+ "\n",
+ " case8_prototypes_quality_choices case8_prototypea_quality case8_prototypeb_quality case8_prototypes_quality_choices_2 case8_explanation_a case8_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 3 1 1 3 1 1 \n",
+ "\n",
+ " case9_prototypes_quality_choices case9_prototypea_quality case9_prototypeb_quality case9_prototypes_quality_choices_2 case9_explanation_a case9_explanation_b \\\n",
+ "0 3 1 1 1 1 1 \n",
+ "1 3 1 1 1 1 0 \n",
+ "\n",
+ " case10_prototypes_quality_choices case10_prototypea_quality case10_prototypeb_quality case10_prototypes_quality_choices_2 case10_explanation_a case10_explanation_b \\\n",
+ "0 2 1 1 2 1 1 \n",
+ "1 3 1 1 2 1 0 \n",
+ "\n",
+ " case11_prototypes_quality_choices case11_prototypea_quality case11_prototypeb_quality case11_prototypes_quality_choices_2 case11_explanation_a case11_explanation_b \\\n",
+ "0 2 1 1 3 1 1 \n",
+ "1 1 1 0 1 1 0 \n",
+ "\n",
+ " case12_prototypes_quality_choices case12_prototypea_quality case12_prototypeb_quality case12_prototypes_quality_choices_2 case12_explanation_a case12_explanation_b \\\n",
+ "0 2 1 1 3 1 1 \n",
+ "1 2 1 1 3 1 1 \n",
+ "\n",
+ " case13_prototypes_quality_choices case13_prototypea_quality case13_prototypeb_quality case13_prototypes_quality_choices_2 case13_explanation_a case13_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 2 0 1 3 1 1 \n",
+ "\n",
+ " case14_prototypes_quality_choices case14_prototypea_quality case14_prototypeb_quality case14_prototypes_quality_choices_2 case14_explanation_a case14_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 3 1 1 1 1 1 \n",
+ "\n",
+ " case15_prototypes_quality_choices case15_prototypea_quality case15_prototypeb_quality case15_prototypes_quality_choices_2 case15_explanation_a case15_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 3 1 1 2 1 1 \n",
+ "\n",
+ " case16_prototypes_quality_choices case16_prototypea_quality case16_prototypeb_quality case16_prototypes_quality_choices_2 case16_explanation_a case16_explanation_b \\\n",
+ "0 2 0 1 2 0 1 \n",
+ "1 2 0 1 2 0 1 \n",
+ "\n",
+ " case17_prototypes_quality_choices case17_prototypea_quality case17_prototypeb_quality case17_prototypes_quality_choices_2 case17_explanation_a case17_explanation_b \\\n",
+ "0 3 1 1 1 1 1 \n",
+ "1 1 1 0 1 1 0 \n",
+ "\n",
+ " case18_prototypes_quality_choices case18_prototypea_quality case18_prototypeb_quality case18_prototypes_quality_choices_2 case18_explanation_a case18_explanation_b \\\n",
+ "0 2 0 1 2 0 1 \n",
+ "1 2 0 1 2 0 1 \n",
+ "\n",
+ " case19_prototypes_quality_choices case19_prototypea_quality case19_prototypeb_quality case19_prototypes_quality_choices_2 case19_explanation_a case19_explanation_b \\\n",
+ "0 3 1 1 3 1 1 \n",
+ "1 1 1 1 1 1 0 \n",
+ "\n",
+ " case20_prototypes_quality_choices case20_prototypea_quality case20_prototypeb_quality case20_prototypes_quality_choices_2 case20_explanation_a case20_explanation_b user_study_form_complete \n",
+ "0 3 1 1 2 1 1 2 \n",
+ "1 2 0 1 2 0 1 2 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Raw REDCap export and the per-case metadata table used to decode it.\n",
+ "results = pd.read_csv(\"results.csv\")\n",
+ "metadata = pd.read_csv(\"metadata.csv\")\n",
+ "\n",
+ "for shape_caption, frame in ((\"results shape:\", results), (\"metadata shape:\", metadata)):\n",
+ "    print(shape_caption, frame.shape)\n",
+ "results.head(2)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f870f0f",
+ "metadata": {},
+ "source": [
+ "### Decode REDCap responses into analysis tables\n",
+ "\n",
+ "`yesno` contains one row per participant × case × task × model for the binary yes/no questions.\n",
+ "\n",
+ "`prefs` contains one row per participant × case × task for the A/B/Both/Neither comparative question, decoded back to the actual model identities using the metadata file.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "16acb6b2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "yesno shape: (560, 6)\n",
+ "prefs shape: (280, 5)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " participant | \n",
+ " case_id | \n",
+ " label | \n",
+ " task | \n",
+ " model | \n",
+ " good | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " AMI | \n",
+ " global | \n",
+ " ProtoSSL | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " AMI | \n",
+ " global | \n",
+ " ProtoECGNet | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " AMI | \n",
+ " paired | \n",
+ " ProtoSSL | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " AMI | \n",
+ " paired | \n",
+ " ProtoECGNet | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 2 | \n",
+ " AMI | \n",
+ " global | \n",
+ " ProtoSSL | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " participant case_id label task model good\n",
+ "0 5 1 AMI global ProtoSSL 1\n",
+ "1 5 1 AMI global ProtoECGNet 1\n",
+ "2 5 1 AMI paired ProtoSSL 1\n",
+ "3 5 1 AMI paired ProtoECGNet 1\n",
+ "4 5 2 AMI global ProtoSSL 1"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Per-case lookup built from the metadata file: label plus the letter (A/B) each model was shown as.\n",
+ "case_map = (\n",
+ "    metadata.rename(\n",
+ "        columns={\n",
+ "            \"Study Index\": \"case_id\",\n",
+ "            \"Label\": \"label\",\n",
+ "            \"ProtoSSL Assignment\": \"ssl_assignment\",\n",
+ "            \"ProtoECGNet Assignment\": \"ecg_assignment\",\n",
+ "        }\n",
+ "    )[[\"case_id\", \"label\", \"ssl_assignment\", \"ecg_assignment\"]]\n",
+ "    .copy()\n",
+ ")\n",
+ "case_map[\"case_id\"] = case_map[\"case_id\"].astype(int)\n",
+ "\n",
+ "# Fail loudly on malformed metadata. Without these checks an unexpected assignment value\n",
+ "# (e.g. a lowercase or padded letter) would silently attribute BOTH letters to ProtoECGNet.\n",
+ "if not case_map[\"case_id\"].is_unique:\n",
+ "    raise ValueError(\"metadata has duplicate 'Study Index' values\")\n",
+ "valid_letters = case_map[[\"ssl_assignment\", \"ecg_assignment\"]].isin([\"A\", \"B\"]).all(axis=1)\n",
+ "distinct_letters = case_map[\"ssl_assignment\"] != case_map[\"ecg_assignment\"]\n",
+ "bad_cases = case_map.loc[~(valid_letters & distinct_letters), \"case_id\"].tolist()\n",
+ "if bad_cases:\n",
+ "    raise ValueError(f\"Invalid A/B model assignment in metadata for case_id(s): {bad_cases}\")\n",
+ "\n",
+ "# Index once so the per-case lookup is not a boolean scan repeated for every participant.\n",
+ "case_lookup = case_map.set_index(\"case_id\")\n",
+ "case_ids = [int(case_id) for case_id in sorted(case_lookup.index)]\n",
+ "\n",
+ "pref_code = {1: \"A\", 2: \"B\", 3: \"Both\", 4: \"Neither\"}\n",
+ "\n",
+ "\n",
+ "def task_columns(case_id):\n",
+ "    \"\"\"Return (task, option-A column, option-B column, preference column) tuples for one case.\"\"\"\n",
+ "    prefix = f\"case{case_id}_\"\n",
+ "    return [\n",
+ "        (\n",
+ "            \"global\",\n",
+ "            f\"{prefix}prototypea_quality\",\n",
+ "            f\"{prefix}prototypeb_quality\",\n",
+ "            f\"{prefix}prototypes_quality_choices\",\n",
+ "        ),\n",
+ "        (\n",
+ "            \"paired\",\n",
+ "            f\"{prefix}explanation_a\",\n",
+ "            f\"{prefix}explanation_b\",\n",
+ "            f\"{prefix}prototypes_quality_choices_2\",\n",
+ "        ),\n",
+ "    ]\n",
+ "\n",
+ "\n",
+ "# A missing answer would otherwise surface as an opaque int(NaN) conversion error mid-loop.\n",
+ "response_cols = [\n",
+ "    col\n",
+ "    for case_id in case_ids\n",
+ "    for _task, *task_cols in task_columns(case_id)\n",
+ "    for col in task_cols\n",
+ "]\n",
+ "incomplete = results.loc[results[response_cols].isna().any(axis=1), \"record_id\"].tolist()\n",
+ "if incomplete:\n",
+ "    raise ValueError(f\"Missing responses for record_id(s): {incomplete}\")\n",
+ "\n",
+ "yes_rows = []\n",
+ "pref_rows = []\n",
+ "\n",
+ "for _, row in results.iterrows():\n",
+ "    participant = int(row[\"record_id\"])\n",
+ "    for case_id in case_ids:\n",
+ "        meta_row = case_lookup.loc[case_id]\n",
+ "        # Shown letter -> actual model for this case (validated above to be exactly A and B).\n",
+ "        model_by_letter = {\n",
+ "            meta_row[\"ssl_assignment\"]: \"ProtoSSL\",\n",
+ "            meta_row[\"ecg_assignment\"]: \"ProtoECGNet\",\n",
+ "        }\n",
+ "\n",
+ "        for task, a_col, b_col, pref_col in task_columns(case_id):\n",
+ "            for shown_letter, col in [(\"A\", a_col), (\"B\", b_col)]:\n",
+ "                yes_rows.append(\n",
+ "                    {\n",
+ "                        \"participant\": participant,\n",
+ "                        \"case_id\": case_id,\n",
+ "                        \"label\": meta_row[\"label\"],\n",
+ "                        \"task\": task,\n",
+ "                        \"model\": model_by_letter[shown_letter],\n",
+ "                        \"good\": int(row[col]),\n",
+ "                    }\n",
+ "                )\n",
+ "\n",
+ "            pref_value = pref_code[int(row[pref_col])]\n",
+ "            # \"Both\" / \"Neither\" are not model letters, so they pass through unchanged.\n",
+ "            actual_pref = model_by_letter.get(pref_value, pref_value)\n",
+ "\n",
+ "            pref_rows.append(\n",
+ "                {\n",
+ "                    \"participant\": participant,\n",
+ "                    \"case_id\": case_id,\n",
+ "                    \"label\": meta_row[\"label\"],\n",
+ "                    \"task\": task,\n",
+ "                    \"preference\": actual_pref,\n",
+ "                }\n",
+ "            )\n",
+ "\n",
+ "yesno = pd.DataFrame(yes_rows)\n",
+ "prefs = pd.DataFrame(pref_rows)\n",
+ "\n",
+ "print(\"yesno shape:\", yesno.shape)\n",
+ "print(\"prefs shape:\", prefs.shape)\n",
+ "yesno.head()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0dce4296",
+ "metadata": {},
+ "source": [
+ "### Descriptive summaries for the binary yes/no questions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "961b8054",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " task | \n",
+ " model | \n",
+ " n_yes | \n",
+ " n_total | \n",
+ " proportion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " ProtoECGNet | \n",
+ " 93 | \n",
+ " 140 | \n",
+ " 0.6643 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " global | \n",
+ " ProtoSSL | \n",
+ " 128 | \n",
+ " 140 | \n",
+ " 0.9143 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " paired | \n",
+ " ProtoECGNet | \n",
+ " 95 | \n",
+ " 140 | \n",
+ " 0.6786 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " paired | \n",
+ " ProtoSSL | \n",
+ " 116 | \n",
+ " 140 | \n",
+ " 0.8286 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " task model n_yes n_total proportion\n",
+ "0 global ProtoECGNet 93 140 0.6643\n",
+ "1 global ProtoSSL 128 140 0.9143\n",
+ "2 paired ProtoECGNet 95 140 0.6786\n",
+ "3 paired ProtoSSL 116 140 0.8286"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Pooled count and proportion of \"good\" ratings for every task x model combination.\n",
+ "overall_yesno = yesno.groupby([\"task\", \"model\"], as_index=False).agg(\n",
+ "    n_yes=(\"good\", \"sum\"),\n",
+ "    n_total=(\"good\", \"count\"),\n",
+ "    proportion=(\"good\", \"mean\"),\n",
+ ")\n",
+ "overall_yesno\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "da09502f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " task | \n",
+ " label | \n",
+ " model | \n",
+ " n_yes | \n",
+ " n_total | \n",
+ " proportion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " AMI | \n",
+ " ProtoECGNet | \n",
+ " 23 | \n",
+ " 35 | \n",
+ " 0.6571 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " global | \n",
+ " AMI | \n",
+ " ProtoSSL | \n",
+ " 30 | \n",
+ " 35 | \n",
+ " 0.8571 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " global | \n",
+ " CLBBB | \n",
+ " ProtoECGNet | \n",
+ " 33 | \n",
+ " 35 | \n",
+ " 0.9429 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " global | \n",
+ " CLBBB | \n",
+ " ProtoSSL | \n",
+ " 32 | \n",
+ " 35 | \n",
+ " 0.9143 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " global | \n",
+ " CRBBB | \n",
+ " ProtoECGNet | \n",
+ " 19 | \n",
+ " 35 | \n",
+ " 0.5429 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " global | \n",
+ " CRBBB | \n",
+ " ProtoSSL | \n",
+ " 32 | \n",
+ " 35 | \n",
+ " 0.9143 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " global | \n",
+ " PVC | \n",
+ " ProtoECGNet | \n",
+ " 18 | \n",
+ " 35 | \n",
+ " 0.5143 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " global | \n",
+ " PVC | \n",
+ " ProtoSSL | \n",
+ " 34 | \n",
+ " 35 | \n",
+ " 0.9714 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " paired | \n",
+ " AMI | \n",
+ " ProtoECGNet | \n",
+ " 23 | \n",
+ " 35 | \n",
+ " 0.6571 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " paired | \n",
+ " AMI | \n",
+ " ProtoSSL | \n",
+ " 28 | \n",
+ " 35 | \n",
+ " 0.8000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " paired | \n",
+ " CLBBB | \n",
+ " ProtoECGNet | \n",
+ " 33 | \n",
+ " 35 | \n",
+ " 0.9429 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " paired | \n",
+ " CLBBB | \n",
+ " ProtoSSL | \n",
+ " 27 | \n",
+ " 35 | \n",
+ " 0.7714 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " paired | \n",
+ " CRBBB | \n",
+ " ProtoECGNet | \n",
+ " 19 | \n",
+ " 35 | \n",
+ " 0.5429 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " paired | \n",
+ " CRBBB | \n",
+ " ProtoSSL | \n",
+ " 30 | \n",
+ " 35 | \n",
+ " 0.8571 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " paired | \n",
+ " PVC | \n",
+ " ProtoECGNet | \n",
+ " 20 | \n",
+ " 35 | \n",
+ " 0.5714 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " paired | \n",
+ " PVC | \n",
+ " ProtoSSL | \n",
+ " 31 | \n",
+ " 35 | \n",
+ " 0.8857 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " task label model n_yes n_total proportion\n",
+ "0 global AMI ProtoECGNet 23 35 0.6571\n",
+ "1 global AMI ProtoSSL 30 35 0.8571\n",
+ "2 global CLBBB ProtoECGNet 33 35 0.9429\n",
+ "3 global CLBBB ProtoSSL 32 35 0.9143\n",
+ "4 global CRBBB ProtoECGNet 19 35 0.5429\n",
+ "5 global CRBBB ProtoSSL 32 35 0.9143\n",
+ "6 global PVC ProtoECGNet 18 35 0.5143\n",
+ "7 global PVC ProtoSSL 34 35 0.9714\n",
+ "8 paired AMI ProtoECGNet 23 35 0.6571\n",
+ "9 paired AMI ProtoSSL 28 35 0.8000\n",
+ "10 paired CLBBB ProtoECGNet 33 35 0.9429\n",
+ "11 paired CLBBB ProtoSSL 27 35 0.7714\n",
+ "12 paired CRBBB ProtoECGNet 19 35 0.5429\n",
+ "13 paired CRBBB ProtoSSL 30 35 0.8571\n",
+ "14 paired PVC ProtoECGNet 20 35 0.5714\n",
+ "15 paired PVC ProtoSSL 31 35 0.8857"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Same pooled summary as above, additionally broken down by label.\n",
+ "label_yesno = yesno.groupby([\"task\", \"label\", \"model\"], as_index=False).agg(\n",
+ "    n_yes=(\"good\", \"sum\"),\n",
+ "    n_total=(\"good\", \"count\"),\n",
+ "    proportion=(\"good\", \"mean\"),\n",
+ ")\n",
+ "label_yesno\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "03a50af7",
+ "metadata": {},
+ "source": [
+ "### Primary analysis\n",
+ "\n",
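+ "For each participant and each task, compute the proportion of responses rated as good for each model across the 20 cases. Then compare ProtoSSL vs ProtoECGNet with a **paired Wilcoxon signed-rank test** (exact, two-sided). A paired t-test, a sign test, and a t-based 95% confidence interval for the mean difference are reported alongside it as sensitivity checks. Note that with 7 participants the smallest attainable two-sided exact Wilcoxon p-value is 2/2^7 (about 0.0156).\n"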
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3194ed44",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | model | \n",
+ " participant | \n",
+ " task | \n",
+ " ProtoECGNet | \n",
+ " ProtoSSL | \n",
+ " difference | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 5 | \n",
+ " global | \n",
+ " 0.9000 | \n",
+ " 1.0000 | \n",
+ " 0.1000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 5 | \n",
+ " paired | \n",
+ " 0.9000 | \n",
+ " 1.0000 | \n",
+ " 0.1000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 7 | \n",
+ " global | \n",
+ " 0.7000 | \n",
+ " 0.8000 | \n",
+ " 0.1000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 7 | \n",
+ " paired | \n",
+ " 0.6000 | \n",
+ " 0.8000 | \n",
+ " 0.2000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 8 | \n",
+ " global | \n",
+ " 0.4500 | \n",
+ " 0.8000 | \n",
+ " 0.3500 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 8 | \n",
+ " paired | \n",
+ " 0.5000 | \n",
+ " 0.5500 | \n",
+ " 0.0500 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 9 | \n",
+ " global | \n",
+ " 0.6000 | \n",
+ " 0.9500 | \n",
+ " 0.3500 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 9 | \n",
+ " paired | \n",
+ " 0.5500 | \n",
+ " 0.9000 | \n",
+ " 0.3500 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 10 | \n",
+ " global | \n",
+ " 0.6000 | \n",
+ " 0.9500 | \n",
+ " 0.3500 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 10 | \n",
+ " paired | \n",
+ " 0.7500 | \n",
+ " 0.9500 | \n",
+ " 0.2000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 11 | \n",
+ " global | \n",
+ " 0.6500 | \n",
+ " 0.9500 | \n",
+ " 0.3000 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 11 | \n",
+ " paired | \n",
+ " 0.8000 | \n",
+ " 0.9000 | \n",
+ " 0.1000 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 12 | \n",
+ " global | \n",
+ " 0.7500 | \n",
+ " 0.9500 | \n",
+ " 0.2000 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 12 | \n",
+ " paired | \n",
+ " 0.6500 | \n",
+ " 0.7000 | \n",
+ " 0.0500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "model participant task ProtoECGNet ProtoSSL difference\n",
+ "0 5 global 0.9000 1.0000 0.1000\n",
+ "1 5 paired 0.9000 1.0000 0.1000\n",
+ "2 7 global 0.7000 0.8000 0.1000\n",
+ "3 7 paired 0.6000 0.8000 0.2000\n",
+ "4 8 global 0.4500 0.8000 0.3500\n",
+ "5 8 paired 0.5000 0.5500 0.0500\n",
+ "6 9 global 0.6000 0.9500 0.3500\n",
+ "7 9 paired 0.5500 0.9000 0.3500\n",
+ "8 10 global 0.6000 0.9500 0.3500\n",
+ "9 10 paired 0.7500 0.9500 0.2000\n",
+ "10 11 global 0.6500 0.9500 0.3000\n",
+ "11 11 paired 0.8000 0.9000 0.1000\n",
+ "12 12 global 0.7500 0.9500 0.2000\n",
+ "13 12 paired 0.6500 0.7000 0.0500"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Proportion of \"good\" ratings per participant x task x model, then one column per model.\n",
+ "mean_good = yesno.groupby([\"participant\", \"task\", \"model\"])[\"good\"].mean()\n",
+ "\n",
+ "participant_summary = (\n",
+ "    mean_good.unstack(\"model\")\n",
+ "    .reset_index()\n",
+ "    # Positive values mean the participant rated ProtoSSL as good more often than ProtoECGNet.\n",
+ "    .assign(difference=lambda wide: wide[\"ProtoSSL\"] - wide[\"ProtoECGNet\"])\n",
+ ")\n",
+ "\n",
+ "participant_summary\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b6ff4f59",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " task | \n",
+ " n_participants | \n",
+ " ProtoSSL_mean | \n",
+ " ProtoECGNet_mean | \n",
+ " mean_difference | \n",
+ " ci95_low | \n",
+ " ci95_high | \n",
+ " wilcoxon_W | \n",
+ " wilcoxon_p | \n",
+ " paired_t_p | \n",
+ " sign_test_p | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " 7 | \n",
+ " 0.9143 | \n",
+ " 0.6643 | \n",
+ " 0.2500 | \n",
+ " 0.1432 | \n",
+ " 0.3568 | \n",
+ " 0.0000 | \n",
+ " 0.0156 | \n",
+ " 0.0012 | \n",
+ " 0.0156 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " paired | \n",
+ " 7 | \n",
+ " 0.8286 | \n",
+ " 0.6786 | \n",
+ " 0.1500 | \n",
+ " 0.0501 | \n",
+ " 0.2499 | \n",
+ " 0.0000 | \n",
+ " 0.0156 | \n",
+ " 0.0104 | \n",
+ " 0.0156 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " task n_participants ProtoSSL_mean ProtoECGNet_mean mean_difference ci95_low ci95_high wilcoxon_W wilcoxon_p paired_t_p sign_test_p\n",
+ "0 global 7 0.9143 0.6643 0.2500 0.1432 0.3568 0.0000 0.0156 0.0012 0.0156\n",
+ "1 paired 7 0.8286 0.6786 0.1500 0.0501 0.2499 0.0000 0.0156 0.0104 0.0156"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "def participant_level_analysis(df):\n",
+ "    \"\"\"Compare ProtoSSL vs ProtoECGNet across participants, separately for each task.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        One row per participant and task, with columns ``task``, ``ProtoSSL``,\n",
+ "        ``ProtoECGNet`` and ``difference`` (ProtoSSL minus ProtoECGNet).\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        One row per task (\"global\", \"paired\") with the number of participants, the mean\n",
+ "        of each model column, the mean difference with a t-based 95% CI, the two-sided\n",
+ "        exact Wilcoxon signed-rank statistic and p-value, and the paired t-test and\n",
+ "        sign-test p-values. The Wilcoxon and sign-test entries are NaN when there is no\n",
+ "        non-zero difference, because neither test is defined in that case.\n",
+ "    \"\"\"\n",
+ "    rows = []\n",
+ "\n",
+ "    for task in [\"global\", \"paired\"]:\n",
+ "        sub = df[df[\"task\"] == task].copy()\n",
+ "        diffs = sub[\"difference\"].to_numpy()\n",
+ "        n_nonzero = int(np.sum(diffs != 0))\n",
+ "\n",
+ "        if n_nonzero > 0:\n",
+ "            w = wilcoxon(diffs, alternative=\"two-sided\", zero_method=\"wilcox\", method=\"exact\")\n",
+ "            # Sign test: binomial test on the direction of the non-zero differences only.\n",
+ "            sign = binomtest(int(np.sum(diffs > 0)), n_nonzero, p=0.5, alternative=\"two-sided\")\n",
+ "            wilcoxon_w = float(w.statistic)\n",
+ "            wilcoxon_p = float(w.pvalue)\n",
+ "            sign_p = float(sign.pvalue)\n",
+ "        else:\n",
+ "            # Both tests discard zero differences, so with none left there is nothing to test.\n",
+ "            wilcoxon_w = wilcoxon_p = sign_p = float(\"nan\")\n",
+ "\n",
+ "        ttest = ttest_rel(sub[\"ProtoSSL\"], sub[\"ProtoECGNet\"])\n",
+ "\n",
+ "        # t-based 95% confidence interval for the mean paired difference.\n",
+ "        mean_diff = float(np.mean(diffs))\n",
+ "        sd_diff = float(np.std(diffs, ddof=1))\n",
+ "        se_diff = sd_diff / math.sqrt(len(diffs))\n",
+ "        tcrit = tdist.ppf(0.975, df=len(diffs) - 1)\n",
+ "        ci_low = mean_diff - tcrit * se_diff\n",
+ "        ci_high = mean_diff + tcrit * se_diff\n",
+ "\n",
+ "        rows.append(\n",
+ "            {\n",
+ "                \"task\": task,\n",
+ "                \"n_participants\": len(sub),\n",
+ "                \"ProtoSSL_mean\": sub[\"ProtoSSL\"].mean(),\n",
+ "                \"ProtoECGNet_mean\": sub[\"ProtoECGNet\"].mean(),\n",
+ "                \"mean_difference\": mean_diff,\n",
+ "                \"ci95_low\": ci_low,\n",
+ "                \"ci95_high\": ci_high,\n",
+ "                \"wilcoxon_W\": wilcoxon_w,\n",
+ "                \"wilcoxon_p\": wilcoxon_p,\n",
+ "                \"paired_t_p\": float(ttest.pvalue),\n",
+ "                \"sign_test_p\": sign_p,\n",
+ "            }\n",
+ "        )\n",
+ "\n",
+ "    return pd.DataFrame(rows)\n",
+ "\n",
+ "primary_results = participant_level_analysis(participant_summary)\n",
+ "primary_results\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c182260",
+ "metadata": {},
+ "source": [
+ "## Per-label participant-level summaries\n",
+ "\n",
+ "These summaries are useful to show **where** the overall pattern comes from, but they should be kept **descriptive only** in the paper: each label has only 5 cases, which is too few to support per-label inferential tests.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "53b57b45",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " task | \n",
+ " label | \n",
+ " ProtoSSL_mean | \n",
+ " ProtoECGNet_mean | \n",
+ " mean_difference | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " AMI | \n",
+ " 0.8571 | \n",
+ " 0.6571 | \n",
+ " 0.2000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " global | \n",
+ " CLBBB | \n",
+ " 0.9143 | \n",
+ " 0.9429 | \n",
+ " -0.0286 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " global | \n",
+ " CRBBB | \n",
+ " 0.9143 | \n",
+ " 0.5429 | \n",
+ " 0.3714 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " global | \n",
+ " PVC | \n",
+ " 0.9714 | \n",
+ " 0.5143 | \n",
+ " 0.4571 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " paired | \n",
+ " AMI | \n",
+ " 0.8000 | \n",
+ " 0.6571 | \n",
+ " 0.1429 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " paired | \n",
+ " CLBBB | \n",
+ " 0.7714 | \n",
+ " 0.9429 | \n",
+ " -0.1714 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " paired | \n",
+ " CRBBB | \n",
+ " 0.8571 | \n",
+ " 0.5429 | \n",
+ " 0.3143 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " paired | \n",
+ " PVC | \n",
+ " 0.8857 | \n",
+ " 0.5714 | \n",
+ " 0.3143 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " task label ProtoSSL_mean ProtoECGNet_mean mean_difference\n",
+ "0 global AMI 0.8571 0.6571 0.2000\n",
+ "1 global CLBBB 0.9143 0.9429 -0.0286\n",
+ "2 global CRBBB 0.9143 0.5429 0.3714\n",
+ "3 global PVC 0.9714 0.5143 0.4571\n",
+ "4 paired AMI 0.8000 0.6571 0.1429\n",
+ "5 paired CLBBB 0.7714 0.9429 -0.1714\n",
+ "6 paired CRBBB 0.8571 0.5429 0.3143\n",
+ "7 paired PVC 0.8857 0.5714 0.3143"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean `good` rating per (participant, task, label), spread to one column per\n",
+ "# model, with the within-participant ProtoSSL - ProtoECGNet difference appended.\n",
+ "participant_by_label = (\n",
+ "    yesno.groupby([\"participant\", \"task\", \"label\", \"model\"])[\"good\"]\n",
+ "    .mean()\n",
+ "    .unstack(\"model\")\n",
+ "    .reset_index()\n",
+ "    .assign(difference=lambda wide: wide[\"ProtoSSL\"] - wide[\"ProtoECGNet\"])\n",
+ ")\n",
+ "\n",
+ "# Average those participant-level values within each (task, label) pair.\n",
+ "per_label_summary = participant_by_label.groupby([\"task\", \"label\"], as_index=False).agg(\n",
+ "    ProtoSSL_mean=(\"ProtoSSL\", \"mean\"),\n",
+ "    ProtoECGNet_mean=(\"ProtoECGNet\", \"mean\"),\n",
+ "    mean_difference=(\"difference\", \"mean\"),\n",
+ ")\n",
+ "per_label_summary\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1dd8d16",
+ "metadata": {},
+ "source": [
+ "### Descriptive summaries for the comparative A/B/Both/Neither question"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "06bb6c2f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | preference | \n",
+ " task | \n",
+ " Both | \n",
+ " Neither | \n",
+ " ProtoECGNet | \n",
+ " ProtoSSL | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " 51 | \n",
+ " 0 | \n",
+ " 25 | \n",
+ " 64 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " paired | \n",
+ " 34 | \n",
+ " 6 | \n",
+ " 36 | \n",
+ " 64 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "preference task Both Neither ProtoECGNet ProtoSSL\n",
+ "0 global 51 0 25 64\n",
+ "1 paired 34 6 36 64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count responses for every (task, preference) pair, then spread the preference\n",
+ "# options into columns; pairs that never occur are filled with 0.\n",
+ "preference_counts = prefs.groupby([\"task\", \"preference\"]).size()\n",
+ "preference_overall = preference_counts.unstack(fill_value=0).reset_index()\n",
+ "preference_overall\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "a324d62e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | preference | \n",
+ " task | \n",
+ " label | \n",
+ " Both | \n",
+ " Neither | \n",
+ " ProtoECGNet | \n",
+ " ProtoSSL | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " AMI | \n",
+ " 16 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " global | \n",
+ " CLBBB | \n",
+ " 17 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " global | \n",
+ " CRBBB | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " global | \n",
+ " PVC | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " paired | \n",
+ " AMI | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 9 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " paired | \n",
+ " CLBBB | \n",
+ " 12 | \n",
+ " 2 | \n",
+ " 17 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " paired | \n",
+ " CRBBB | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " paired | \n",
+ " PVC | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "preference task label Both Neither ProtoECGNet ProtoSSL\n",
+ "0 global AMI 16 0 5 14\n",
+ "1 global CLBBB 17 0 10 8\n",
+ "2 global CRBBB 8 0 5 22\n",
+ "3 global PVC 10 0 5 20\n",
+ "4 paired AMI 8 3 9 15\n",
+ "5 paired CLBBB 12 2 17 4\n",
+ "6 paired CRBBB 8 0 5 22\n",
+ "7 paired PVC 6 1 5 23"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Same tabulation as the per-task counts, additionally broken down by label:\n",
+ "# one row per (task, label), one column per preference option, 0 where absent.\n",
+ "preference_counts_by_label = prefs.groupby([\"task\", \"label\", \"preference\"]).size()\n",
+ "preference_by_label = preference_counts_by_label.unstack(fill_value=0).reset_index()\n",
+ "preference_by_label\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0aba0956",
+ "metadata": {},
+ "source": [
+ "### Fleiss' kappa for the binary yes/no ratings\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "0d869989",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overall Fleiss' kappa: 0.2877\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/9j/f0qlzhxj2klgf3bxm77sqz300000gn/T/ipykernel_25474/3937073973.py:14: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ " .apply(fleiss_from_yesno)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " label | \n",
+ " fleiss_kappa | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " AMI | \n",
+ " 0.2023 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " CLBBB | \n",
+ " 0.0542 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " CRBBB | \n",
+ " 0.3000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " PVC | \n",
+ " 0.4000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " label fleiss_kappa\n",
+ "0 AMI 0.2023\n",
+ "1 CLBBB 0.0542\n",
+ "2 CRBBB 0.3000\n",
+ "3 PVC 0.4000"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def fleiss_from_yesno(df):\n",
+ "    \"\"\"Compute Fleiss' kappa for the binary `good` ratings in `df`.\n",
+ "\n",
+ "    Each distinct (case_id, task, model) combination is treated as one rated\n",
+ "    item; its row in the count table holds the number of 0 and 1 ratings.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Must contain the columns `case_id`, `task`, `model` and `good`,\n",
+ "        with `good` coded as 0/1.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    float\n",
+ "        The value returned by `statsmodels.stats.inter_rater.fleiss_kappa`.\n",
+ "\n",
+ "    Raises\n",
+ "    ------\n",
+ "    ValueError\n",
+ "        If `df` contains no rated items, or if the items do not all have the\n",
+ "        same number of ratings (Fleiss' kappa assumes a constant rater count).\n",
+ "    \"\"\"\n",
+ "    table = np.asarray(\n",
+ "        [\n",
+ "            g[\"good\"].value_counts().reindex([0, 1], fill_value=0).to_numpy()\n",
+ "            for _, g in df.groupby([\"case_id\", \"task\", \"model\"])\n",
+ "        ]\n",
+ "    )\n",
+ "    if table.size == 0:\n",
+ "        raise ValueError(\"No rated items found; cannot compute Fleiss' kappa.\")\n",
+ "\n",
+ "    # A missing or duplicated rating would otherwise silently distort kappa.\n",
+ "    ratings_per_item = table.sum(axis=1)\n",
+ "    if ratings_per_item.min() != ratings_per_item.max():\n",
+ "        raise ValueError(\n",
+ "            \"Fleiss' kappa requires the same number of ratings for every item; \"\n",
+ "            f\"found between {ratings_per_item.min()} and {ratings_per_item.max()}.\"\n",
+ "        )\n",
+ "    return float(fleiss_kappa(table))\n",
+ "\n",
+ "overall_kappa = fleiss_from_yesno(yesno)\n",
+ "\n",
+ "# Iterate over the label groups explicitly rather than `groupby(...).apply(...)`:\n",
+ "# apply operating on the grouping columns is deprecated in pandas and warned here.\n",
+ "kappa_by_label = pd.DataFrame(\n",
+ "    [\n",
+ "        {\"label\": label, \"fleiss_kappa\": fleiss_from_yesno(label_ratings)}\n",
+ "        for label, label_ratings in yesno.groupby(\"label\")\n",
+ "    ],\n",
+ "    columns=[\"label\", \"fleiss_kappa\"],\n",
+ ")\n",
+ "\n",
+ "print(\"Overall Fleiss' kappa:\", round(overall_kappa, 4))\n",
+ "kappa_by_label\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80bfa9be",
+ "metadata": {},
+ "source": [
+ "### Compact summary tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "64519600",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " task | \n",
+ " n_participants | \n",
+ " ProtoSSL_mean | \n",
+ " ProtoECGNet_mean | \n",
+ " mean_difference | \n",
+ " ci95_low | \n",
+ " ci95_high | \n",
+ " wilcoxon_W | \n",
+ " wilcoxon_p | \n",
+ " paired_t_p | \n",
+ " sign_test_p | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " 7 | \n",
+ " 0.9143 | \n",
+ " 0.6643 | \n",
+ " 0.2500 | \n",
+ " 0.1432 | \n",
+ " 0.3568 | \n",
+ " 0.0000 | \n",
+ " 0.0156 | \n",
+ " 0.0012 | \n",
+ " 0.0156 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " paired | \n",
+ " 7 | \n",
+ " 0.8286 | \n",
+ " 0.6786 | \n",
+ " 0.1500 | \n",
+ " 0.0501 | \n",
+ " 0.2499 | \n",
+ " 0.0000 | \n",
+ " 0.0156 | \n",
+ " 0.0104 | \n",
+ " 0.0156 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " task n_participants ProtoSSL_mean ProtoECGNet_mean mean_difference ci95_low ci95_high wilcoxon_W wilcoxon_p paired_t_p sign_test_p\n",
+ "0 global 7 0.9143 0.6643 0.2500 0.1432 0.3568 0.0000 0.0156 0.0012 0.0156\n",
+ "1 paired 7 0.8286 0.6786 0.1500 0.0501 0.2499 0.0000 0.0156 0.0104 0.0156"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Round the reported means, CI bounds and p-values to four decimals on a copy;\n",
+ "# n_participants and wilcoxon_W are left exactly as computed.\n",
+ "rounded_cols = [\"ProtoSSL_mean\", \"ProtoECGNet_mean\", \"mean_difference\", \"ci95_low\", \"ci95_high\", \"wilcoxon_p\", \"paired_t_p\", \"sign_test_p\"]\n",
+ "primary_results_rounded = primary_results.copy()\n",
+ "primary_results_rounded[rounded_cols] = primary_results_rounded[rounded_cols].round(4)\n",
+ "primary_results_rounded\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "67c346fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | model | \n",
+ " task | \n",
+ " label | \n",
+ " ProtoECGNet | \n",
+ " ProtoSSL | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " global | \n",
+ " AMI | \n",
+ " 23/35 (65.7%) | \n",
+ " 30/35 (85.7%) | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " global | \n",
+ " CLBBB | \n",
+ " 33/35 (94.3%) | \n",
+ " 32/35 (91.4%) | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " global | \n",
+ " CRBBB | \n",
+ " 19/35 (54.3%) | \n",
+ " 32/35 (91.4%) | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " global | \n",
+ " PVC | \n",
+ " 18/35 (51.4%) | \n",
+ " 34/35 (97.1%) | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " paired | \n",
+ " AMI | \n",
+ " 23/35 (65.7%) | \n",
+ " 28/35 (80.0%) | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " paired | \n",
+ " CLBBB | \n",
+ " 33/35 (94.3%) | \n",
+ " 27/35 (77.1%) | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " paired | \n",
+ " CRBBB | \n",
+ " 19/35 (54.3%) | \n",
+ " 30/35 (85.7%) | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " paired | \n",
+ " PVC | \n",
+ " 20/35 (57.1%) | \n",
+ " 31/35 (88.6%) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "model task label ProtoECGNet ProtoSSL\n",
+ "0 global AMI 23/35 (65.7%) 30/35 (85.7%)\n",
+ "1 global CLBBB 33/35 (94.3%) 32/35 (91.4%)\n",
+ "2 global CRBBB 19/35 (54.3%) 32/35 (91.4%)\n",
+ "3 global PVC 18/35 (51.4%) 34/35 (97.1%)\n",
+ "4 paired AMI 23/35 (65.7%) 28/35 (80.0%)\n",
+ "5 paired CLBBB 33/35 (94.3%) 27/35 (77.1%)\n",
+ "6 paired CRBBB 19/35 (54.3%) 30/35 (85.7%)\n",
+ "7 paired PVC 20/35 (57.1%) 31/35 (88.6%)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Render each (task, label, model) row as `n_yes/n_total (pct%)`, then pivot so\n",
+ "# the two models sit side by side for every (task, label).\n",
+ "label_yesno_pivot = (\n",
+ "    label_yesno.assign(\n",
+ "        summary=lambda counts: (\n",
+ "            counts[\"n_yes\"].astype(str)\n",
+ "            + \"/\"\n",
+ "            + counts[\"n_total\"].astype(str)\n",
+ "            + \" (\"\n",
+ "            + (100 * counts[\"proportion\"]).round(1).astype(str)\n",
+ "            + \"%)\"\n",
+ "        )\n",
+ "    )\n",
+ "    .loc[:, [\"task\", \"label\", \"model\", \"summary\"]]\n",
+ "    .pivot(index=[\"task\", \"label\"], columns=\"model\", values=\"summary\")\n",
+ "    .reset_index()\n",
+ ")\n",
+ "label_yesno_pivot\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "51807e2d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "900aa255",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ecg_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}