import React from 'react';
import { motion } from 'framer-motion';
import { Link } from 'react-router-dom';
import { AuthorSection } from '../shared/Shared';
import { PageMetadata } from '../shared/PageMetadata';
import { ARTICLE_DATES } from '../../constants/dates';

// Byline entries for the article: display name plus profile URL per author.
const externalValidationAuthors = [
  { name: 'Pranav Rajpurkar', url: 'https://www.linkedin.com/in/pranavrajpurkar/' },
  { name: 'Samir Rajpurkar', url: 'https://www.linkedin.com/in/samir-rajpurkar-79291396/' },
];

// A single date constant backs both the published and the modified date.
const externalValidationDate = ARTICLE_DATES.EXTERNAL_VALIDATION;

/**
 * Metadata for the external-validation article. The object is spread onto
 * <PageMetadata>, and `title` is additionally rendered as the page heading.
 */
const pageMetadata = {
  title: 'a2z-1 External Validation: Rigorous Testing in Real-World Settings',
  description: 'Explore the comprehensive external validation results of a2z-1, demonstrating its robust performance across multiple healthcare institutions and diverse patient populations.',
  type: 'article',
  publishedDate: externalValidationDate,
  modifiedDate: externalValidationDate,
  authors: externalValidationAuthors,
};

/**
 * Article page presenting the external validation results of a2z-1.
 *
 * Renders the page metadata, an animated title, the author byline, the article
 * body (dataset description, AUC explanation, per-condition highlights) and a
 * link back to the home route.
 *
 * The `openCalendly` prop remains part of the component's props type so existing
 * callers keep type-checking, but this page does not invoke it.
 */
const A2ZExternalValidation: React.FC<{ openCalendly: () => void }> = () => {
  return (
    <div className="min-h-screen bg-white dark:bg-dark text-gray-900 dark:text-gray-100">
      <PageMetadata {...pageMetadata} />

      <main className="max-w-4xl mx-auto py-16 px-4 sm:px-6 lg:px-8">
        {/* The heading reuses the metadata title so the two cannot drift apart. */}
        <motion.h1
          className="text-4xl sm:text-5xl font-bold mb-8 text-primary dark:text-accent"
          initial={{ opacity: 0, y: 20 }}
          animate={{ opacity: 1, y: 0 }}
          transition={{ duration: 0.6 }}
        >
          {pageMetadata.title}
        </motion.h1>

        {/* NOTE(review): this display date is hard-coded separately from
            ARTICLE_DATES.EXTERNAL_VALIDATION used in pageMetadata — confirm the two agree. */}
        <AuthorSection date="October 21, 2024" />

        {/* NOTE(review): "text-md" is not one of Tailwind's default font-size utilities
            (the default scale uses "text-base"); confirm it is defined in the project
            theme, otherwise it has no effect. */}
        <motion.div
          className="text-md sm:text-xl space-y-6 text-gray-800 dark:text-gray-200"
          initial={{ opacity: 0 }}
          animate={{ opacity: 1 }}
          transition={{ duration: 0.6, delay: 0.4 }}
        >
          <p>
            a2z-1 stands as the best-in-class AI model for abdomen-pelvis CT imaging, offering unmatched analytical capabilities across 21 critical conditions. In this post, we delve into why a2z-1 leads the field, how it has been rigorously validated across three distinct health systems in a large-scale study involving over 14,000 CT scans, and what makes it a groundbreaking tool in radiology.
          </p>

          {/* Keep this list at 21 unique findings and in sync with the conditions
              whose AUCs are reported further down (colitis is one of them). */}
          <p>
            <strong>The 21 targeted findings are:</strong> small bowel obstruction, retroperitoneal hemorrhage, hepatic steatosis, unruptured aortic aneurysm, appendicitis, cholecystitis, diverticulitis, biliary ductal dilatation, coronary artery calcification, hydronephrosis, splenomegaly, hiatal hernia, obstructive kidney stone, pneumonia, pyelonephritis, cirrhosis, acute pancreatitis, free air, abscess, aortic dissection, and colitis.
          </p>

          <h2 className="text-2xl font-bold mt-8 mb-4">Evaluating a2z-1 Across Three Health Systems</h2>

          <p>
            To ensure that a2z-1 performs effectively across different clinical environments, we evaluated its capabilities using retrospective data from three health systems: one internal validation dataset and two independent external datasets. The external datasets represent entirely distinct health systems, ensuring the model was thoroughly tested in diverse settings.
          </p>

          {/* The three study counts below sum to the 14,667 total quoted afterwards. */}
          <ul className="list-disc pl-6 space-y-2">
            <li><strong>Internal Validation (AUC1):</strong> This dataset comprised 9,223 distinct studies, with no overlapping patients, from the same health system as the training data. It was set aside for testing purposes, providing a baseline measure of performance.</li>
            <li><strong>External Validation (AUC2):</strong> This dataset included 1,860 studies from a different health system, using unique imaging protocols and representing distinct patient demographics.</li>
            <li><strong>External Validation (AUC3):</strong> The second external dataset consisted of 3,584 studies from yet another health system, further diversifying the evaluation.</li>
          </ul>

          <p>
            In total, our validation efforts encompassed 14,667 CT studies across three distinct health systems, providing a comprehensive and large-scale evaluation of a2z-1's performance and generalizability.
          </p>

          <img
            src="/figs/validation/external/auc_performance_comparison.png"
            alt="AUC Performance Comparison"
            className="my-8 mx-auto rounded-lg"
          />

          <h2 className="text-2xl font-bold mt-8 mb-4">Why AUC Matters: A Consistent, Understandable Metric</h2>

          <p>
            We use the Area Under the Receiver Operating Characteristic Curve (AUC) to evaluate a2z-1's ability to distinguish between cases with and without a specific condition. AUC values range from 0.5 (equivalent to random guessing) to 1.0 (perfect discrimination). This metric helps us ensure that the model can effectively differentiate cases, regardless of the prevalence or subtlety of the condition.
          </p>

          <p>
            Interestingly, we observed that in some cases, a2z-1 performed even better on external validation datasets compared to the internal dataset. This can happen when the external data aligns better with the model's learned features, suggesting that differences in patient populations or case characteristics might make certain findings easier to detect. Symmetrically, a drop in performance in external datasets does not necessarily indicate poor generalization; it could simply reflect more challenging case mixes or different imaging conditions. This highlights the complexity of evaluating AI performance in real-world settings and underscores the importance of thorough, multi-site validation.
          </p>

          <h2 className="text-2xl font-bold mt-8 mb-4">Performance Highlights: Robustness Across 21 Conditions</h2>

          <p>
            The large-scale validation efforts provided a clear view of a2z-1's strong and consistent performance across all three datasets, especially for time-sensitive and critical findings. Here are some of the key highlights:
          </p>

          <h3 className="text-xl font-semibold mt-6 mb-3">High Performance for Critical Findings</h3>

          <ul className="list-disc pl-6 space-y-2">
            <li><strong>Small Bowel Obstruction:</strong> The model showed AUCs of 0.979 (internal), 0.981 (external site 1), and 0.947 (external site 2).</li>
            <li><strong>Acute Pancreatitis:</strong> A2z-1 achieved AUCs of 0.961 (internal), 0.944 (external site 1), and 0.972 (external site 2).</li>
            <li><strong>Unruptured Aortic Aneurysm:</strong> AUC values were 0.970 (internal), 0.988 (external site 1), and 0.954 (external site 2).</li>
            <li><strong>Retroperitoneal Hemorrhage:</strong> A2z-1 demonstrated strong performance with AUCs of 0.952 (internal), 0.991 (external site 1), and 0.952 (external site 2), highlighting its consistent ability to detect this critical condition across different settings.</li>
          </ul>

          <h3 className="text-xl font-semibold mt-6 mb-3">Consistency in Common Abdominal Findings</h3>

          <ul className="list-disc pl-6 space-y-2">
            <li><strong>Appendicitis:</strong> The AUC scores of 0.941 (internal), 0.948 (external site 1), and 0.931 (external site 2) highlight the model's ability to reliably detect this common condition.</li>
            <li><strong>Gallbladder: Cholecystitis:</strong> AUC values ranged from 0.928 (internal) to 0.965 (external site 1), suggesting that certain clinical features or imaging techniques may enhance the model's detection capabilities at specific sites.</li>
            <li><strong>Hydronephrosis:</strong> The model achieved high AUC values of 0.963 (internal), 0.962 (external site 1), and 0.955 (external site 2), indicating consistent detection performance for this renal condition across all datasets.</li>
            <li><strong>Colitis:</strong> AUC values for colitis detection were 0.872 (internal), 0.854 (external site 1), and 0.852 (external site 2), showing reliable, albeit slightly lower, performance compared to other conditions, which could be influenced by variability in imaging presentations of colitis.</li>
          </ul>

          <h2 className="text-2xl font-bold mt-8 mb-4">Utilizing the Insights from a2z-1</h2>

          <p>
            The multi-disease approach of a2z-1 offers unique opportunities to improve analytical depth in abdomen-pelvis CT analysis. Here's how these insights can benefit radiology departments and broader healthcare initiatives:
          </p>

          <h3 className="text-xl font-semibold mt-6 mb-3">Quality Assurance and Practice Improvement</h3>

          <ul className="list-disc pl-6 space-y-2">
            <li><strong>Identifying Patterns and Trends:</strong> By analyzing imaging data across 21 findings, a2z-1 helps radiology teams identify patterns and trends.</li>
            <li><strong>Broad-Based Analytics:</strong> With insights into 21 specific conditions, a2z-1 provides radiology departments with valuable analytics, enabling them to benchmark their performance across these conditions and identify areas for improvement.</li>
          </ul>

          <h2 className="text-2xl font-bold mt-8 mb-4">More to Come: Deeper Insights and Analysis</h2>

          <p>
            This post represents Part 1 of our comprehensive analysis of a2z-1's performance. In upcoming posts, we'll delve deeper into:
          </p>

          <ul className="list-disc pl-6 space-y-2">
            <li><strong>Subgroup Analysis:</strong> We'll explore how a2z-1 performs across different patient demographics, imaging protocols, and clinical scenarios.</li>
            <li><strong>AI-Assisted Analytics:</strong> We'll explore how a2z-1's analysis can provide valuable insights to support quality improvement initiatives and enhance overall radiology practice performance.</li>
          </ul>

          <p>
            Stay tuned for these upcoming analyses that will further demonstrate the capabilities and potential impact of a2z-1 in clinical practice.
          </p>

          <h2 className="text-2xl font-bold mt-8 mb-4">Conclusion</h2>

          <p>
            The large-scale evaluation of a2z-1 across three health systems, involving over 14,000 CT studies, demonstrates how a well-designed AI model can deliver powerful analytical insights into abdomen-pelvis CT imaging. Our focus on multi-disease analysis provides radiology departments with valuable data that can enhance quality assurance, support benchmarking, and foster continuous improvement.
          </p>

          <p>
            As we continue to evolve and validate a2z-1, we look forward to helping radiology departments leverage AI to create a safer, more reliable future for imaging.
          </p>
        </motion.div>

        {/* Footer navigation fades in last (longest delay of the three animations). */}
        <motion.div
          className="mt-12"
          initial={{ opacity: 0 }}
          animate={{ opacity: 1 }}
          transition={{ duration: 0.6, delay: 1.8 }}
        >
          <Link to="/" className="text-primary dark:text-accent hover:underline">
            &larr; Back to Home
          </Link>
        </motion.div>
      </main>
    </div>
  );
};

export default A2ZExternalValidation;
