SreekarB committed on
Commit c48ac75 · verified · 1 Parent(s): d526dee

Upload 20 files

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🧠
  colorFrom: blue
  colorTo: pink
  sdk: gradio
- sdk_version: 5.20.1
+ sdk_version: 3.36.1
  app_file: app.py
  pinned: false
  ---
app.py CHANGED
@@ -7,7 +7,7 @@ from data_preprocessing import preprocess_fmri_to_fc, process_single_fmri
7
  from visualization import plot_fc_matrices, plot_learning_curves
8
  import os
9
  import glob
10
- from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
11
  import json
12
  import pickle
13
  import pandas as pd
@@ -24,6 +24,7 @@ class AphasiaPredictionApp:
24
  self.predictor = None
25
  self.trained = False
26
  self.latent_dim = MODEL_CONFIG['latent_dim']
 
27
 
28
  def train_models(self, data_dir, latent_dim, nepochs, bsize):
29
  """
@@ -34,9 +35,8 @@ class AphasiaPredictionApp:
34
  logger.info(f"VAE params: latent_dim={latent_dim}, epochs={nepochs}, batch_size={bsize}")
35
 
36
  # Default prediction parameters from our config
37
- prediction_type = PREDICTION_CONFIG.get('prediction_type', 'regression')
38
  outcome_variable = PREDICTION_CONFIG.get('default_outcome', 'wab_aq')
39
- logger.info(f"Prediction: type={prediction_type}, outcome={outcome_variable}")
40
 
41
  figures = {}
42
 
@@ -323,6 +323,8 @@ class AphasiaPredictionApp:
323
  try:
324
  real_treatment_file = process_behavioral_data_to_outcomes(csv_path)
325
  treatment_file = real_treatment_file # Use the real treatment file if processing succeeded
326
  logger.info(f"Using processed behavioral data for treatment outcomes")
327
  except Exception as proc_err:
328
  logger.warning(f"Couldn't process behavioral data: {proc_err}, using standard outcomes")
@@ -338,6 +340,8 @@ class AphasiaPredictionApp:
338
 
339
  # Use the found file
340
  treatment_file = real_treatment_file
341
  logger.info(f"Using real treatment outcomes file")
342
  except Exception as find_err:
343
  logger.warning(f"Couldn't find treatment outcomes file: {find_err}, using standard outcomes")
@@ -754,54 +758,50 @@ class AphasiaPredictionApp:
754
  # Plot predicted vs actual values
755
  ax1 = fig.add_subplot(gs[0, 0])
756
 
757
- if self.predictor.prediction_type == 'regression':
758
- # Regression: scatter plot
759
- ax1.scatter(y_true, y_pred, alpha=0.7)
760
-
761
- # Add perfect prediction line
762
- min_val = min(np.min(y_true), np.min(y_pred))
763
- max_val = max(np.max(y_true), np.max(y_pred))
764
- ax1.plot([min_val, max_val], [min_val, max_val], 'r--')
765
-
766
- ax1.set_xlabel('Actual Values')
767
- ax1.set_ylabel('Predicted Values')
768
- ax1.set_title('Predicted vs. Actual Values')
769
-
770
- # Add R² to the plot
771
- r2 = r2_score(y_true, y_pred)
772
- ax1.text(0.05, 0.95, f'R² = {r2:.4f}', transform=ax1.transAxes,
773
- bbox=dict(facecolor='white', alpha=0.5))
774
-
775
- # Plot residuals
776
- ax2 = fig.add_subplot(gs[0, 1])
777
- residuals = y_true - y_pred
778
- ax2.scatter(y_pred, residuals, alpha=0.7)
779
- ax2.axhline(y=0, color='r', linestyle='--')
780
- ax2.set_xlabel('Predicted Values')
781
- ax2.set_ylabel('Residuals')
782
- ax2.set_title('Residual Plot')
783
-
784
- # Plot prediction errors
785
- ax3 = fig.add_subplot(gs[1, 0])
786
- ax3.errorbar(range(len(y_pred)), y_pred, yerr=2*y_std, fmt='o', alpha=0.7,
787
- label='Predicted ± 2σ')
788
- ax3.plot(range(len(y_true)), y_true, 'rx', alpha=0.7, label='Actual')
789
- ax3.set_xlabel('Sample Index')
790
- ax3.set_ylabel('Value')
791
- ax3.set_title('Prediction with Error Bars')
792
- ax3.legend()
793
-
794
- # Plot error distribution
795
- ax4 = fig.add_subplot(gs[1, 1])
796
- ax4.hist(residuals, bins=20, alpha=0.7)
797
- ax4.axvline(x=0, color='r', linestyle='--')
798
- ax4.set_xlabel('Prediction Error')
799
- ax4.set_ylabel('Frequency')
800
- ax4.set_title('Error Distribution')
801
-
802
- else: # classification
803
- # Convert to integer classes if they're strings
804
- if isinstance(y_true[0], str) or isinstance(y_pred[0], str):
805
  # Create mapping of class labels to integers
806
  classes = sorted(list(set(list(y_true) + list(y_pred))))
807
  class_to_int = {c: i for i, c in enumerate(classes)}
@@ -911,76 +911,39 @@ class AphasiaPredictionApp:
911
  """Create learning curve plots from cross-validation results"""
912
  fig = plt.figure(figsize=(12, 6))
913
 
914
- # Create a grid for plots
915
- if self.predictor.prediction_type == 'regression':
916
- # For regression, show R² and RMSE
917
- ax1 = plt.subplot(1, 2, 1)
918
- ax2 = plt.subplot(1, 2, 2)
919
-
920
- # Plot R² for each fold
921
- for i, metrics in enumerate(fold_metrics):
922
- ax1.plot(i+1, metrics['r2'], 'bo')
923
-
924
- # Plot average
925
- avg_r2 = np.mean([m['r2'] for m in fold_metrics])
926
- ax1.axhline(y=avg_r2, color='r', linestyle='--',
927
- label=f'Average R² = {avg_r2:.4f}')
928
-
929
- ax1.set_xlabel('Fold')
930
- ax1.set_ylabel('R²')
931
- ax1.set_title('R² by Fold')
932
- ax1.set_xticks(range(1, len(fold_metrics)+1))
933
- ax1.legend()
934
-
935
- # Plot RMSE for each fold
936
- for i, metrics in enumerate(fold_metrics):
937
- ax2.plot(i+1, metrics['rmse'], 'go')
938
-
939
- # Plot average RMSE
940
- avg_rmse = np.mean([m['rmse'] for m in fold_metrics])
941
- ax2.axhline(y=avg_rmse, color='r', linestyle='--',
942
- label=f'Average RMSE = {avg_rmse:.4f}')
943
-
944
- ax2.set_xlabel('Fold')
945
- ax2.set_ylabel('RMSE')
946
- ax2.set_title('RMSE by Fold')
947
- ax2.set_xticks(range(1, len(fold_metrics)+1))
948
- ax2.legend()
949
-
950
- else: # classification
951
- # For classification, show accuracy and F1
952
- ax1 = plt.subplot(1, 2, 1)
953
- ax2 = plt.subplot(1, 2, 2)
954
-
955
- # Plot accuracy for each fold
956
- for i, metrics in enumerate(fold_metrics):
957
- ax1.plot(i+1, metrics['accuracy'], 'bo')
958
-
959
- # Plot average accuracy
960
- avg_acc = np.mean([m['accuracy'] for m in fold_metrics])
961
- ax1.axhline(y=avg_acc, color='r', linestyle='--',
962
- label=f'Average Accuracy = {avg_acc:.4f}')
963
-
964
- ax1.set_xlabel('Fold')
965
- ax1.set_ylabel('Accuracy')
966
- ax1.set_title('Accuracy by Fold')
967
- ax1.set_xticks(range(1, len(fold_metrics)+1))
968
- ax1.legend()
969
-
970
- # Plot F1 for each fold
971
- for i, metrics in enumerate(fold_metrics):
972
- ax2.plot(i+1, metrics['f1'], 'go')
973
-
974
- # Plot average F1
975
- avg_f1 = np.mean([m['f1'] for m in fold_metrics])
976
- ax2.axhline(y=avg_f1, color='r', linestyle='--',
977
- label=f'Average F1 = {avg_f1:.4f}')
978
-
979
- ax2.set_xlabel('Fold')
980
- ax2.set_ylabel('F1 Score')
981
- ax2.set_title('F1 Score by Fold')
982
- ax2.set_xticks(range(1, len(fold_metrics)+1))
983
- ax2.legend()
984
 
985
  plt.tight_layout()
986
  return fig
@@ -1350,6 +1313,8 @@ def process_behavioral_data_to_outcomes(behavioral_file):
1350
  outcomes_df = pd.DataFrame(outcome_data)
1351
  outcomes_df.to_csv(outcomes_file, index=False)
1352
  logger.info(f"Created treatment outcomes file with {len(outcomes_df)} patients")
 
 
1353
  return outcomes_file
1354
  else:
1355
  # If we couldn't extract outcomes per patient, try a simpler approach
@@ -1375,6 +1340,8 @@ def process_behavioral_data_to_outcomes(behavioral_file):
1375
  ])
1376
  outcomes_df.to_csv(outcomes_file, index=False)
1377
  logger.warning(f"Created simplified treatment outcomes with group improvement: {improvement:.2f}")
 
 
1378
  return outcomes_file
1379
  except Exception as e:
1380
  logger.error(f"Could not create even simplified outcomes: {e}")
@@ -1858,8 +1825,8 @@ def create_interface():
1858
  gr.Markdown("# Aphasia Treatment Trajectory Prediction")
1859
 
1860
  with gr.Tabs():
1861
- # Training Tab
1862
- with gr.Tab("Train Models"):
1863
  with gr.Row():
1864
  with gr.Column(scale=1):
1865
  data_dir = gr.Textbox(
@@ -1889,39 +1856,72 @@ def create_interface():
1889
  use_hf_dataset = gr.Checkbox(
1890
  label="Use HuggingFace Dataset", value=True
1891
  )
1892
- with gr.Group("Prediction Options"):
1893
- prediction_type = gr.Radio(
1894
- label="Prediction Type",
1895
- choices=["regression", "classification"],
1896
- value="regression"
1897
- )
1898
- outcome_variable = gr.Dropdown(
1899
- label="Outcome Variable",
1900
- choices=["wab_aq", "age", "mpo", "education"],
1901
- value="wab_aq"
1902
- )
1903
  skip_behavioral = gr.Checkbox(
1904
  label="Skip Behavioral Data Processing",
1905
  value=PREDICTION_CONFIG.get('skip_behavioral_data', True),
1906
  info="Use pre-defined treatment outcomes instead of processing behavioral data"
1907
  )
1908
-
1909
- with gr.Accordion("Advanced Data Options", open=False):
1910
- use_synthetic_nifti = gr.Checkbox(
1911
- label="Use Synthetic NIfTI Data",
1912
- value=PREDICTION_CONFIG.get('use_synthetic_nifti', False),
1913
- info="Generate synthetic NIfTI files if real ones aren't found"
1914
- )
1915
- use_synthetic_fc = gr.Checkbox(
1916
- label="Use Synthetic FC Matrices",
1917
- value=PREDICTION_CONFIG.get('use_synthetic_fc', False),
1918
- info="Generate synthetic FC matrices if processing fails"
1919
- )
1920
 
1921
- train_btn = gr.Button("Train Models", variant="primary")
1922
 
1923
  with gr.Row():
1924
- fc_plot = gr.Plot(label="FC Analysis")
1925
 
1926
  with gr.Row():
1927
  with gr.Column(scale=1):
@@ -1930,13 +1930,18 @@ def create_interface():
1930
  prediction_plot = gr.Plot(label="Prediction Performance")
1931
 
1932
  with gr.Row():
1933
- learning_plot = gr.Plot(label="Cross-validation Results")
1934
 
1935
- # Prediction Tab
1936
- with gr.Tab("Predict Treatment"):
1937
  with gr.Row():
1938
  with gr.Column(scale=1):
1939
- fmri_file = gr.File(label="Patient fMRI Data")
1940
  with gr.Column(scale=1):
1941
  with gr.Group("Patient Demographics"):
1942
  age = gr.Number(label="Age at Stroke", value=60)
@@ -1952,21 +1957,23 @@ def create_interface():
1952
  with gr.Row():
1953
  trajectory_plot = gr.Plot(label="Predicted Treatment Trajectory")
1954
 
1955
- # Connect components
1956
- train_outputs = {
1957
- 'vae': fc_plot,
1958
- 'importance': importance_plot,
1959
- 'prediction': prediction_plot,
1960
- 'learning': learning_plot
1961
  }
1962
 
1963
- # Handle train button click
1964
- def handle_train(data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
1965
- prediction_type, outcome_variable, skip_behavioral,
1966
- use_synthetic_nifti, use_synthetic_fc):
1967
- # Set prediction config values for this run
1968
- PREDICTION_CONFIG['prediction_type'] = prediction_type
1969
- PREDICTION_CONFIG['default_outcome'] = outcome_variable
1970
  PREDICTION_CONFIG['skip_behavioral_data'] = skip_behavioral
1971
  PREDICTION_CONFIG['use_synthetic_nifti'] = use_synthetic_nifti
1972
  PREDICTION_CONFIG['use_synthetic_fc'] = use_synthetic_fc
@@ -1978,36 +1985,271 @@ def create_interface():
1978
  else:
1979
  PREDICTION_CONFIG['local_nii_dir'] = None
1980
 
1981
- # Log helpful information for the user
1982
- logger.info(f"Looking for data in directory: {data_dir}")
1983
- logger.info(f"Expected files: FC_graph_covariate_data.csv and treatment_outcomes.csv")
1984
- logger.info(f"Prediction type: {prediction_type}, target: {outcome_variable}")
1985
 
1986
- results = app.train_models(
1987
- data_dir=data_dir,
1988
- latent_dim=latent_dim,
1989
- nepochs=nepochs,
1990
- bsize=bsize
1991
- )
1992
 
1993
- # Return plots in the expected order
1994
- return [
1995
- results.get('vae', None),
1996
- results.get('importance', None),
1997
- results.get('prediction', None),
1998
- results.get('learning', None)
1999
- ]
2000
-
2001
- train_btn.click(
2002
- fn=handle_train,
2003
  inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
2004
- prediction_type, outcome_variable, skip_behavioral,
2005
- use_synthetic_nifti, use_synthetic_fc],
2006
- outputs=[fc_plot, importance_plot, prediction_plot, learning_plot]
2007
  )
2008
2009
  predict_btn.click(
2010
- fn=app.predict_treatment,
2011
  inputs=[fmri_file, age, sex, months, wab],
2012
  outputs=[prediction_text, trajectory_plot]
2013
  )
 
7
  from visualization import plot_fc_matrices, plot_learning_curves
8
  import os
9
  import glob
10
+ from sklearn.metrics import mean_squared_error, r2_score
11
  import json
12
  import pickle
13
  import pandas as pd
 
24
  self.predictor = None
25
  self.trained = False
26
  self.latent_dim = MODEL_CONFIG['latent_dim']
27
+ self.last_treatment_file = None # Track the last treatment file used
28
 
29
  def train_models(self, data_dir, latent_dim, nepochs, bsize):
30
  """
 
35
  logger.info(f"VAE params: latent_dim={latent_dim}, epochs={nepochs}, batch_size={bsize}")
36
 
37
  # Default prediction parameters from our config
 
38
  outcome_variable = PREDICTION_CONFIG.get('default_outcome', 'wab_aq')
39
+ logger.info(f"Prediction: type=regression, outcome={outcome_variable}")
40
 
41
  figures = {}
42
 
 
323
  try:
324
  real_treatment_file = process_behavioral_data_to_outcomes(csv_path)
325
  treatment_file = real_treatment_file # Use the real treatment file if processing succeeded
326
+ # Store the treatment file path for later use
327
+ self.last_treatment_file = treatment_file
328
  logger.info(f"Using processed behavioral data for treatment outcomes")
329
  except Exception as proc_err:
330
  logger.warning(f"Couldn't process behavioral data: {proc_err}, using standard outcomes")
 
340
 
341
  # Use the found file
342
  treatment_file = real_treatment_file
343
+ # Store the treatment file path for later use
344
+ self.last_treatment_file = treatment_file
345
  logger.info(f"Using real treatment outcomes file")
346
  except Exception as find_err:
347
  logger.warning(f"Couldn't find treatment outcomes file: {find_err}, using standard outcomes")
 
758
  # Plot predicted vs actual values
759
  ax1 = fig.add_subplot(gs[0, 0])
760
 
761
+ # Regression plots
762
+ # Scatter plot
763
+ ax1.scatter(y_true, y_pred, alpha=0.7)
764
+
765
+ # Add perfect prediction line
766
+ min_val = min(np.min(y_true), np.min(y_pred))
767
+ max_val = max(np.max(y_true), np.max(y_pred))
768
+ ax1.plot([min_val, max_val], [min_val, max_val], 'r--')
769
+
770
+ ax1.set_xlabel('Actual Values')
771
+ ax1.set_ylabel('Predicted Values')
772
+ ax1.set_title('Predicted vs. Actual Values')
773
+
774
+ # Add R² to the plot
775
+ r2 = r2_score(y_true, y_pred)
776
+ ax1.text(0.05, 0.95, f'R² = {r2:.4f}', transform=ax1.transAxes,
777
+ bbox=dict(facecolor='white', alpha=0.5))
778
+
779
+ # Plot residuals
780
+ ax2 = fig.add_subplot(gs[0, 1])
781
+ residuals = y_true - y_pred
782
+ ax2.scatter(y_pred, residuals, alpha=0.7)
783
+ ax2.axhline(y=0, color='r', linestyle='--')
784
+ ax2.set_xlabel('Predicted Values')
785
+ ax2.set_ylabel('Residuals')
786
+ ax2.set_title('Residual Plot')
787
+
788
+ # Plot prediction errors
789
+ ax3 = fig.add_subplot(gs[1, 0])
790
+ ax3.errorbar(range(len(y_pred)), y_pred, yerr=2*y_std, fmt='o', alpha=0.7,
791
+ label='Predicted ± 2σ')
792
+ ax3.plot(range(len(y_true)), y_true, 'rx', alpha=0.7, label='Actual')
793
+ ax3.set_xlabel('Sample Index')
794
+ ax3.set_ylabel('Value')
795
+ ax3.set_title('Prediction with Error Bars')
796
+ ax3.legend()
797
+
798
+ # Plot error distribution
799
+ ax4 = fig.add_subplot(gs[1, 1])
800
+ ax4.hist(residuals, bins=20, alpha=0.7)
801
+ ax4.axvline(x=0, color='r', linestyle='--')
802
+ ax4.set_xlabel('Prediction Error')
803
+ ax4.set_ylabel('Frequency')
804
+ ax4.set_title('Error Distribution')
805
  # Create mapping of class labels to integers
806
  classes = sorted(list(set(list(y_true) + list(y_pred))))
807
  class_to_int = {c: i for i, c in enumerate(classes)}
 
911
  """Create learning curve plots from cross-validation results"""
912
  fig = plt.figure(figsize=(12, 6))
913
 
914
+ # For regression, show R² and RMSE
915
+ ax1 = plt.subplot(1, 2, 1)
916
+ ax2 = plt.subplot(1, 2, 2)
917
+
918
+ # Plot R² for each fold
919
+ for i, metrics in enumerate(fold_metrics):
920
+ ax1.plot(i+1, metrics['r2'], 'bo')
921
+
922
+ # Plot average R²
923
+ avg_r2 = np.mean([m['r2'] for m in fold_metrics])
924
+ ax1.axhline(y=avg_r2, color='r', linestyle='--',
925
+ label=f'Average R² = {avg_r2:.4f}')
926
+
927
+ ax1.set_xlabel('Fold')
928
+ ax1.set_ylabel('R²')
929
+ ax1.set_title('R² by Fold')
930
+ ax1.set_xticks(range(1, len(fold_metrics)+1))
931
+ ax1.legend()
932
+
933
+ # Plot RMSE for each fold
934
+ for i, metrics in enumerate(fold_metrics):
935
+ ax2.plot(i+1, metrics['rmse'], 'go')
936
+
937
+ # Plot average RMSE
938
+ avg_rmse = np.mean([m['rmse'] for m in fold_metrics])
939
+ ax2.axhline(y=avg_rmse, color='r', linestyle='--',
940
+ label=f'Average RMSE = {avg_rmse:.4f}')
941
+
942
+ ax2.set_xlabel('Fold')
943
+ ax2.set_ylabel('RMSE')
944
+ ax2.set_title('RMSE by Fold')
945
+ ax2.set_xticks(range(1, len(fold_metrics)+1))
946
+ ax2.legend()
947
 
948
  plt.tight_layout()
949
  return fig
 
1313
  outcomes_df = pd.DataFrame(outcome_data)
1314
  outcomes_df.to_csv(outcomes_file, index=False)
1315
  logger.info(f"Created treatment outcomes file with {len(outcomes_df)} patients")
1316
+ # Store the treatment file path for later use
1317
+ app.last_treatment_file = outcomes_file  # module-level app instance; this function has no self
1318
  return outcomes_file
1319
  else:
1320
  # If we couldn't extract outcomes per patient, try a simpler approach
 
1340
  ])
1341
  outcomes_df.to_csv(outcomes_file, index=False)
1342
  logger.warning(f"Created simplified treatment outcomes with group improvement: {improvement:.2f}")
1343
+ # Store the treatment file path for later use
1344
+ app.last_treatment_file = outcomes_file  # module-level app instance; this function has no self
1345
  return outcomes_file
1346
  except Exception as e:
1347
  logger.error(f"Could not create even simplified outcomes: {e}")
 
1825
  gr.Markdown("# Aphasia Treatment Trajectory Prediction")
1826
 
1827
  with gr.Tabs():
1828
+ # Tab 1: VAE Training
1829
+ with gr.Tab("1. VAE Training"):
1830
  with gr.Row():
1831
  with gr.Column(scale=1):
1832
  data_dir = gr.Textbox(
 
1856
  use_hf_dataset = gr.Checkbox(
1857
  label="Use HuggingFace Dataset", value=True
1858
  )
1859
+
1860
+ with gr.Accordion("Advanced Data Options", open=False):
1861
  skip_behavioral = gr.Checkbox(
1862
  label="Skip Behavioral Data Processing",
1863
  value=PREDICTION_CONFIG.get('skip_behavioral_data', True),
1864
  info="Use pre-defined treatment outcomes instead of processing behavioral data"
1865
  )
1866
+ use_synthetic_nifti = gr.Checkbox(
1867
+ label="Use Synthetic NIfTI Data",
1868
+ value=PREDICTION_CONFIG.get('use_synthetic_nifti', False),
1869
+ info="Generate synthetic NIfTI files if real ones aren't found"
1870
+ )
1871
+ use_synthetic_fc = gr.Checkbox(
1872
+ label="Use Synthetic FC Matrices",
1873
+ value=PREDICTION_CONFIG.get('use_synthetic_fc', False),
1874
+ info="Generate synthetic FC matrices if processing fails"
1875
+ )
1876
 
1877
+ train_vae_btn = gr.Button("Train VAE Model", variant="primary")
1878
+
1879
+ gr.Markdown("### VAE Training Results")
1880
 
1881
  with gr.Row():
1882
+ fc_plot = gr.Plot(label="FC Matrices (Original/Reconstructed/Generated)")
1883
+
1884
+ with gr.Row():
1885
+ learning_plot = gr.Plot(label="VAE Learning Curves")
1886
+
1887
+ gr.Markdown("After VAE training completes, proceed to the 'Random Forest Prediction' tab →")
1888
+
1889
+ # Tab 2: Random Forest Prediction
1890
+ with gr.Tab("2. Random Forest Prediction"):
1891
+ gr.Markdown("### Random Forest Model Training")
1892
+ gr.Markdown("First complete the VAE training in tab 1, then configure and train the Random Forest model below:")
1893
+
1894
+ with gr.Row():
1895
+ with gr.Column(scale=1):
1896
+ prediction_type = gr.Radio(
1897
+ label="Prediction Type",
1898
+ choices=["regression", "classification"],
1899
+ value="regression"
1900
+ )
1901
+ outcome_variable = gr.Dropdown(
1902
+ label="Outcome Variable",
1903
+ choices=["wab_aq", "age", "mpo", "education"],
1904
+ value="wab_aq"
1905
+ )
1906
+
1907
+ with gr.Column(scale=1):
1908
+ rf_n_estimators = gr.Slider(
1909
+ minimum=10, maximum=500, step=10,
1910
+ label="Number of Trees", value=100
1911
+ )
1912
+ rf_max_depth = gr.Slider(
1913
+ minimum=0, maximum=50, step=1,
1914
+ label="Max Tree Depth", value=10,
1915
+ info="Set to 0 for unlimited depth"
1916
+ )
1917
+ rf_cv_folds = gr.Slider(
1918
+ minimum=2, maximum=10, step=1,
1919
+ label="Cross-validation Folds", value=5
1920
+ )
1921
+
1922
+ train_rf_btn = gr.Button("Train Random Forest Model", variant="primary")
1923
+
1924
+ gr.Markdown("### Random Forest Results")
1925
 
1926
  with gr.Row():
1927
  with gr.Column(scale=1):
 
1930
  prediction_plot = gr.Plot(label="Prediction Performance")
1931
 
1932
  with gr.Row():
1933
+ rf_metrics = gr.Textbox(label="Model Performance Metrics")
1934
+
1935
+ gr.Markdown("After Random Forest training completes, proceed to the 'Treatment Prediction' tab →")
1936
 
1937
+ # Tab 3: Predict Treatment
1938
+ with gr.Tab("3. Treatment Prediction"):
1939
+ gr.Markdown("### Predict Individual Treatment Outcomes")
1940
+ gr.Markdown("After completing VAE and Random Forest training, you can predict treatment outcomes for individual patients:")
1941
+
1942
  with gr.Row():
1943
  with gr.Column(scale=1):
1944
+ fmri_file = gr.File(label="Patient fMRI Data (NIfTI file)")
1945
  with gr.Column(scale=1):
1946
  with gr.Group("Patient Demographics"):
1947
  age = gr.Number(label="Age at Stroke", value=60)
 
1957
  with gr.Row():
1958
  trajectory_plot = gr.Plot(label="Predicted Treatment Trajectory")
1959
 
1960
+ # Define various handler functions for the different tabs
1961
+
1962
+ # Store shared state between tabs
1963
+ app_state = {
1964
+ 'vae': None,
1965
+ 'latents': None,
1966
+ 'demographics': None,
1967
+ 'predictor': None,
1968
+ 'vae_trained': False,
1969
+ 'rf_trained': False
1970
  }
1971
 
1972
+ # Tab 1: VAE Training Handler
1973
+ def handle_vae_training(data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
1974
+ skip_behavioral, use_synthetic_nifti, use_synthetic_fc):
1975
+ """Train the VAE model and display FC visualization and learning curves"""
1976
+ # Store config values
1977
  PREDICTION_CONFIG['skip_behavioral_data'] = skip_behavioral
1978
  PREDICTION_CONFIG['use_synthetic_nifti'] = use_synthetic_nifti
1979
  PREDICTION_CONFIG['use_synthetic_fc'] = use_synthetic_fc
 
1985
  else:
1986
  PREDICTION_CONFIG['local_nii_dir'] = None
1987
 
1988
+ # Log info
1989
+ logger.info(f"Training VAE model with data from: {data_dir}")
1990
+ logger.info(f"VAE parameters: latent_dim={latent_dim}, epochs={nepochs}, batch_size={bsize}")
 
1991
 
1992
+ # Create a subset of app.train_models functionality that just trains the VAE
1993
+ try:
1994
+ # Start by setting up data for the VAE
1995
+ from vae_model import DemoVAE
1996
+ from data_preprocessing import load_and_preprocess_data
1997
+ from main import run_analysis
1998
+ import numpy as np
1999
+ import os
2000
+
2001
+ # Prepare VAE training parameters
2002
+ MODEL_CONFIG.update({
2003
+ 'latent_dim': latent_dim,
2004
+ 'nepochs': nepochs,
2005
+ 'bsize': bsize
2006
+ })
2007
+
2008
+ # First, find and preprocess data
2009
+ logger.info("Looking for data in directory and preprocessing...")
2010
+
2011
+ # This part is similar to app.train_models but only focuses on VAE
2012
+ if data_dir == "SreekarB/OSFData":
2013
+ # Use dataset, similar to existing code in app.train_models
2014
+ # For brevity, we'll call the full train_models function but only
2015
+ # extract the VAE-related results
2016
+ results = app.train_models(
2017
+ data_dir=data_dir,
2018
+ latent_dim=latent_dim,
2019
+ nepochs=nepochs,
2020
+ bsize=bsize
2021
+ )
2022
+
2023
+ # Store results in app_state for the next tabs
2024
+ app_state['vae'] = results.get('vae', None)
2025
+ app_state['latents'] = results.get('latents', None)
2026
+ app_state['demographics'] = results.get('demographics', None)
2027
+ app_state['vae_trained'] = True
2028
+
2029
+ # Return just the VAE visualizations
2030
+ return [
2031
+ results.get('vae', None), # FC matrix visualization
2032
+ results.get('learning', None) # VAE learning curves
2033
+ ]
2034
+ else:
2035
+ # Local directory case
2036
+ results = app.train_models(
2037
+ data_dir=data_dir,
2038
+ latent_dim=latent_dim,
2039
+ nepochs=nepochs,
2040
+ bsize=bsize
2041
+ )
2042
+
2043
+ # Store results in app_state
2044
+ app_state['vae'] = results.get('vae', None)
2045
+ app_state['latents'] = results.get('latents', None)
2046
+ app_state['demographics'] = results.get('demographics', None)
2047
+ app_state['vae_trained'] = True
2048
+
2049
+ # Return just the VAE visualizations
2050
+ return [
2051
+ results.get('vae', None), # FC matrix visualization
2052
+ results.get('learning', None) # VAE learning curves
2053
+ ]
2054
+ except Exception as e:
2055
+ logger.error(f"Error in VAE training: {str(e)}", exc_info=True)
2056
+ error_fig = plt.figure(figsize=(10, 6))
2057
+ plt.text(0.5, 0.5, f"Error: {str(e)}",
2058
+ horizontalalignment='center', verticalalignment='center',
2059
+ fontsize=12, color='red', wrap=True)
2060
+ plt.axis('off')
2061
+
2062
+ # Return error figures for both outputs
2063
+ return [error_fig, error_fig]
2064
+
2065
+ # Tab 2: Random Forest Training Handler
2066
+ def handle_rf_training(outcome_variable, rf_n_estimators, rf_max_depth, rf_cv_folds):
2067
+ """Train the Random Forest model using the VAE latent representations"""
2068
+ # Check if VAE has been trained
2069
+ if not app_state['vae_trained'] or app_state['latents'] is None:
2070
+ error_fig = plt.figure(figsize=(10, 6))
2071
+ message = "Error: You must train the VAE model in Tab 1 first!"
2072
+ plt.text(0.5, 0.5, message,
2073
+ horizontalalignment='center', verticalalignment='center',
2074
+ fontsize=14, color='red')
2075
+ plt.axis('off')
2076
+
2077
+ # Return error for both outputs
2078
+ return [error_fig, error_fig, "Error: VAE not trained. Go to Tab 1 and train the VAE first."]
2079
 
2080
+ try:
2081
+ # Update RF configuration
2082
+ PREDICTION_CONFIG['default_outcome'] = outcome_variable
2083
+ PREDICTION_CONFIG['n_estimators'] = rf_n_estimators
2084
+ PREDICTION_CONFIG['max_depth'] = rf_max_depth if rf_max_depth > 0 else None
2085
+ PREDICTION_CONFIG['cv_folds'] = rf_cv_folds
2086
+
2087
+ logger.info(f"Training Random Forest model: outcome={outcome_variable}")
2088
+ logger.info(f"RF parameters: n_estimators={rf_n_estimators}, max_depth={rf_max_depth}, cv_folds={rf_cv_folds}")
2089
+
2090
+ # Get data from app_state
2091
+ latents = app_state['latents']
2092
+ demographics = app_state['demographics']
2093
+
2094
+ # Train Random Forest predictor
2095
+ from rcf_prediction import AphasiaTreatmentPredictor
2096
+ import pandas as pd
2097
+ import numpy as np
2098
+
2099
+ # Need to find treatment outcomes data
2100
+ # This would normally be loaded in train_models, so we need
2101
+ # to mock it here or load from app_state
2102
+ if getattr(app, 'last_treatment_file', None) and os.path.exists(app.last_treatment_file):
2103
+ treatment_file = app.last_treatment_file
2104
+ treatment_df = pd.read_csv(treatment_file)
2105
+ treatment_outcomes = treatment_df['outcome_score'].values
2106
+
2107
+ # Initialize predictor
2108
+ predictor = AphasiaTreatmentPredictor(
2109
+ n_estimators=rf_n_estimators,
2110
+ max_depth=rf_max_depth if rf_max_depth > 0 else None
2111
+ )
2112
+
2113
+ # Cross-validate
2114
+ cv_results = predictor.cross_validate(
2115
+ latents=latents,
2116
+ demographics=demographics,
2117
+ treatment_outcomes=treatment_outcomes,
2118
+ n_splits=rf_cv_folds
2119
+ )
2120
+
2121
+ # Fit final model
2122
+ predictor.fit(latents, demographics, treatment_outcomes)
2123
+
2124
+ # Store in app_state
2125
+ app_state['predictor'] = predictor
2126
+ app_state['rf_trained'] = True
2127
+
2128
+ # Create feature importance plot
2129
+ importance_fig = predictor.plot_feature_importance()
2130
+
2131
+ # Create prediction performance plot
2132
+ predictions = cv_results['predictions']
2133
+ prediction_stds = cv_results['prediction_stds']
2134
+
2135
+ performance_fig = plt.figure(figsize=(8, 6))
2136
+
2137
+ # Check if we have valid predictions
2138
+ if len(treatment_outcomes) > 0 and len(predictions) == len(treatment_outcomes):
2139
+ # Only create scatter plot if we have matching data
2140
+ plt.scatter(treatment_outcomes, predictions)
2141
+
2142
+ # Reference line
2143
+ min_val = min(np.min(treatment_outcomes), np.min(predictions))
2144
+ max_val = max(np.max(treatment_outcomes), np.max(predictions))
2145
+ plt.plot([min_val, max_val], [min_val, max_val], 'r--')
2146
+
2147
+ # Confidence band
2148
+ plt.fill_between(treatment_outcomes,
2149
+ predictions - 2*prediction_stds,
2150
+ predictions + 2*prediction_stds,
2151
+ alpha=0.2, color='gray')
2152
+
2153
+ plt.xlabel('Actual Outcome')
2154
+ plt.ylabel('Predicted Outcome')
2155
+
2156
+ # Get performance metrics
2157
+ metrics_text = ""
2158
+ mean_metrics = cv_results.get('mean_metrics', {})
2159
+
2160
+ r2 = mean_metrics.get('r2', 0)
2161
+ rmse = mean_metrics.get('rmse', 0)
2162
+ plt.title(f'Treatment Outcome Prediction\nR² = {r2:.3f}, RMSE = {rmse:.3f}')
2163
+ metrics_text = f"Regression Model Performance:\nR² = {r2:.4f}\nRMSE = {rmse:.4f}"
2164
+ else:
2165
+ # Handle case with no data
2166
+ plt.text(0.5, 0.5, "No prediction data available",
2167
+ ha='center', va='center', transform=plt.gca().transAxes)
2168
+ metrics_text = "No performance metrics available"
2169
+
2170
+ plt.tight_layout()
2171
+
2172
+ return [importance_fig, performance_fig, metrics_text]
2173
+ else:
2174
+ # No treatment file available
2175
+ error_fig = plt.figure(figsize=(10, 6))
2176
+ message = "Error: Treatment outcomes file not found. Please retrain the VAE in Tab 1."
2177
+ plt.text(0.5, 0.5, message,
2178
+ horizontalalignment='center', verticalalignment='center',
2179
+ fontsize=14, color='red')
2180
+ plt.axis('off')
2181
+
2182
+ return [error_fig, error_fig, "Error: Treatment outcomes file not found."]
2183
+
2184
+ except Exception as e:
2185
+ logger.error(f"Error in RF training: {str(e)}", exc_info=True)
2186
+ error_fig = plt.figure(figsize=(10, 6))
2187
+ plt.text(0.5, 0.5, f"Error: {str(e)}",
2188
+ horizontalalignment='center', verticalalignment='center',
2189
+ fontsize=12, color='red', wrap=True)
2190
+ plt.axis('off')
2191
+
2192
+ return [error_fig, error_fig, f"Error: {str(e)}"]
2193
+
2194
+ # Connect the tab handlers
2195
+
2196
+ # VAE Training tab
2197
+ train_vae_btn.click(
2198
+ fn=handle_vae_training,
2199
  inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
2200
+ skip_behavioral, use_synthetic_nifti, use_synthetic_fc],
2201
+ outputs=[fc_plot, learning_plot]
2202
+ )
2203
+
2204
+ # Random Forest Training tab
2205
+ train_rf_btn.click(
2206
+ fn=handle_rf_training,
2207
+ inputs=[outcome_variable, rf_n_estimators, rf_max_depth, rf_cv_folds],
2208
+ outputs=[importance_plot, prediction_plot, rf_metrics]
2209
  )
2210
 
2211
+ # Tab 3: Treatment Prediction Handler
2212
+ def handle_treatment_prediction(fmri_file, age, sex, months, wab):
2213
+ """Predict treatment outcome for a new patient"""
2214
+ # Check if models have been trained
2215
+ if not app_state['vae_trained'] or not app_state['rf_trained']:
2216
+ error_message = "Error: You must train both the VAE (Tab 1) and Random Forest (Tab 2) models first!"
2217
+ error_fig = plt.figure(figsize=(10, 6))
2218
+ plt.text(0.5, 0.5, error_message,
2219
+ horizontalalignment='center', verticalalignment='center',
2220
+ fontsize=14, color='red')
2221
+ plt.axis('off')
2222
+
2223
+ return [error_message, error_fig]
2224
+
2225
+ # Use the trained models from app_state for prediction
2226
+ try:
2227
+ # Set up prediction
2228
+ if app_state['vae'] is None or app_state['predictor'] is None:
2229
+ return ["Error: Models not properly trained", None]
2230
+
2231
+ # Create a temporary prediction app with our trained models
2232
+ temp_app = AphasiaPredictionApp()
2233
+ temp_app.vae = app_state['vae']
2234
+ temp_app.predictor = app_state['predictor']
2235
+ temp_app.trained = True
2236
+ temp_app.latent_dim = app_state['vae'].latent_dim if hasattr(app_state['vae'], 'latent_dim') else 32
2237
+
2238
+ # Make prediction
2239
+ return temp_app.predict_treatment(
2240
+ fmri_file=fmri_file,
2241
+ age=age,
2242
+ sex=sex,
2243
+ months_post_stroke=months,
2244
+ wab_score=wab
2245
+ )
2246
+ except Exception as e:
2247
+ logger.error(f"Error in treatment prediction: {str(e)}", exc_info=True)
2248
+ return [f"Error in prediction: {str(e)}", None]
2249
+
2250
+ # Connect the treatment prediction handler
2251
  predict_btn.click(
2252
+ fn=handle_treatment_prediction,
2253
  inputs=[fmri_file, age, sex, months, wab],
2254
  outputs=[prediction_text, trajectory_plot]
2255
  )
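
The restructured `create_interface` above splits the workflow into three tabs that share results through a module-level `app_state` dict rather than one monolithic train handler. Below is a minimal, self-contained sketch of that wiring pattern; the handler bodies and component names are illustrative stand-ins, not the app's real logic.

```python
import gradio as gr

# Shared state that later tabs read; mirrors the app_state dict in app.py
app_state = {"vae_trained": False}

def train_vae_stub(latent_dim):
    # Stand-in for handle_vae_training: record state for the next tab
    app_state["vae_trained"] = True
    return f"VAE trained (latent_dim={int(latent_dim)})"

def train_rf_stub(n_estimators):
    # Stand-in for handle_rf_training: refuse to run until Tab 1 has finished
    if not app_state["vae_trained"]:
        return "Error: train the VAE in Tab 1 first"
    return f"Random Forest trained with {int(n_estimators)} trees"

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("1. VAE Training"):
            latent_dim = gr.Slider(2, 128, value=32, step=1, label="Latent Dim")
            vae_btn = gr.Button("Train VAE Model")
            vae_out = gr.Textbox(label="VAE status")
        with gr.Tab("2. Random Forest Prediction"):
            n_estimators = gr.Slider(10, 500, value=100, step=10, label="Number of Trees")
            rf_btn = gr.Button("Train Random Forest Model")
            rf_out = gr.Textbox(label="RF status")
    vae_btn.click(fn=train_vae_stub, inputs=[latent_dim], outputs=[vae_out])
    rf_btn.click(fn=train_rf_stub, inputs=[n_estimators], outputs=[rf_out])

# demo.launch()  # uncomment to run locally
```
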
config.py CHANGED
@@ -27,7 +27,6 @@ PREDICTION_CONFIG = {
  'n_estimators': 100,
  'max_depth': None,
  'cv_folds': 5,
- 'prediction_type': 'regression',
  'default_outcome': 'wab_aq',
  'save_path': 'results/treatment_predictor.joblib',
  'skip_behavioral_data': True, # Set to True to skip processing behavioral_data.csv
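
With `'prediction_type'` removed from `PREDICTION_CONFIG`, the app now hard-codes regression; any remaining caller that still expects the key should read it with a fallback. A minimal sketch, illustrative only:

```python
from config import PREDICTION_CONFIG

# Falls back to 'regression' now that the key no longer exists in config.py
prediction_type = PREDICTION_CONFIG.get('prediction_type', 'regression')
print(prediction_type)  # 'regression'
```
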
demo_fc_visualization.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Demo script to visualize FC matrices from real fMRI data using nilearn's built-in datasets.
3
+ """
4
+
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ from nilearn import datasets
8
+ from nilearn import input_data, connectome
9
+ from fc_visualization import FCVisualizer
10
+
11
+ def visualize_from_nilearn_dataset():
12
+ """Download and visualize FC matrices from nilearn's ADHD dataset."""
13
+ print("Downloading a sample fMRI dataset (ADHD)...")
14
+ adhd_dataset = datasets.fetch_adhd(n_subjects=1)
15
+
16
+ # Get the fMRI file path
17
+ func_file = adhd_dataset.func[0]
18
+ confound_file = adhd_dataset.confounds[0]
19
+
20
+ print(f"Downloaded fMRI file: {func_file}")
21
+
22
+ # Get Power atlas coordinates
23
+ power = datasets.fetch_coords_power_2011()
24
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
25
+
26
+ print(f"Using Power atlas with {len(coords)} ROIs")
27
+
28
+ # Create a masker to extract time series from the ROIs
29
+ masker = input_data.NiftiSpheresMasker(
30
+ coords,
31
+ radius=8, # 8mm radius
32
+ standardize=True,
33
+ memory='nilearn_cache',
34
+ memory_level=1,
35
+ verbose=1,
36
+ detrend=True,
37
+ low_pass=0.08,
38
+ high_pass=0.01,
39
+ t_r=2.0 # ADHD dataset TR
40
+ )
41
+
42
+ # Extract time series, including confounds
43
+ print("Extracting time series from ROIs...")
44
+ time_series = masker.fit_transform(func_file, confounds=confound_file)
45
+ print(f"Time series shape: {time_series.shape}")
46
+
47
+ # Compute correlation matrix (FC matrix)
48
+ correlation_measure = connectome.ConnectivityMeasure(
49
+ kind='correlation',
50
+ vectorize=False,
51
+ discard_diagonal=False
52
+ )
53
+
54
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
55
+ print(f"FC matrix shape: {fc_matrix.shape}")
56
+
57
+ # Save the FC matrix for future use
58
+ np.save('adhd_fc_matrix.npy', fc_matrix)
59
+ print("Saved FC matrix to adhd_fc_matrix.npy")
60
+
61
+ # Visualize the FC matrix
62
+ visualizer = FCVisualizer(cmap='RdBu_r', vmin=-1, vmax=1)
63
+ fig, _ = visualizer.plot_single_matrix(fc_matrix, title="ADHD FC Matrix (Power Atlas)")
64
+
65
+ # Save the figure
66
+ fig.savefig('adhd_fc_matrix.png', dpi=300, bbox_inches='tight')
67
+ print("Saved visualization to adhd_fc_matrix.png")
68
+
69
+ # Show the figure
70
+ plt.show()
71
+
72
+ if __name__ == "__main__":
73
+ visualize_from_nilearn_dataset()
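
With the Power atlas used above (264 ROIs), the correlation matrix is 264×264 and its upper triangle flattens to 264·263/2 = 34,716 edge features per subject; the `_triu_to_matrix` helper added in `fc_visualization.py` below inverts that flattening. A quick check of the arithmetic:

```python
import numpy as np

n_rois = 264                              # Power et al. (2011) atlas
n_edges = n_rois * (n_rois - 1) // 2
print(n_edges)                            # 34716 upper-triangle FC features

# Recover the matrix size from the flattened length, as _triu_to_matrix does
n = int(np.sqrt(2 * n_edges + 0.25) + 0.5)
assert n == n_rois
```
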
demovae/model.py ADDED
@@ -0,0 +1,225 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ import random
7
+ import numpy as np
8
+
9
+ from sklearn.linear_model import Ridge
10
+ from sklearn.linear_model import LogisticRegression
11
+
12
+ def to_torch(x):
13
+ return torch.from_numpy(x).float()
14
+
15
+ def to_cuda(x, use_cuda):
16
+ if use_cuda:
17
+ try:
18
+ return x.cuda()
19
+ except (RuntimeError, AssertionError) as e:
20
+ print(f"Warning: CUDA error: {e}. Falling back to CPU.")
21
+ return x
22
+ else:
23
+ return x
24
+
25
+ def to_numpy(x):
26
+ return x.detach().cpu().numpy()
27
+
28
+ class VAE(nn.Module):
29
+ def __init__(self, input_dim, latent_dim, demo_dim, use_cuda=True):
30
+ super(VAE, self).__init__()
31
+ self.input_dim = input_dim
32
+ self.latent_dim = latent_dim
33
+ self.demo_dim = demo_dim
34
+ self.use_cuda = use_cuda
35
+ self.enc1 = to_cuda(nn.Linear(input_dim, 1000).float(), use_cuda)
36
+ self.enc2 = to_cuda(nn.Linear(1000, latent_dim).float(), use_cuda)
37
+ self.dec1 = to_cuda(nn.Linear(latent_dim+demo_dim, 1000).float(), use_cuda)
38
+ self.dec2 = to_cuda(nn.Linear(1000, input_dim).float(), use_cuda)
39
+
40
+ def enc(self, x):
41
+ x = F.relu(self.enc1(x))
42
+ z = self.enc2(x)
43
+ return z
44
+
45
+ def gen(self, n):
46
+ return to_cuda(torch.randn(n, self.latent_dim).float(), self.use_cuda)
47
+
48
+ def dec(self, z, demo):
49
+ z = to_cuda(torch.cat([z, demo], dim=1), self.use_cuda)
50
+ x = F.relu(self.dec1(z))
51
+ x = self.dec2(x)
52
+ #x = x.reshape(len(z), 264, 5)
53
+ #x = torch.einsum('nac,nbc->nab', x, x)
54
+ #a,b = np.triu_indices(264, 1)
55
+ #x = x[:,a,b]
56
+ return x
57
+
58
+ def rmse(a, b, mean=torch.mean):
59
+ return mean((a-b)**2)**0.5
60
+
61
+ def latent_loss(z, use_cuda=True):
62
+ C = z.T@z
63
+ mu = torch.mean(z, dim=0)
64
+ tgt1 = to_cuda(torch.eye(z.shape[-1]).float(), use_cuda)*len(z)
65
+ tgt2 = to_cuda(torch.zeros(z.shape[-1]).float(), use_cuda)
66
+ loss_C = rmse(C, tgt1)
67
+ loss_mu = rmse(mu, tgt2)
68
+ return loss_C, loss_mu, C, mu
69
+
70
+ def decor_loss(z, demo, use_cuda=True):
71
+ ps = []
72
+ losses = []
73
+ for di in range(demo.shape[1]):
74
+ d = demo[:,di]
75
+ d = d - torch.mean(d)
76
+ p = torch.einsum('n,nz->z', d, z)
77
+ p = p/torch.std(d)
78
+ p = p/torch.einsum('nz,nz->z', z, z)
79
+ tgt = to_cuda(torch.zeros(z.shape[-1]).float(), use_cuda)
80
+ loss = rmse(p, tgt)
81
+ losses.append(loss)
82
+ ps.append(p)
83
+ losses = torch.stack(losses)
84
+ return losses, ps
85
+
86
+ def pretty(x):
87
+ return f'{round(float(x), 4)}'
88
+
89
+ def demo_to_torch(demo, demo_types, pred_stats, use_cuda):
90
+ demo_t = []
91
+ demo_idx = 0
92
+ for d,t,s in zip(demo, demo_types, pred_stats):
93
+ if t == 'continuous':
94
+ demo_t.append(to_cuda(to_torch(d), use_cuda))
95
+ elif t == 'categorical':
96
+ for dd in d:
97
+ if dd not in s:
98
+ print(f'Model not trained with value {dd} for categorical demographic {demo_idx}')
99
+ raise Exception('Bad demographic')
100
+ for ss in s:
101
+ idx = (d == ss).astype('bool')
102
+ zeros = torch.zeros(len(d))
103
+ zeros[idx] = 1
104
+ demo_t.append(to_cuda(zeros, use_cuda))
105
+ demo_idx += 1
106
+ demo_t = torch.stack(demo_t).permute(1,0)
107
+ return demo_t
108
+
109
+ def train_vae(vae, x, demo, demo_types, nepochs, pperiod, bsize, loss_C_mult, loss_mu_mult, loss_rec_mult, loss_decor_mult, loss_pred_mult, lr, weight_decay, alpha, LR_C, ret_obj):
110
+ # Get linear predictors for demographics
111
+ pred_w = []
112
+ pred_i = []
113
+ # Pred stats are mean and std for continuous, and a list of all values for categorical
114
+ pred_stats = []
115
+ for i,d,t in zip(range(len(demo)), demo, demo_types):
116
+ print(f'Fitting auxiliary guidance model for demographic {i} {t}...', end='')
117
+ if t == 'continuous':
118
+ pred_stats.append([np.mean(d), np.std(d)])
119
+ reg = Ridge(alpha=alpha).fit(x, d)
120
+ reg_w = to_cuda(to_torch(reg.coef_), vae.use_cuda)
121
+ reg_i = reg.intercept_
122
+ pred_w.append(reg_w)
123
+ pred_i.append(reg_i)
124
+ elif t == 'categorical':
125
+ pred_stats.append(sorted(list(set(list(d)))))
126
+ reg = LogisticRegression(C=LR_C).fit(x, d)
127
+ # Binary
128
+ if len(reg.coef_) == 1:
129
+ reg_w = to_cuda(to_torch(reg.coef_[0]), vae.use_cuda)
130
+ reg_i = reg.intercept_[0]
131
+ pred_w.append(-reg_w)
132
+ pred_i.append(-reg_i)
133
+ pred_w.append(reg_w)
134
+ pred_i.append(reg_i)
135
+ # Categorical
136
+ else:
137
+ for i in range(len(reg.coef_)):
138
+ reg_w = to_cuda(to_torch(reg.coef_[i]), vae.use_cuda)
139
+ reg_i = reg.intercept_[i]
140
+ pred_w.append(reg_w)
141
+ pred_i.append(reg_i)
142
+ else:
143
+ print(f'demographic type "{t}" not "continuous" or "categorical"')
144
+ raise Exception('Bad demographic type')
145
+ print(' done')
146
+ ret_obj.pred_stats = pred_stats
147
+ # Convert input to pytorch
148
+ print('Converting input to pytorch')
149
+ x = to_cuda(to_torch(x), vae.use_cuda)
150
+ # Convert demographics to pytorch
151
+ print('Converting demographics to pytorch')
152
+ demo_t = demo_to_torch(demo, demo_types, pred_stats, vae.use_cuda)
153
+ # Training loop
154
+ print('Beginning VAE training')
155
+ ce = nn.CrossEntropyLoss()
156
+ optim = torch.optim.Adam(vae.parameters(), lr=lr, weight_decay=weight_decay)
157
+ for e in range(nepochs):
158
+ for bs in range(0,len(x),bsize):
159
+ xb = x[bs:(bs+bsize)]
160
+ db = demo_t[bs:(bs+bsize)]
161
+ optim.zero_grad()
162
+ # Reconstruct
163
+ z = vae.enc(xb)
164
+ y = vae.dec(z, db)
165
+ loss_C, loss_mu, _, _ = latent_loss(z, vae.use_cuda)
166
+ loss_decor, _ = decor_loss(z, db, vae.use_cuda)
167
+ loss_decor = sum(loss_decor)
168
+ loss_rec = rmse(xb, y)
169
+ # Sample demographics
170
+ demo_gen = []
171
+ for s,t in zip(pred_stats, demo_types):
172
+ if t == 'continuous':
173
+ mu = s[0]
174
+ std = s[1]
175
+ dd = torch.randn(100).float()
176
+ dd = dd*std+mu
177
+ dd = to_cuda(dd, vae.use_cuda)
178
+ demo_gen.append(dd)
179
+ elif t == 'categorical':
180
+ idx = random.randint(0, len(s)-1)
181
+ for i in range(len(s)):
182
+ if idx == i:
183
+ dd = torch.ones(100).float()
184
+ else:
185
+ dd = torch.zeros(100).float()
186
+ dd = to_cuda(dd, vae.use_cuda)
187
+ demo_gen.append(dd)
188
+ demo_gen = torch.stack(demo_gen).permute(1,0)
189
+ # Generate
190
+ z = vae.gen(100)
191
+ y = vae.dec(z, demo_gen)
192
+ # Regressor/classifier guidance loss
193
+ losses_pred = []
194
+ idcs = []
195
+ dg_idx = 0
196
+ for s,t in zip(pred_stats, demo_types):
197
+ if t == 'continuous':
198
+ yy = y@pred_w[dg_idx]+pred_i[dg_idx]
199
+ loss = rmse(demo_gen[:,dg_idx], yy)
200
+ losses_pred.append(loss)
201
+ idcs.append(float(demo_gen[0,dg_idx]))
202
+ dg_idx += 1
203
+ elif t == 'categorical':
204
+ loss = 0
205
+ for i in range(len(s)):
206
+ yy = y@pred_w[dg_idx]+pred_i[dg_idx]
207
+ loss += ce(torch.stack([-yy, yy], dim=1), demo_gen[:,dg_idx].long())
208
+ idcs.append(int(demo_gen[0,dg_idx]))
209
+ dg_idx += 1
210
+ losses_pred.append(loss)
211
+ total_loss = loss_C_mult*loss_C + loss_mu_mult*loss_mu + loss_rec_mult*loss_rec + loss_decor_mult*loss_decor + loss_pred_mult*sum(losses_pred)
212
+ total_loss.backward()
213
+ optim.step()
214
+ if e%pperiod == 0 or e == nepochs-1:
215
+ print(f'Epoch {e} ', end='')
216
+ print(f'ReconLoss {pretty(loss_rec)} ', end='')
217
+ print(f'CovarianceLoss {pretty(loss_C)} ', end='')
218
+ print(f'MeanLoss {pretty(loss_mu)} ', end='')
219
+ print(f'DecorLoss {pretty(loss_decor)} ', end='')
220
+ losses_pred = [pretty(loss) for loss in losses_pred]
221
+ print(f'GuidanceTargets {idcs} GuidanceLosses {losses_pred} ', end='')
222
+ print()
223
+ print('Training complete.')
224
+
225
+
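
`latent_loss` above replaces the usual KL term by pushing the batch latents toward zero mean and covariance `z.T @ z ≈ N·I`, while `decor_loss` penalizes correlation between latents and demographics. A small numpy sanity check of those targets for standard-normal latents, illustrative only:

```python
import numpy as np

rng = np.random.default_rng(0)
N, latent_dim = 1000, 8
z = rng.standard_normal((N, latent_dim))

C = z.T @ z                 # latent_loss target: N * identity
mu = z.mean(axis=0)         # latent_loss target: zero vector

print(np.abs(C / N - np.eye(latent_dim)).max())  # small: z.T@z ≈ N·I
print(np.abs(mu).max())                          # close to 0
```
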
demovae/sklearn.py ADDED
@@ -0,0 +1,124 @@
1
+
2
+ from .model import VAE, train_vae, to_torch, to_cuda, to_numpy, demo_to_torch
3
+ import numpy as np
4
+
5
+ from sklearn.base import BaseEstimator
6
+
7
+ # For saving
8
+ import torch
9
+
10
+ class DemoVAE(BaseEstimator):
11
+ def __init__(self, **params):
12
+ self.set_params(**params)
13
+
14
+ @staticmethod
15
+ def get_default_params():
16
+ return dict(latent_dim=60, # Latent dimension
17
+ use_cuda=True, # GPU acceleration
18
+ nepochs=3000, # Training epochs
19
+ pperiod=100, # Epochs between printing updates
20
+ bsize=1000, # Batch size
21
+ loss_C_mult=1, # Covariance loss (KL div)
22
+ loss_mu_mult=1, # Mean loss (KL div)
23
+ loss_rec_mult=100, # Reconstruction loss
24
+ loss_decor_mult=10, # Latent-demographic decorrelation loss
25
+ loss_pred_mult=0.001, # Classifier/regressor guidance loss
26
+ alpha=100, # Regularization for continuous guidance models
27
+ LR_C=100, # Regularization for categorical guidance models
28
+ lr=1e-4, # Learning rate
29
+ weight_decay=0, # L2 regularization for VAE model
30
+ )
31
+
32
+ def get_params(self, **params):
33
+ return dict(latent_dim=self.latent_dim,
34
+ use_cuda=self.use_cuda,
35
+ nepochs=self.nepochs,
36
+ pperiod=self.pperiod,
37
+ bsize=self.bsize,
38
+ loss_C_mult=self.loss_C_mult,
39
+ loss_mu_mult=self.loss_mu_mult,
40
+ loss_rec_mult=self.loss_rec_mult,
41
+ loss_decor_mult=self.loss_decor_mult,
42
+ loss_pred_mult=self.loss_pred_mult,
43
+ alpha=self.alpha,
44
+ LR_C=self.LR_C,
45
+ lr=self.lr,
46
+ weight_decay=self.weight_decay,
47
+ )
48
+
49
+ def set_params(self, **params):
50
+ dft = DemoVAE.get_default_params()
51
+ for key in dft:
52
+ if key in params:
53
+ setattr(self, key, params[key])
54
+ else:
55
+ setattr(self, key, dft[key])
56
+ return self
57
+
58
+ def fit(self, x, demo, demo_types, **kwargs):
59
+ # Get demo_dim
60
+ demo_dim = 0
61
+ for d,t in zip(demo, demo_types):
62
+ if t == 'continuous':
63
+ demo_dim += 1
64
+ elif t == 'categorical':
65
+ ll = len(set(list(d)))
66
+ if ll == 1:
67
+ print('Only one distinct category for categorical variable')
68
+ raise Exception('Bad categorical')
69
+ demo_dim += ll
70
+ else:
71
+ print(f'demographic type "{t}" not "continuous" or "categorical"')
72
+ raise Exception('Bad demographic type')
73
+ # Save parameters
74
+ self.input_dim = x.shape[1]
75
+ self.demo_dim = demo_dim
76
+ # Create model
77
+ self.vae = VAE(x.shape[1], self.latent_dim, demo_dim, self.use_cuda)
78
+ # Train model
79
+ train_vae(self.vae, x, demo, demo_types,
80
+ self.nepochs, self.pperiod, self.bsize,
81
+ self.loss_C_mult, self.loss_mu_mult, self.loss_rec_mult, self.loss_decor_mult, self.loss_pred_mult,
82
+ self.lr, self.weight_decay, self.alpha, self.LR_C,
83
+ self)
84
+ return self
85
+
86
+ def transform(self, x, demo, demo_types, **kwargs):
87
+ if isinstance(x, int):
88
+ # Generate
89
+ z = self.vae.gen(x)
90
+ else:
91
+ # Get latents for real data
92
+ z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
93
+ demo_t = demo_to_torch(demo, demo_types, self.pred_stats, self.vae.use_cuda)
94
+ y = self.vae.dec(z, demo_t)
95
+ return to_numpy(y)
96
+
97
+ def fit_transform(self, x, demo, demo_types, **kwargs):
98
+ self.fit(x, demo, demo_types)
99
+ return self.transform(x, demo, demo_types)
100
+
101
+ def get_latents(self, x):
102
+ z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
103
+ return to_numpy(z)
104
+
105
+ def save(self, path):
106
+ params = self.get_params()
107
+ dct = dict(pred_stats=self.pred_stats,
108
+ params=params,
109
+ input_dim=self.input_dim,
110
+ demo_dim=self.demo_dim,
111
+ model_state_dict=self.vae.state_dict())
112
+ torch.save(dct, path)
113
+
114
+ def load(self, path):
115
+ dct = torch.load(path)
116
+ self.pred_stats = dct['pred_stats']
117
+ self.set_params(**dct['params'])
118
+ self.vae = VAE(dct['input_dim'],
119
+ dct['params']['latent_dim'],
120
+ dct['demo_dim'],
121
+ dct['params']['use_cuda'])
122
+ self.vae.load_state_dict(dct['model_state_dict'])
123
+
124
+
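
A hedged usage sketch of the `DemoVAE` wrapper added above, on synthetic data; array sizes and hyperparameters are illustrative, and it assumes `demovae` is importable as a package:

```python
import numpy as np
from demovae.sklearn import DemoVAE

# Synthetic FC-like features: 50 subjects x 100 edge features
x = np.random.randn(50, 100).astype('float32')

# One continuous and one categorical demographic, as expected by fit()
demo = [np.random.uniform(20, 80, 50), np.random.choice(['F', 'M'], 50)]
demo_types = ['continuous', 'categorical']

vae = DemoVAE(latent_dim=8, nepochs=50, bsize=25, pperiod=10, use_cuda=False)
vae.fit(x, demo, demo_types)

z = vae.get_latents(x)                      # (50, 8) latent representations
recon = vae.transform(x, demo, demo_types)  # demographic-conditioned reconstruction

vae.save('demovae_checkpoint.pt')           # hypothetical output path
```
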
fc_visualization.py ADDED
@@ -0,0 +1,349 @@
1
+ """
2
+ FC Matrix Visualization Module.
3
+
4
+ This module provides functionality for visualizing Functional Connectivity matrices
5
+ independently from the prediction pipeline.
6
+ """
7
+
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ from pathlib import Path
11
+ import argparse
12
+ import os
13
+ import nibabel as nib
14
+
15
+ try:
16
+ from nilearn import input_data, connectome
17
+ from nilearn.image import load_img
18
+ from nilearn import datasets
19
+ NILEARN_AVAILABLE = True
20
+ except ImportError:
21
+ NILEARN_AVAILABLE = False
22
+ print("Warning: nilearn not available. Direct fMRI processing disabled.")
23
+
24
+ from config import PREPROCESS_CONFIG
25
+
26
+ class FCVisualizer:
27
+ """Class for visualizing FC matrices."""
28
+
29
+ def __init__(self, cmap='RdBu_r', vmin=-1, vmax=1):
30
+ """
31
+ Initialize FCVisualizer with display parameters.
32
+
33
+ Args:
34
+ cmap: Colormap to use for FC matrices
35
+ vmin: Minimum value for color scaling
36
+ vmax: Maximum value for color scaling
37
+ """
38
+ self.cmap = cmap
39
+ self.vmin = vmin
40
+ self.vmax = vmax
41
+
42
+ def plot_single_matrix(self, matrix, title="FC Matrix", ax=None, fig=None):
43
+ """
44
+ Plot a single FC matrix.
45
+
46
+ Args:
47
+ matrix: 2D numpy array containing FC matrix
48
+ title: Title for the plot
49
+ ax: Matplotlib axis to plot on (optional)
50
+ fig: Matplotlib figure (optional)
51
+
52
+ Returns:
53
+ fig, ax: The figure and axis objects
54
+ """
55
+ if ax is None:
56
+ fig, ax = plt.subplots(figsize=(8, 6))
57
+
58
+ im = ax.imshow(matrix, cmap=self.cmap, vmin=self.vmin, vmax=self.vmax)
59
+ ax.set_title(title)
60
+ plt.colorbar(im, ax=ax)
61
+
62
+ return fig, ax
63
+
64
+ def plot_matrix_comparison(self, matrices, titles=None, figsize=None):
65
+ """
66
+ Plot multiple FC matrices for comparison.
67
+
68
+ Args:
69
+ matrices: List of 2D numpy arrays containing FC matrices
70
+ titles: List of titles for each matrix (optional)
71
+ figsize: Custom figure size (optional)
72
+
73
+ Returns:
74
+ fig: The figure object
75
+ """
76
+ n_matrices = len(matrices)
77
+
78
+ if figsize is None:
79
+ figsize = (5*n_matrices, 5)
80
+
81
+ if titles is None:
82
+ titles = [f"FC Matrix {i+1}" for i in range(n_matrices)]
83
+
84
+ fig, axes = plt.subplots(1, n_matrices, figsize=figsize)
85
+
86
+ # Handle single matrix case
87
+ if n_matrices == 1:
88
+ axes = [axes]
89
+
90
+ for i, (matrix, title) in enumerate(zip(matrices, titles)):
91
+ im = axes[i].imshow(matrix, cmap=self.cmap, vmin=self.vmin, vmax=self.vmax)
92
+ axes[i].set_title(title)
93
+ plt.colorbar(im, ax=axes[i])
94
+
95
+ plt.tight_layout()
96
+ return fig
97
+
98
+ def load_and_visualize_npy(self, file_path):
99
+ """
100
+ Load and visualize an FC matrix from a .npy file.
101
+
102
+ Args:
103
+ file_path: Path to the .npy file
104
+
105
+ Returns:
106
+ fig: The figure object containing the visualization
107
+ """
108
+ # Load the matrix
109
+ data = np.load(file_path)
110
+
111
+ # Check if it's an upper triangle or full matrix
112
+ if len(data.shape) == 1:
113
+ # Convert upper triangular to full matrix
114
+ matrix = self._triu_to_matrix(data)
115
+ else:
116
+ matrix = data
117
+
118
+ # Plot the matrix
119
+ filename = os.path.basename(file_path)
120
+ title = f"FC Matrix: {filename}"
121
+ fig, _ = self.plot_single_matrix(matrix, title=title)
122
+ return fig
123
+
124
+ def _triu_to_matrix(self, triu_values, fisher_z=True):
125
+ """
126
+ Convert upper triangular values to a full FC matrix.
127
+
128
+ Args:
129
+ triu_values: 1D array of upper triangular values
130
+ fisher_z: Whether values are Fisher z-transformed
131
+
132
+ Returns:
133
+ full_matrix: 2D symmetric matrix
134
+ """
135
+ # Calculate matrix size from triu length
136
+ n = int(np.sqrt(2 * len(triu_values) + 0.25) + 0.5)
137
+
138
+ # Initialize empty matrix
139
+ matrix = np.zeros((n, n))
140
+
141
+ # Get indices for upper triangle
142
+ triu_indices = np.triu_indices_from(matrix, k=1)
143
+
144
+ # If Fisher z-transformed, convert back
145
+ if fisher_z:
146
+ values_to_set = np.tanh(triu_values)
147
+ else:
148
+ values_to_set = triu_values
149
+
150
+ # Set upper triangle values
151
+ matrix[triu_indices] = values_to_set
152
+
153
+ # Make symmetric
154
+ matrix = matrix + matrix.T
155
+
156
+ # Set diagonal to 1.0 (perfect correlation)
157
+ np.fill_diagonal(matrix, 1.0)
158
+
159
+ return matrix
160
+
161
+ def process_and_visualize_fmri(self, fmri_file):
162
+ """
163
+ Process an fMRI file and visualize its FC matrix.
164
+
165
+ Args:
166
+ fmri_file: Path to the fMRI .nii or .nii.gz file
167
+
168
+ Returns:
169
+ fig: The figure object containing the visualization,
170
+ or None if processing fails
171
+ """
172
+ if not NILEARN_AVAILABLE:
173
+ print("Error: nilearn is required for fMRI processing")
174
+ return None
175
+
176
+ try:
177
+ # Extract FC matrix (upper triangular values)
178
+ fc_triu = self._process_single_fmri(fmri_file)
179
+
180
+ # Convert to full matrix
181
+ fc_matrix = self._triu_to_matrix(fc_triu)
182
+
183
+ # Plot the matrix
184
+ filename = os.path.basename(fmri_file)
185
+ title = f"FC Matrix: {filename}"
186
+ fig, _ = self.plot_single_matrix(fc_matrix, title=title)
187
+ return fig
188
+
189
+ except Exception as e:
190
+ print(f"Error processing fMRI file: {e}")
191
+ return None
192
+
193
+ def _process_single_fmri(self, fmri_file):
194
+ """
195
+ Process a single fMRI file to FC matrix.
196
+
197
+ Args:
198
+ fmri_file: Path to the fMRI .nii or .nii.gz file
199
+
200
+ Returns:
201
+ fc_triu: 1D array of upper triangular values (Fisher z-transformed)
202
+ """
203
+ print(f"Processing fMRI file: {fmri_file}")
204
+
205
+ # Use Power 264 atlas
206
+ power = datasets.fetch_coords_power_2011()
207
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
208
+
209
+ # Create masker
210
+ masker = input_data.NiftiSpheresMasker(
211
+ coords,
212
+ radius=PREPROCESS_CONFIG['radius'],
213
+ standardize=True,
214
+ memory='nilearn_cache',
215
+ memory_level=1,
216
+ verbose=0,
217
+ detrend=True,
218
+ low_pass=PREPROCESS_CONFIG['low_pass'],
219
+ high_pass=PREPROCESS_CONFIG['high_pass'],
220
+ t_r=PREPROCESS_CONFIG['t_r']
221
+ )
222
+
223
+ # Load and process fMRI
224
+ print(f"Loading NIfTI file...")
225
+ fmri_img = load_img(fmri_file)
226
+ print(f"NIfTI file loaded, shape: {fmri_img.shape}")
227
+
228
+ # Transform to time series
229
+ print(f"Extracting time series...")
230
+ time_series = masker.fit_transform(fmri_img)
231
+ print(f"Time series extracted, shape: {time_series.shape}")
232
+
233
+ # Compute FC matrix
234
+ print(f"Computing FC matrix...")
235
+ correlation_measure = connectome.ConnectivityMeasure(
236
+ kind='correlation',
237
+ vectorize=False,
238
+ discard_diagonal=False
239
+ )
240
+
241
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
242
+ print(f"FC matrix computed, shape: {fc_matrix.shape}")
243
+
244
+ # Get upper triangular part
245
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
246
+ fc_triu = fc_matrix[triu_indices]
247
+
248
+ # Fisher z-transform
249
+ fc_triu = np.arctanh(np.clip(fc_triu, -0.99, 0.99)) # Clip to avoid infinite values
250
+
251
+ print(f"Processing complete. FC features shape: {fc_triu.shape}")
252
+ return fc_triu
253
+
254
+
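The clipping before np.arctanh above matters because arctanh diverges at correlations of exactly ±1. A tiny illustration with arbitrary values:

import numpy as np

r = np.array([-1.0, -0.5, 0.0, 0.5, 1.0])        # raw correlations
z = np.arctanh(np.clip(r, -0.99, 0.99))           # finite everywhere after clipping
print(z)                                          # arctanh(0.99) ~= 2.647; arctanh(1.0) would be inf
print(np.tanh(z))                                 # maps back to the clipped correlations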
255
+ def create_synthetic_fc_matrix(seed=None):
256
+ """
257
+ Create a synthetic FC matrix for demonstration purposes.
258
+
259
+ Args:
260
+ seed: Random seed for reproducibility
261
+
262
+ Returns:
263
+ matrix: 2D symmetric matrix representing FC
264
+ """
265
+ if seed is not None:
266
+ np.random.seed(seed)
267
+
268
+ # Number of ROIs (Power atlas has 264)
269
+ n_rois = 264
270
+
271
+ # Create random correlation matrix
272
+ # Method: generate random normal values, create outer product, normalize
273
+ random_vectors = np.random.randn(n_rois, 50) # 50 random features
274
+ matrix = np.corrcoef(random_vectors)
275
+
276
+ # np.corrcoef already returns values in [-1, 1]; just set the diagonal explicitly to 1.0
277
+ np.fill_diagonal(matrix, 1.0)
278
+
279
+ return matrix
280
+
281
+
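Because the synthetic matrix comes from np.corrcoef of random feature vectors, it is automatically a valid correlation matrix: unit diagonal, values in [-1, 1], and positive semi-definite up to numerical noise. A quick check of those properties (a sketch, independent of the function above):

import numpy as np

np.random.seed(0)
m = np.corrcoef(np.random.randn(264, 50))

assert m.shape == (264, 264)
assert np.allclose(np.diag(m), 1.0)
assert np.all(np.abs(m) <= 1.0 + 1e-12)
assert np.min(np.linalg.eigvalsh(m)) > -1e-8   # PSD up to numerical noise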
282
+ def main():
283
+ """Command-line interface for FC matrix visualization."""
284
+ parser = argparse.ArgumentParser(description='Visualize FC matrices')
285
+ parser.add_argument('--input', type=str, help='Input file (fMRI .nii/.nii.gz or .npy FC matrix)')
286
+ parser.add_argument('--output', type=str, help='Output image file (PNG/JPG/PDF)')
287
+ parser.add_argument('--cmap', type=str, default='RdBu_r', help='Colormap (default: RdBu_r)')
288
+ parser.add_argument('--vmin', type=float, default=-1, help='Minimum value for colormap')
289
+ parser.add_argument('--vmax', type=float, default=1, help='Maximum value for colormap')
290
+ parser.add_argument('--synthetic', action='store_true', help='Generate a synthetic FC matrix')
291
+ parser.add_argument('--seed', type=int, default=42, help='Random seed for synthetic data')
292
+
293
+ args = parser.parse_args()
294
+
295
+ # Create visualizer
296
+ visualizer = FCVisualizer(cmap=args.cmap, vmin=args.vmin, vmax=args.vmax)
297
+
298
+ # Determine figure to create
299
+ fig = None
300
+
301
+ if args.synthetic:
302
+ # Create synthetic FC matrix
303
+ matrix = create_synthetic_fc_matrix(seed=args.seed)
304
+ fig, _ = visualizer.plot_single_matrix(matrix, title="Synthetic FC Matrix")
305
+
306
+ elif args.input:
307
+ input_path = Path(args.input)
308
+
309
+ if not input_path.exists():
310
+ print(f"Error: Input file not found: {args.input}")
311
+ return
312
+
313
+ # Process based on file type
314
+ if input_path.suffix == '.npy':
315
+ # It's a numpy file with FC matrix
316
+ fig = visualizer.load_and_visualize_npy(input_path)
317
+
318
+ elif input_path.name.endswith('.nii') or input_path.name.endswith('.nii.gz'):
319
+ # It's an fMRI file
320
+ if not NILEARN_AVAILABLE:
321
+ print("Error: nilearn is required for processing fMRI files")
322
+ return
323
+ fig = visualizer.process_and_visualize_fmri(input_path)
324
+
325
+ else:
326
+ print(f"Error: Unsupported file format: {input_path.suffix}")
327
+ print("Supported formats: .npy (FC matrix), .nii/.nii.gz (fMRI)")
328
+ return
329
+
330
+ else:
331
+ # No input or synthetic flag - show demo
332
+ print("No input file or --synthetic flag provided. Generating a demo matrix.")
333
+ matrix = create_synthetic_fc_matrix(seed=args.seed)
334
+ fig, _ = visualizer.plot_single_matrix(matrix, title="Demo FC Matrix")
335
+
336
+ # Save or display the figure
337
+ if fig is not None:
338
+ if args.output:
339
+ fig.savefig(args.output, dpi=300, bbox_inches='tight')
340
+ print(f"Visualization saved to {args.output}")
341
+ else:
342
+ plt.show()
343
+ print("Visualization displayed. Close the window to exit.")
344
+ else:
345
+ print("Error: Failed to create visualization")
346
+
347
+
348
+ if __name__ == "__main__":
349
+ main()
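For reference, the --synthetic / --output path of the CLI above can also be driven programmatically. A sketch, assuming fc_visualization.py is importable from the working directory; the output file name is an arbitrary choice:

from fc_visualization import FCVisualizer, create_synthetic_fc_matrix

viz = FCVisualizer(cmap='RdBu_r', vmin=-1, vmax=1)
matrix = create_synthetic_fc_matrix(seed=42)
fig, _ = viz.plot_single_matrix(matrix, title="Synthetic FC Matrix")
fig.savefig("demo_fc.png", dpi=300, bbox_inches='tight')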
huggingface_fc_visualization.py ADDED
@@ -0,0 +1,489 @@
1
+ """
2
+ Script to visualize FC matrices from the HuggingFace dataset, comparing original FC matrices to VAE-generated ones.
3
+ """
4
+
5
+ import os
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from datasets import load_dataset
9
+ from fc_visualization import FCVisualizer
10
+ from pathlib import Path
11
+ import tempfile
12
+ import requests
13
+ from config import DATASET_CONFIG, PREPROCESS_CONFIG, MODEL_CONFIG
14
+ from data_preprocessing import process_single_fmri
15
+ from vae_model import VariationalAutoencoder  # note: shadowed by the simplified VAE class defined below
16
+
17
+ def download_sample_fmri(dataset, temp_dir, max_samples=5):
18
+ """
19
+ Download sample fMRI files from HuggingFace dataset.
20
+
21
+ Args:
22
+ dataset: HuggingFace dataset object
23
+ temp_dir: Directory to save downloaded files
24
+ max_samples: Maximum number of samples to download
25
+
26
+ Returns:
27
+ list of paths to downloaded files, demographic data, and file keys
28
+ """
29
+ # Get first few samples to search for NIfTI files
30
+ nifti_keys = []
31
+
32
+ # Look through dataset features to find NIfTI files
33
+ for i, sample in enumerate(dataset):
34
+ if i >= 5: # Check first 5 samples
35
+ break
36
+
37
+ for key, value in sample.items():
38
+ if isinstance(value, str) and (value.endswith('.nii') or value.endswith('.nii.gz')):
39
+ if key not in nifti_keys:
40
+ nifti_keys.append(key)
41
+
42
+ print(f"Found {len(nifti_keys)} NIfTI file types in the dataset: {nifti_keys}")
43
+
44
+ if not nifti_keys:
45
+ print("No NIfTI files found in the dataset")
46
+ return [], [], []
47
+
48
+ # Collect nifti files and demographics
49
+ nifti_files = []
50
+ demo_data = []
51
+
52
+ # Process a limited number of samples
53
+ num_samples = min(max_samples, len(dataset))
54
+
55
+ for sample_idx in range(num_samples):
56
+ sample = dataset[sample_idx]
57
+
58
+ for key in nifti_keys:
59
+ try:
60
+ file_url = sample[key]
61
+ if not file_url or not isinstance(file_url, str):
62
+ continue
63
+
64
+ print(f"Processing sample {sample_idx+1}, file: {key}")
65
+
66
+ # Download and save the file
67
+ local_file = os.path.join(temp_dir, f"sample_{sample_idx}_{key}.nii.gz")
68
+ print(f"Downloading {file_url} to {local_file}")
69
+
70
+ response = requests.get(file_url)
71
+ with open(local_file, 'wb') as f:
72
+ f.write(response.content)
73
+
74
+ nifti_files.append(local_file)
75
+
76
+ # Extract demo data if available (or use placeholders)
77
+ age = sample.get('age', 65.0)
78
+ sex = sample.get('sex', 'M')
79
+ mpo = sample.get('months_post_onset', 12.0)
80
+ wab = sample.get('wab_aq', 50.0)
81
+
82
+ demo_sample = [age, sex, mpo, wab]
83
+ demo_data.append(demo_sample)
84
+
85
+ except Exception as e:
86
+ print(f"Error processing sample {sample_idx}, {key}: {e}")
87
+
88
+ return nifti_files, demo_data, nifti_keys
89
+
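The downloader above writes response.content in one pass and does not check the HTTP status. For large NIfTI files, a streamed download with raise_for_status() is slightly more robust; a hedged sketch (the helper name, timeout, and chunk size are arbitrary choices, not part of the script):

import requests

def download_file(url, local_path, chunk_bytes=4 * 1024 * 1024):
    """Stream a remote file to disk, failing loudly on HTTP errors."""
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_bytes):
                f.write(chunk)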
90
+ class VariationalAutoencoder:
91
+ """
92
+ Simplified VAE implementation for the visualization script.
93
+ """
94
+ def __init__(self, n_features, latent_dim, demo_data, demo_types, **kwargs):
95
+ """
96
+ Initialize the VAE.
97
+
98
+ Args:
99
+ n_features: Number of input features
100
+ latent_dim: Dimension of latent space
101
+ demo_data: Demographic data
102
+ demo_types: Types of demographic variables
103
+ **kwargs: Additional parameters
104
+ """
105
+ import torch
106
+ import torch.nn as nn
107
+
108
+ self.n_features = n_features
109
+ self.latent_dim = latent_dim
110
+ self.demo_dim = self._calculate_demo_dim(demo_data, demo_types)
111
+ self.nepochs = kwargs.get('nepochs', 100)
112
+ self.batch_size = kwargs.get('bsize', 8)
113
+ self.learning_rate = kwargs.get('lr', 1e-3)
114
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
115
+
116
+ # Build VAE model
117
+ self.encoder = nn.Sequential(
118
+ nn.Linear(n_features, 512),
119
+ nn.ReLU(),
120
+ nn.BatchNorm1d(512),
121
+ nn.Linear(512, 256),
122
+ nn.ReLU(),
123
+ nn.BatchNorm1d(256),
124
+ nn.Linear(256, latent_dim * 2) # mu and logvar
125
+ ).to(self.device)
126
+
127
+ self.decoder = nn.Sequential(
128
+ nn.Linear(latent_dim + self.demo_dim, 256),
129
+ nn.ReLU(),
130
+ nn.BatchNorm1d(256),
131
+ nn.Linear(256, 512),
132
+ nn.ReLU(),
133
+ nn.BatchNorm1d(512),
134
+ nn.Linear(512, n_features)
135
+ ).to(self.device)
136
+
137
+ self.optimizer = torch.optim.Adam(
138
+ list(self.encoder.parameters()) + list(self.decoder.parameters()),
139
+ lr=self.learning_rate
140
+ )
141
+
142
+ self.demo_stats = None # Will be set during training
143
+
144
+ def _calculate_demo_dim(self, demo_data, demo_types):
145
+ """Calculate dimension of demographic data after one-hot encoding"""
146
+ demo_dim = 0
147
+ for d, t in zip(demo_data, demo_types):
148
+ if t == 'continuous':
149
+ demo_dim += 1
150
+ elif t == 'categorical':
151
+ if isinstance(d[0], str):
152
+ # Get unique categories
153
+ unique_values = list(set(d))
154
+ demo_dim += len(unique_values)
155
+ else:
156
+ demo_dim += len(set(d))
157
+ return demo_dim
158
+
159
+ def _encode(self, x):
160
+ """Encode input data to latent space"""
161
+ import torch
162
+
163
+ x_tensor = torch.tensor(x, dtype=torch.float32).to(self.device)
164
+ h = self.encoder(x_tensor)
165
+ mu, logvar = h[:, :self.latent_dim], h[:, self.latent_dim:]
166
+ return mu, logvar
167
+
168
+ def _reparameterize(self, mu, logvar):
169
+ """Reparameterization trick for sampling from latent space"""
170
+ import torch
171
+
172
+ std = torch.exp(0.5 * logvar)
173
+ eps = torch.randn_like(std)
174
+ z = mu + eps * std
175
+ return z
176
+
177
+ def _decode(self, z, demo):
178
+ """Decode latent representation back to input space"""
179
+ import torch
180
+
181
+ # Concatenate latent code with demographic data
182
+ z_concat = torch.cat([z, demo], dim=1)
183
+ return self.decoder(z_concat)
184
+
185
+ def _prepare_demographics(self, demo_data, demo_types):
186
+ """Convert demographics to tensor with one-hot encoding for categorical variables"""
187
+ import torch
188
+ import numpy as np
189
+
190
+ if self.demo_stats is None:
191
+ # First time - compute stats
192
+ self.demo_stats = []
193
+ for d, t in zip(demo_data, demo_types):
194
+ if t == 'continuous':
195
+ # Standardize continuous features
196
+ self.demo_stats.append(('continuous', (np.mean(d), np.std(d))))
197
+ elif t == 'categorical':
198
+ # Record unique values for one-hot encoding
199
+ if isinstance(d[0], str):
200
+ unique_values = sorted(list(set(d)))
201
+ else:
202
+ unique_values = sorted(list(set(d)))
203
+ self.demo_stats.append(('categorical', unique_values))
204
+
205
+ # Process demographics based on saved stats
206
+ demo_tensors = []
207
+ for (d, (dtype, stats)) in zip(demo_data, self.demo_stats):
208
+ if dtype == 'continuous':
209
+ mean, std = stats
210
+ # Standardize
211
+ standardized = (np.array(d) - mean) / (std + 1e-10)
212
+ demo_tensors.append(torch.tensor(standardized, dtype=torch.float32).reshape(-1, 1))
213
+ else: # categorical
214
+ unique_values = stats
215
+ # One-hot encode
216
+ one_hot_vectors = []
217
+ for val in d:
218
+ try:
219
+ idx = unique_values.index(val)
220
+ vec = [0.0] * len(unique_values)
221
+ vec[idx] = 1.0
222
+ one_hot_vectors.append(vec)
223
+ except ValueError:
224
+ # Handle unseen categories - use all zeros
225
+ vec = [0.0] * len(unique_values)
226
+ one_hot_vectors.append(vec)
227
+ demo_tensors.append(torch.tensor(one_hot_vectors, dtype=torch.float32))
228
+
229
+ # Concatenate all demographic features
230
+ return torch.cat(demo_tensors, dim=1).to(self.device)
231
+
232
+ def fit(self, X, demo_data, demo_types):
233
+ """
234
+ Train the VAE model.
235
+
236
+ Args:
237
+ X: Input data (FC matrices)
238
+ demo_data: List of demographic variables
239
+ demo_types: Types of demographic variables
240
+ """
241
+ import torch
242
+ import torch.nn.functional as F
243
+ import numpy as np
244
+ from torch.utils.data import DataLoader, TensorDataset
245
+
246
+ print(f"Training VAE on {len(X)} samples for {self.nepochs} epochs...")
247
+
248
+ # Prepare demographic data
249
+ demo_tensor = self._prepare_demographics(demo_data, demo_types)
250
+
251
+ # Convert input data to tensor
252
+ X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
253
+
254
+ # Create dataset and dataloader
255
+ dataset = TensorDataset(X_tensor, demo_tensor)
256
+ dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
257
+
258
+ # Training loop
259
+ self.train_losses = []
260
+
261
+ for epoch in range(self.nepochs):
262
+ epoch_losses = []
263
+
264
+ for batch_x, batch_demo in dataloader:
265
+ # Forward pass
266
+ mu, logvar = self._encode(batch_x)
267
+ z = self._reparameterize(mu, logvar)
268
+ x_recon = self._decode(z, batch_demo)
269
+
270
+ # Compute loss
271
+ recon_loss = F.mse_loss(x_recon, batch_x)
272
+ kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
273
+ kl_loss = kl_loss / batch_x.size(0) # Normalize by batch size
274
+
275
+ # Total loss
276
+ loss = recon_loss + 0.1 * kl_loss
277
+
278
+ # Backward and optimize
279
+ self.optimizer.zero_grad()
280
+ loss.backward()
281
+ self.optimizer.step()
282
+
283
+ epoch_losses.append(loss.item())
284
+
285
+ # Record average loss for this epoch
286
+ avg_loss = np.mean(epoch_losses)
287
+ self.train_losses.append(avg_loss)
288
+
289
+ # Print progress every 10 epochs
290
+ if (epoch + 1) % 10 == 0:
291
+ print(f"Epoch {epoch+1}/{self.nepochs}, Loss: {avg_loss:.6f}")
292
+
293
+ print("VAE training complete!")
294
+ return self.train_losses
295
+
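A small smoke test of the simplified VAE class defined above, using random features and made-up demographics purely to exercise the shapes (not real FC data; all numbers are placeholders, and PyTorch must be installed):

import numpy as np

n_samples, n_features = 8, 200
X_toy = np.random.randn(n_samples, n_features).astype(np.float32)
demo_toy = [
    [60.0, 62.0, 70.0, 55.0, 65.0, 58.0, 72.0, 61.0],   # age
    ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],            # sex
    [6.0, 12.0, 18.0, 9.0, 24.0, 15.0, 7.0, 11.0],       # months post onset
    [40.0, 55.0, 62.0, 48.0, 70.0, 52.0, 45.0, 66.0],    # WAB-AQ
]
demo_types_toy = ['continuous', 'categorical', 'continuous', 'continuous']

vae = VariationalAutoencoder(n_features=n_features, latent_dim=8,
                             demo_data=demo_toy, demo_types=demo_types_toy,
                             nepochs=5, bsize=4)
vae.fit(X_toy, demo_toy, demo_types_toy)
recon = vae.reconstruct(X_toy, demo_toy, demo_types_toy)
assert recon.shape == X_toy.shape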
296
+ def reconstruct(self, X, demo_data=None, demo_types=None):
297
+ """
298
+ Reconstruct input data.
299
+
300
+ Args:
301
+ X: Input data
302
+ demo_data: Demographic data (required; the decoder is conditioned on demographics)
303
+ demo_types: Types of demographic variables (required)
304
+
305
+ Returns:
306
+ Reconstructed data
307
+ """
308
+ import torch
309
+
310
+ # Set to evaluation mode
311
+ self.encoder.eval()
312
+ self.decoder.eval()
313
+
314
+ with torch.no_grad():
315
+ # Encode to latent space
316
+ mu, _ = self._encode(X)
317
+
318
+ # Demographics must be supplied; the conditional decoder has nothing stored to fall back on
319
+ if demo_data is not None and demo_types is not None:
320
+ demo_tensor = self._prepare_demographics(demo_data, demo_types)
321
+ else:
322
+ # No demographics are saved at training time, so they must be passed in explicitly
323
+ raise ValueError("Demo data and types must be provided for reconstruction")
324
+
325
+ # Decode
326
+ recon = self._decode(mu, demo_tensor)
327
+
328
+ # Convert to numpy
329
+ return recon.cpu().numpy()
330
+
331
+ def generate(self, n_samples, demo_data, demo_types):
332
+ """
333
+ Generate new samples from the latent space.
334
+
335
+ Args:
336
+ n_samples: Number of samples to generate
337
+ demo_data: Demographic data
338
+ demo_types: Types of demographic variables
339
+
340
+ Returns:
341
+ Generated samples
342
+ """
343
+ import torch
344
+
345
+ # Set to evaluation mode
346
+ self.decoder.eval()
347
+
348
+ with torch.no_grad():
349
+ # Sample from standard normal
350
+ z = torch.randn(n_samples, self.latent_dim).to(self.device)
351
+
352
+ # Prepare demographic data
353
+ demo_tensor = self._prepare_demographics(demo_data, demo_types)
354
+
355
+ # Check dimensions
356
+ if demo_tensor.shape[0] != n_samples:
357
+ # Handle mismatch - repeat the first demographic sample
358
+ if demo_tensor.shape[0] >= 1:
359
+ demo_tensor = demo_tensor[0].unsqueeze(0).repeat(n_samples, 1)
360
+
361
+ # Generate samples
362
+ generated = self._decode(z, demo_tensor)
363
+
364
+ # Convert to numpy
365
+ return generated.cpu().numpy()
366
+
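Continuing the smoke test sketch above (it reuses vae, demo_toy, and demo_types_toy), generate only needs a sample count plus demographics to condition on; when fewer demographic rows are given than samples requested, the method repeats the first row:

first_subject_demo = [[col[0]] for col in demo_toy]
synthetic = vae.generate(5, first_subject_demo, demo_types_toy)
print(synthetic.shape)   # (5, 200)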
367
+ def generate_comparison():
368
+ """Download, process and visualize FC matrices from the HuggingFace dataset,
369
+ comparing original to VAE-generated matrices."""
370
+ print("Loading dataset from HuggingFace...")
371
+
372
+ # Load the HuggingFace dataset using config
373
+ dataset_name = DATASET_CONFIG.get('name', 'SreekarB/OSFData')
374
+ dataset_split = DATASET_CONFIG.get('split', 'train')
375
+
376
+ dataset = load_dataset(dataset_name, split=dataset_split)
377
+ print(f"Dataset loaded: {dataset}")
378
+
379
+ # Create temporary directory for downloaded NIfTI files
380
+ temp_dir = tempfile.mkdtemp(prefix="hf_nifti_")
381
+ print(f"Created temp directory for NIfTI files: {temp_dir}")
382
+
383
+ # Download and process fMRI files
384
+ nifti_files, demo_samples, nifti_keys = download_sample_fmri(dataset, temp_dir, max_samples=5)
385
+
386
+ if not nifti_files:
387
+ print("No valid fMRI files were found")
388
+ return
389
+
390
+ # Process all fMRI files to FC matrices
391
+ fc_matrices = []
392
+ demo_data = []
393
+
394
+ for file_idx, (file_path, demo_sample) in enumerate(zip(nifti_files, demo_samples)):
395
+ try:
396
+ print(f"Processing file {file_idx+1}/{len(nifti_files)}: {file_path}")
397
+ fc_triu = process_single_fmri(file_path)
398
+ fc_matrices.append(fc_triu)
399
+ demo_data.append(demo_sample)
400
+ except Exception as e:
401
+ print(f"Error processing file {file_path}: {e}")
402
+
403
+ if not fc_matrices:
404
+ print("No valid FC matrices were generated")
405
+ return
406
+
407
+ # Convert to numpy arrays
408
+ X = np.array(fc_matrices)
409
+
410
+ # Normalize the data
411
+ X = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + 1e-10)  # epsilon guards zero-variance features
412
+
413
+ # Prepare demographic data
414
+ # Transpose to get [feature_type][sample] format
415
+ demo_data = [list(col) for col in zip(*demo_data)]  # preserves mixed types; np.array would coerce everything to strings
416
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
417
+
418
+ # Train a VAE on the FC matrices
419
+ print("Training VAE on the FC matrices...")
420
+ n_features = X.shape[1]
421
+
422
+ # Configure a smaller/faster VAE for demonstration
423
+ vae = VariationalAutoencoder(
424
+ n_features=n_features,
425
+ latent_dim=MODEL_CONFIG.get('latent_dim', 32),
426
+ demo_data=demo_data,
427
+ demo_types=demo_types,
428
+ nepochs=100, # Reduced for demo
429
+ bsize=2,
430
+ lr=1e-3
431
+ )
432
+
433
+ # Train the VAE
434
+ vae.fit(X, demo_data, demo_types)
435
+
436
+ # Generate reconstructed FC matrices
437
+ print("Generating reconstructed FC matrices...")
438
+ reconstructed = vae.reconstruct(X, demo_data, demo_types)
439
+
440
+ # Generate a synthetic FC matrix
441
+ print("Generating a synthetic FC matrix...")
442
+ # For generating a new sample, we'll use demographics from first patient
443
+ first_demo_data = [[d[0]] for d in demo_data]
444
+ generated = vae.generate(1, first_demo_data, demo_types)
445
+
446
+ # Visualize original, reconstructed, and generated FC matrices
447
+ visualizer = FCVisualizer()
448
+
449
+ # Process each sample to generate comparisons
450
+ for i in range(min(3, len(X))):
451
+ # Convert upper triangular vectors to full matrices for visualization
452
+ original_matrix = visualizer._triu_to_matrix(X[i])
453
+ recon_matrix = visualizer._triu_to_matrix(reconstructed[i])
454
+
455
+ # Use the generate method for a single synthetic sample
456
+ if i == 0:
457
+ gen_matrix = visualizer._triu_to_matrix(generated[0])
458
+
459
+ # Visualize all three - original, reconstructed, generated
460
+ fig = visualizer.plot_matrix_comparison(
461
+ [original_matrix, recon_matrix, gen_matrix],
462
+ titles=["Original FC", "Reconstructed FC", "Generated FC"]
463
+ )
464
+
465
+ output_file = f"fc_comparison_with_generated.png"
466
+ fig.savefig(output_file, dpi=300, bbox_inches='tight')
467
+ print(f"Saved full comparison to {output_file}")
468
+
469
+ # Visualize original vs reconstructed for each sample
470
+ fig = visualizer.plot_matrix_comparison(
471
+ [original_matrix, recon_matrix],
472
+ titles=[f"Original FC (Sample {i+1})", f"Reconstructed FC (Sample {i+1})"]
473
+ )
474
+
475
+ output_file = f"sample_{i}_original_vs_reconstructed.png"
476
+ fig.savefig(output_file, dpi=300, bbox_inches='tight')
477
+ print(f"Saved comparison to {output_file}")
478
+
479
+ # Save the matrices
480
+ np.save(f"sample_{i}_original_fc.npy", original_matrix)
481
+ np.save(f"sample_{i}_reconstructed_fc.npy", recon_matrix)
482
+
483
+ # Save the generated matrix
484
+ np.save("generated_fc.npy", gen_matrix)
485
+
486
+ print("Processing complete")
487
+
488
+ if __name__ == "__main__":
489
+ generate_comparison()
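Once generate_comparison() has run, the saved .npy matrices can be reloaded and re-plotted with the same visualizer. A short sketch; the files only exist after a successful run:

import numpy as np
from fc_visualization import FCVisualizer

matrix = np.load("sample_0_original_fc.npy")
fig, _ = FCVisualizer().plot_single_matrix(matrix, title="Sample 0 original FC")
fig.savefig("sample_0_original_fc.png", dpi=300, bbox_inches='tight')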
main.py CHANGED
@@ -99,7 +99,6 @@ def run_analysis(data_dir="data",
99
  # Initialize and train treatment predictor
100
  print("Training treatment predictor...")
101
  predictor = AphasiaTreatmentPredictor(
102
- prediction_type=PREDICTION_CONFIG.get('prediction_type', 'regression'),
103
  n_estimators=PREDICTION_CONFIG.get('n_estimators', 100),
104
  max_depth=PREDICTION_CONFIG.get('max_depth', None)
105
  )
@@ -129,18 +128,11 @@ def run_analysis(data_dir="data",
129
 
130
  # For regression, get R2 metrics, otherwise use accuracy
131
  try:
132
- if predictor.prediction_type == "regression":
133
- cv_mean = mean_metrics.get("r2", 0.0)
134
- if fold_metrics and "r2" in fold_metrics[0]:
135
- cv_std = np.std([fold.get("r2", 0.0) for fold in fold_metrics])
136
- else:
137
- cv_std = 0.0
138
  else:
139
- cv_mean = mean_metrics.get("accuracy", 0.0)
140
- if fold_metrics and "accuracy" in fold_metrics[0]:
141
- cv_std = np.std([fold.get("accuracy", 0.0) for fold in fold_metrics])
142
- else:
143
- cv_std = 0.0
144
  except Exception as e:
145
  print(f"Error calculating CV metrics: {e}")
146
  cv_mean, cv_std = 0.0, 0.0
 
99
  # Initialize and train treatment predictor
100
  print("Training treatment predictor...")
101
  predictor = AphasiaTreatmentPredictor(
 
102
  n_estimators=PREDICTION_CONFIG.get('n_estimators', 100),
103
  max_depth=PREDICTION_CONFIG.get('max_depth', None)
104
  )
 
128
 
129
  # For regression, get R2 metrics, otherwise use accuracy
130
  try:
131
+ cv_mean = mean_metrics.get("r2", 0.0)
132
+ if fold_metrics and "r2" in fold_metrics[0]:
133
+ cv_std = np.std([fold.get("r2", 0.0) for fold in fold_metrics])
 
 
 
134
  else:
135
+ cv_std = 0.0
 
 
 
 
136
  except Exception as e:
137
  print(f"Error calculating CV metrics: {e}")
138
  cv_mean, cv_std = 0.0, 0.0
rcf_prediction.py CHANGED
@@ -1,8 +1,8 @@
1
  import numpy as np
2
- from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
3
  from sklearn.model_selection import cross_val_score, KFold
4
  import pandas as pd
5
- from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
6
  import matplotlib.pyplot as plt
7
  import os
8
  import joblib
@@ -12,35 +12,27 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
12
  logger = logging.getLogger(__name__)
13
 
14
  class AphasiaTreatmentPredictor:
15
- def __init__(self, prediction_type="regression", n_estimators=100, max_depth=None, random_state=42):
16
  """
17
- Initialize the Treatment Predictor with Random Forest
18
 
19
  Args:
20
- prediction_type (str): "classification" or "regression" depending on outcome variable type
21
  n_estimators (int): Number of trees in the forest
22
  max_depth (int): Maximum depth of trees (None for unlimited)
23
  random_state (int): Random seed for reproducibility
24
  """
25
- self.prediction_type = prediction_type
26
  self.n_estimators = n_estimators
27
  self.max_depth = max_depth
28
  self.random_state = random_state
29
  self.feature_importance = None
30
  self.feature_names = None
31
 
32
- if prediction_type == "classification":
33
- self.model = RandomForestClassifier(
34
- n_estimators=n_estimators,
35
- max_depth=max_depth,
36
- random_state=random_state
37
- )
38
- else: # regression
39
- self.model = RandomForestRegressor(
40
- n_estimators=n_estimators,
41
- max_depth=max_depth,
42
- random_state=random_state
43
- )
44
 
45
  def prepare_features(self, latents, demographics):
46
  """
@@ -115,34 +107,11 @@ class AphasiaTreatmentPredictor:
115
  predictions = self.model.predict(X)
116
 
117
  # Get prediction intervals using tree variance
118
- if self.prediction_type == "regression":
119
- tree_predictions = np.array([tree.predict(X)
120
- for tree in self.model.estimators_])
121
- prediction_std = np.std(tree_predictions, axis=0)
122
- else: # classification
123
- # For classification, use probability as a measure of confidence
124
- proba = self.model.predict_proba(X)
125
- # Use max probability as confidence measure
126
- prediction_std = 1 - np.max(proba, axis=1)
127
 
128
  return predictions, prediction_std
129
-
130
- def predict_proba(self, latents, demographics):
131
- """
132
- Get probability estimates for classification
133
-
134
- Args:
135
- latents (np.ndarray): Latent representations from VAE
136
- demographics (dict or pd.DataFrame): Demographic information
137
-
138
- Returns:
139
- np.ndarray: Probability estimates for each class
140
- """
141
- if self.prediction_type != "classification":
142
- raise ValueError("Probability prediction only available for classification")
143
-
144
- X, _ = self.prepare_features(latents, demographics)
145
- return self.model.predict_proba(X)
146
 
147
  def cross_validate(self, latents, demographics, treatment_outcomes, n_splits=5):
148
  """
@@ -174,18 +143,11 @@ class AphasiaTreatmentPredictor:
174
  y_train, y_test = treatment_outcomes[train_idx], treatment_outcomes[test_idx]
175
 
176
  # Clone the model for this fold
177
- if self.prediction_type == "classification":
178
- fold_model = RandomForestClassifier(
179
- n_estimators=self.n_estimators,
180
- max_depth=self.max_depth,
181
- random_state=self.random_state
182
- )
183
- else:
184
- fold_model = RandomForestRegressor(
185
- n_estimators=self.n_estimators,
186
- max_depth=self.max_depth,
187
- random_state=self.random_state
188
- )
189
 
190
  # Train the model
191
  fold_model.fit(X_train, y_train)
@@ -197,38 +159,19 @@ class AphasiaTreatmentPredictor:
197
  predictions[test_idx] = pred
198
 
199
  # Calculate metrics
200
- if self.prediction_type == "regression":
201
- rmse = np.sqrt(mean_squared_error(y_test, pred))
202
- r2 = r2_score(y_test, pred)
203
- metrics = {
204
- "r2": r2,
205
- "rmse": rmse,
206
- "mse": rmse**2
207
- }
208
-
209
- # Get prediction intervals using tree variance
210
- tree_predictions = np.array([tree.predict(X_test)
211
- for tree in fold_model.estimators_])
212
- pred_std = np.std(tree_predictions, axis=0)
213
- prediction_stds[test_idx] = pred_std
214
-
215
- else: # classification
216
- acc = accuracy_score(y_test, pred)
217
- prec = precision_score(y_test, pred, average='weighted', zero_division=0)
218
- rec = recall_score(y_test, pred, average='weighted', zero_division=0)
219
- f1 = f1_score(y_test, pred, average='weighted', zero_division=0)
220
- metrics = {
221
- "accuracy": acc,
222
- "precision": prec,
223
- "recall": rec,
224
- "f1": f1
225
- }
226
-
227
- # Use probability as a measure of confidence
228
- proba = fold_model.predict_proba(X_test)
229
- # Use max probability as confidence measure
230
- pred_std = 1 - np.max(proba, axis=1)
231
- prediction_stds[test_idx] = pred_std
232
 
233
  fold_metrics.append(metrics)
234
  logger.info(f"Fold {fold+1} metrics: {metrics}")
@@ -335,7 +278,6 @@ class AphasiaTreatmentPredictor:
335
 
336
  # Create new instance
337
  predictor = cls(
338
- prediction_type=data['prediction_type'],
339
  n_estimators=data['n_estimators'],
340
  max_depth=data['max_depth'],
341
  random_state=data['random_state']
@@ -350,7 +292,7 @@ class AphasiaTreatmentPredictor:
350
  return predictor
351
 
352
 
353
- def train_predictor_from_latents(latents, outcomes, demographics=None, prediction_type="regression", cv=5, **kwargs):
354
  """
355
  Train a treatment outcome predictor from VAE latent representations
356
 
@@ -358,17 +300,16 @@ def train_predictor_from_latents(latents, outcomes, demographics=None, predictio
358
  latents (np.ndarray): Latent representations from VAE
359
  outcomes (np.ndarray): Treatment outcome values
360
  demographics (dict or pd.DataFrame, optional): Demographic information to include as features
361
- prediction_type (str): "classification" or "regression"
362
  cv (int): Number of folds for cross-validation
363
  **kwargs: Additional parameters for the AphasiaTreatmentPredictor
364
 
365
  Returns:
366
  dict: Training results and trained model
367
  """
368
- logger.info(f"Training {prediction_type} model for treatment prediction")
369
 
370
  # Create predictor
371
- predictor = AphasiaTreatmentPredictor(prediction_type=prediction_type, **kwargs)
372
 
373
  # Run cross-validation
374
  cv_results = predictor.cross_validate(latents, demographics, outcomes, n_splits=cv)
 
1
  import numpy as np
2
+ from sklearn.ensemble import RandomForestRegressor
3
  from sklearn.model_selection import cross_val_score, KFold
4
  import pandas as pd
5
+ from sklearn.metrics import mean_squared_error, r2_score
6
  import matplotlib.pyplot as plt
7
  import os
8
  import joblib
 
12
  logger = logging.getLogger(__name__)
13
 
14
  class AphasiaTreatmentPredictor:
15
+ def __init__(self, n_estimators=100, max_depth=None, random_state=42):
16
  """
17
+ Initialize the Treatment Predictor with Random Forest Regressor
18
 
19
  Args:
 
20
  n_estimators (int): Number of trees in the forest
21
  max_depth (int): Maximum depth of trees (None for unlimited)
22
  random_state (int): Random seed for reproducibility
23
  """
24
+ self.prediction_type = "regression"
25
  self.n_estimators = n_estimators
26
  self.max_depth = max_depth
27
  self.random_state = random_state
28
  self.feature_importance = None
29
  self.feature_names = None
30
 
31
+ self.model = RandomForestRegressor(
32
+ n_estimators=n_estimators,
33
+ max_depth=max_depth,
34
+ random_state=random_state
35
+ )
 
 
 
 
 
 
 
36
 
37
  def prepare_features(self, latents, demographics):
38
  """
 
107
  predictions = self.model.predict(X)
108
 
109
  # Get prediction intervals using tree variance
110
+ tree_predictions = np.array([tree.predict(X)
111
+ for tree in self.model.estimators_])
112
+ prediction_std = np.std(tree_predictions, axis=0)
 
 
 
 
 
 
113
 
114
  return predictions, prediction_std
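The per-sample uncertainty above is the spread of the individual trees' predictions, which the ensemble averages into the point prediction. A standalone sketch on synthetic data (shapes and noise level are arbitrary):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 16))
y = 2.0 * X[:, 0] + rng.normal(scale=0.5, size=200)

model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

X_new = rng.normal(size=(5, 16))
per_tree = np.array([tree.predict(X_new) for tree in model.estimators_])
assert np.allclose(per_tree.mean(axis=0), model.predict(X_new))   # ensemble prediction = mean over trees
pred_std = per_tree.std(axis=0)                                    # disagreement across trees as uncertainty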
115
 
116
  def cross_validate(self, latents, demographics, treatment_outcomes, n_splits=5):
117
  """
 
143
  y_train, y_test = treatment_outcomes[train_idx], treatment_outcomes[test_idx]
144
 
145
  # Clone the model for this fold
146
+ fold_model = RandomForestRegressor(
147
+ n_estimators=self.n_estimators,
148
+ max_depth=self.max_depth,
149
+ random_state=self.random_state
150
+ )
 
 
 
 
 
 
 
151
 
152
  # Train the model
153
  fold_model.fit(X_train, y_train)
 
159
  predictions[test_idx] = pred
160
 
161
  # Calculate metrics
162
+ rmse = np.sqrt(mean_squared_error(y_test, pred))
163
+ r2 = r2_score(y_test, pred)
164
+ metrics = {
165
+ "r2": r2,
166
+ "rmse": rmse,
167
+ "mse": rmse**2
168
+ }
169
+
170
+ # Get prediction intervals using tree variance
171
+ tree_predictions = np.array([tree.predict(X_test)
172
+ for tree in fold_model.estimators_])
173
+ pred_std = np.std(tree_predictions, axis=0)
174
+ prediction_stds[test_idx] = pred_std

175
 
176
  fold_metrics.append(metrics)
177
  logger.info(f"Fold {fold+1} metrics: {metrics}")
 
278
 
279
  # Create new instance
280
  predictor = cls(
 
281
  n_estimators=data['n_estimators'],
282
  max_depth=data['max_depth'],
283
  random_state=data['random_state']
 
292
  return predictor
293
 
294
 
295
+ def train_predictor_from_latents(latents, outcomes, demographics=None, cv=5, **kwargs):
296
  """
297
  Train a treatment outcome predictor from VAE latent representations
298
 
 
300
  latents (np.ndarray): Latent representations from VAE
301
  outcomes (np.ndarray): Treatment outcome values
302
  demographics (dict or pd.DataFrame, optional): Demographic information to include as features
 
303
  cv (int): Number of folds for cross-validation
304
  **kwargs: Additional parameters for the AphasiaTreatmentPredictor
305
 
306
  Returns:
307
  dict: Training results and trained model
308
  """
309
+ logger.info(f"Training regression model for treatment prediction")
310
 
311
  # Create predictor
312
+ predictor = AphasiaTreatmentPredictor(**kwargs)
313
 
314
  # Run cross-validation
315
  cv_results = predictor.cross_validate(latents, demographics, outcomes, n_splits=cv)
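A hypothetical call to the regression-only helper above, with synthetic latents and outcomes. This assumes prepare_features accepts a DataFrame, as its docstring states; the column names and the outcome definition are placeholders:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
latents = rng.normal(size=(40, 32))                                      # VAE latent vectors
outcomes = 50.0 + 5.0 * latents[:, 0] + rng.normal(scale=2.0, size=40)   # e.g. post-treatment WAB-AQ
demographics = pd.DataFrame({
    'age': rng.integers(40, 80, size=40),
    'months_post_onset': rng.integers(3, 60, size=40),
})

results = train_predictor_from_latents(latents, outcomes,
                                        demographics=demographics,
                                        cv=5, n_estimators=100)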
src/.DS_Store CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
 
vae_model.py CHANGED
@@ -26,17 +26,33 @@ class VAE(nn.Module):
26
  self.bn2 = to_cuda(nn.BatchNorm1d(1000), use_cuda)
27
 
28
  def enc(self, x):
29
- x = self.bn1(F.relu(self.enc1(x)))
30
- z = self.enc2(x)
 
 
 
 
 
 
 
31
  return z
32
 
33
  def gen(self, n):
34
  return to_cuda(torch.randn(n, self.latent_dim).float(), self.use_cuda)
35
 
36
  def dec(self, z, demo):
37
- z = to_cuda(torch.cat([z, demo], dim=1), self.use_cuda)
38
- x = self.bn2(F.relu(self.dec1(z)))
39
- x = self.dec2(x)
 
 
 
 
 
 
 
 
 
40
  return x
41
 
42
  class DemoVAE(BaseEstimator):
@@ -106,16 +122,77 @@ class DemoVAE(BaseEstimator):
106
  return train_losses, val_losses
107
 
108
  def transform(self, x, demo, demo_types):
109
- if isinstance(x, int):
110
- z = self.vae.gen(x)
111
- else:
112
- z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
113
- demo_t = demo_to_torch(demo, demo_types, self.pred_stats, self.vae.use_cuda)
114
- y = self.vae.dec(z, demo_t)

  return to_numpy(y)
116
 
117
  def get_latents(self, x):
118
- z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))

119
  return to_numpy(z)
120
 
121
  def save(self, path):
 
26
  self.bn2 = to_cuda(nn.BatchNorm1d(1000), use_cuda)
27
 
28
  def enc(self, x):
29
+ # First layer with activation
30
+ h = self.enc1(x)
31
+ h = F.relu(h)
32
+
33
+ # Apply batch norm - handle training vs eval mode automatically
34
+ h = self.bn1(h)
35
+
36
+ # Output layer
37
+ z = self.enc2(h)
38
  return z
39
 
40
  def gen(self, n):
41
  return to_cuda(torch.randn(n, self.latent_dim).float(), self.use_cuda)
42
 
43
  def dec(self, z, demo):
44
+ # Concatenate latent code with demographic data
45
+ z_combined = to_cuda(torch.cat([z, demo], dim=1), self.use_cuda)
46
+
47
+ # First decoder layer with activation
48
+ h = self.dec1(z_combined)
49
+ h = F.relu(h)
50
+
51
+ # Apply batch norm - handle training vs eval mode automatically
52
+ h = self.bn2(h)
53
+
54
+ # Output layer
55
+ x = self.dec2(h)
56
  return x
57
 
58
  class DemoVAE(BaseEstimator):
 
122
  return train_losses, val_losses
123
 
124
  def transform(self, x, demo, demo_types):
125
+ # Set model to evaluation mode to handle batch norm with batch size of 1
126
+ self.vae.eval()
127
+
128
+ # Use torch.no_grad to disable gradient calculation during inference
129
+ with torch.no_grad():
130
+ if isinstance(x, int):
131
+ z = self.vae.gen(x)
132
+ else:
133
+ z = self.vae.enc(to_cuda(to_torch(x), self.vae.use_cuda))
134
+
135
+ demo_t = demo_to_torch(demo, demo_types, self.pred_stats, self.vae.use_cuda)
136
+
137
+ # Handle batch size of 1 for batch normalization
138
+ if z.size(0) == 1:
139
+ # If batch size is 1, we need to be careful with batch norm
140
+ # Clone and repeat the input to create a fake batch if needed
141
+ if hasattr(self.vae, 'bn1') or hasattr(self.vae, 'bn2'):
142
+ try:
143
+ # Try normal decoding first
144
+ y = self.vae.dec(z, demo_t)
145
+ except Exception as e:
146
+ # If it fails, use a workaround for batch norm
147
+ print(f"Using batch norm workaround for inference: {e}")
148
+ # Create a batch by repeating the input
149
+ z_batch = z.repeat(2, 1)
150
+ demo_t_batch = demo_t.repeat(2, 1)
151
+ # Get the output and use only the first element
152
+ y_batch = self.vae.dec(z_batch, demo_t_batch)
153
+ y = y_batch[0:1]
154
+ else:
155
+ # No batch norm, proceed normally
156
+ y = self.vae.dec(z, demo_t)
157
+ else:
158
+ # Normal batch size, proceed as usual
159
+ y = self.vae.dec(z, demo_t)
160
+
161
  return to_numpy(y)
162
 
163
  def get_latents(self, x):
164
+ # Set model to evaluation mode
165
+ self.vae.eval()
166
+
167
+ # Use torch.no_grad for inference
168
+ with torch.no_grad():
169
+ try:
170
+ # Convert to torch tensor and move to CUDA if needed
171
+ x_tensor = to_cuda(to_torch(x), self.vae.use_cuda)
172
+
173
+ # Get latent representation
174
+ z = self.vae.enc(x_tensor)
175
+ except Exception as e:
176
+ print(f"Error in encoder: {e}")
177
+ # Try workaround for batch norm if needed
178
+ if x.shape[0] == 1 and (hasattr(self.vae, 'bn1') or hasattr(self.vae, 'bn2')):
179
+ print("Using batch normalization workaround for single sample")
180
+ # Repeat the input to create a batch of size 2
181
+ if len(x.shape) == 2:
182
+ x_batch = np.repeat(x, 2, axis=0)
183
+ else:
184
+ x_batch = np.array([x[0], x[0]])
185
+
186
+ # Process the batch
187
+ x_tensor = to_cuda(to_torch(x_batch), self.vae.use_cuda)
188
+ z_batch = self.vae.enc(x_tensor)
189
+
190
+ # Extract just the first sample's latent representation
191
+ z = z_batch[0:1]
192
+ else:
193
+ # Re-raise if we can't handle it
194
+ raise
195
+
196
  return to_numpy(z)
197
 
198
  def save(self, path):
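The eval() calls and single-sample workarounds above exist because nn.BatchNorm1d cannot compute batch statistics from one sample in training mode, while evaluation mode falls back to running statistics. A minimal demonstration:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(4)
x = torch.randn(1, 4)

bn.train()
try:
    bn(x)                      # training mode needs per-batch statistics
except ValueError as err:
    print("train() failed:", err)

bn.eval()
print(bn(x))                   # eval mode uses running statistics, so batch size 1 works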