"""
Gradio UI for exoplanet prediction.
"""

import os
import time
import traceback
from collections import Counter

import gradio as gr
import joblib
import pandas as pd
from dotenv import load_dotenv

from mapping import ColumnMapper, load_training_columns
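
# Read TOGETHER_API_KEY (and any other settings) from a local .env file, if present.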
load_dotenv()

TRAINING_CSV_PATH = "cumulative_2025.10.03_08.34.41.csv"
MODEL_PATH = "exoplanet_detector.joblib"
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY", "")
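
# Load the trained classifier and the training-set column schema once at startup.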
model = joblib.load(MODEL_PATH)
training_columns = load_training_columns(TRAINING_CSV_PATH)
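
# ColumnMapper uses Llama to match arbitrary uploaded column names to the training schema.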
mapper = ColumnMapper(api_key=TOGETHER_API_KEY)


def predict_exoplanets(uploaded_file):
    """
    Process an uploaded CSV file and return predictions.

    Args:
        uploaded_file: Path to the uploaded CSV file.

    Returns:
        Tuple (display dataframe, mapping info, statistics, path to the full result CSV).
    """
    start_time = time.time()

    try:
        if uploaded_file is None:
            return None, "Error: Please upload a CSV file", None, None

        # gr.File(type="filepath") delivers the path as a plain string; comment='#'
        # skips the commented header lines NASA Exoplanet Archive exports include.
        df_uploaded = pd.read_csv(uploaded_file, comment='#')

        info_msg = f"Loaded rows: {len(df_uploaded)}\n"
        info_msg += f"Columns in uploaded dataset: {len(df_uploaded.columns)}\n\n"

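        # Map the uploaded columns onto the training schema with the LLM-based mapper.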
        mapping_start = time.time()
        info_msg += "Performing column mapping via Llama...\n\n"

        df_mapped, mapping, mapping_info = mapper.map_dataset(df_uploaded, training_columns)

        mapping_time = time.time() - mapping_start
        info_msg += mapping_info + "\n"
        info_msg += f"Mapping time: {mapping_time:.2f} sec\n\n"

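        # Prefer the feature list stored on the fitted model; fall back to the
        # training columns minus the target columns for models without it.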
        try:
            expected_features = list(model.feature_names_in_)
            info_msg += f"Model expects {len(expected_features)} features\n\n"
        except AttributeError:
            target_cols = ['koi_disposition', 'koi_pdisposition']
            expected_features = [col for col in training_columns if col not in target_cols]
            info_msg += f"Using {len(expected_features)} features from training dataset\n\n"

        info_msg += f"Creating DataFrame with {len(expected_features)} columns...\n"

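        # Build the feature matrix in the exact column order the model expects,
        # filling any columns absent from the upload with zeros.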
        X = pd.DataFrame(index=df_mapped.index, columns=expected_features)

        for col in expected_features:
            if col in df_mapped.columns:
                X[col] = df_mapped[col].values
            else:
                X[col] = 0.0

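        # Coerce every value to numeric; anything unparseable becomes NaN.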
        X = X.apply(pd.to_numeric, errors='coerce')

        available_cols = [col for col in expected_features if col in df_mapped.columns]
        missing_cols = [col for col in expected_features if col not in df_mapped.columns]

        if missing_cols:
            info_msg += f"Warning: {len(missing_cols)} columns missing (filled with zeros)\n"

        info_msg += f"DEBUG: X.shape = {X.shape}, expected: ({len(df_mapped)}, {len(expected_features)})\n"

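        # Impute remaining NaNs with each column's mean (0 for all-NaN columns).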
        X = X.fillna(X.mean().fillna(0))

        info_msg += f"DEBUG: After fillna X.shape = {X.shape}\n"

        info_msg += f"Data processing: {X.shape}\n"
        info_msg += f" Filled: {len(available_cols)} columns, Added zeros: {len(missing_cols)}\n"
        info_msg += "Data prepared for model\n\n"

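        # Run the classifier: hard labels plus per-class probabilities.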
        pred_start = time.time()

        X_values = X.values

        info_msg += f"DEBUG: X_values.shape = {X_values.shape}\n\n"

        predictions = model.predict(X_values)
        predictions_proba = model.predict_proba(X_values)
        pred_time = time.time() - pred_start

        info_msg += f"Predictions completed: {len(predictions)} objects in {pred_time:.2f} sec\n"

        df_result = df_uploaded.copy()

        classes = model.classes_
        info_msg += f" Found classes: {list(classes)}\n\n"

        df_result['prediction'] = predictions

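        # Add one probability column per class, e.g. confidence_false_positive.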
        for i, class_name in enumerate(classes):
            df_result[f'confidence_{class_name.replace(" ", "_").lower()}'] = predictions_proba[:, i]

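        # Duplicate each mapped source column under its training name for reference.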
        if mapping:
            for src_col, tgt_col in mapping.items():
                if src_col in df_uploaded.columns and tgt_col in df_mapped.columns:
                    df_result[f'mapped_as_{tgt_col}'] = df_uploaded[src_col]

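        # Select a compact subset of columns for the on-screen table; the full
        # result is still written to the downloadable CSV below.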
        display_columns = ['prediction']
        for class_name in classes:
            col_name = f'confidence_{class_name.replace(" ", "_").lower()}'
            if col_name in df_result.columns:
                display_columns.append(col_name)

        mapped_cols = [col for col in df_result.columns if col.startswith('mapped_as_')]
        display_columns.extend(mapped_cols[:10])

        if not mapped_cols and len(df_uploaded.columns) > 0:
            original_cols = [col for col in df_uploaded.columns[:5] if col in df_result.columns]
            display_columns.extend(original_cols)

        df_display = df_result[display_columns].copy()

        total_time = time.time() - start_time

        pred_counts = Counter(predictions)

        stats_lines = ["**Prediction Statistics:**\n"]
        stats_lines.append(f"* Total objects: {len(predictions)}\n")

        for class_name in classes:
            count = pred_counts.get(class_name, 0)
            pct = count / len(predictions) * 100 if len(predictions) > 0 else 0
            stats_lines.append(f"* {class_name}: {count} ({pct:.1f}%)\n")

        stats_lines.append(f"\n**Processing time:** {total_time:.2f} seconds\n")
        stats_lines.append("\n**Columns in result:**\n")
        stats_lines.append("* All original columns from uploaded file (with original names)\n")
        stats_lines.append(f"* `prediction`: Predicted class ({', '.join(classes)})\n")

        for class_name in classes:
            col_name = f'confidence_{class_name.replace(" ", "_").lower()}'
            stats_lines.append(f"* `{col_name}`: Probability of class {class_name}\n")

        stats_lines.append("* Columns `mapped_as_*`: Duplicate mapped columns for reference\n")
        stats_lines.append(f"\n**Total columns in result:** {len(df_result.columns)}\n")

stats = "".join(stats_lines) + f""" |
|
|
|
|
|
**Mapping completed:** {len(mapping)} columns renamed for model |
|
|
|
|
|
**Full dataset saved:** All {len(df_result.columns)} columns available for download |
|
|
""" |
|
|
|
|
|
|
|
|
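        # Persist the full result so every column is available for download.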
        output_file = "predictions_result.csv"
        df_result.to_csv(output_file, index=False)

        return df_display, info_msg, stats, output_file

    except Exception as e:
        error_msg = f"Error processing file:\n{str(e)}"
        error_msg += f"\n\n{traceback.format_exc()}"
        return None, error_msg, None, None


with gr.Blocks(title="Exoplanet Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# Exoplanet Detector

Upload a CSV file with data about exoplanet candidates (KOI, Kepler Objects of Interest).

**How it works:**
1. Upload your dataset with any column structure
2. Llama automatically maps your columns to the training columns
3. The model classifies each object as an exoplanet or a false positive

**Model:** Random Forest Classifier
**Mapping:** Llama 3.3 70B via Together AI

**Note:** Processing large datasets (>1000 rows) may take several minutes.
""")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload CSV file",
                file_types=[".csv"],
                type="filepath"
            )
            submit_btn = gr.Button("Run Prediction", variant="primary", size="lg")

        with gr.Column(scale=2):
            mapping_info = gr.Textbox(
                label="Column Mapping Information",
                lines=15,
                max_lines=20
            )

    with gr.Row():
        stats_output = gr.Markdown(label="Statistics")

    with gr.Row():
        results_output = gr.Dataframe(
            label="Prediction Results (main columns)",
            wrap=True,
            interactive=False
        )

    with gr.Row():
        download_output = gr.File(
            label="Download full result with all columns",
            interactive=False
        )

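    # Wire the button to the prediction function; outputs match its 4-tuple return.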
    submit_btn.click(
        fn=predict_exoplanets,
        inputs=[file_input],
        outputs=[results_output, mapping_info, stats_output, download_output]
    )

    gr.Markdown("""
---
### Tips:
- Make sure your CSV file contains data about stellar systems and their characteristics
- The more of your columns match the training dataset, the more accurate the predictions will be
- The model was trained on NASA Exoplanet Archive data (Kepler Mission)

### Example training dataset columns:
`koi_period`, `koi_depth`, `koi_prad`, `koi_teq`, `koi_insol`, `koi_steff`, `koi_slogg`, `koi_srad`, `ra`, `dec`, `koi_kepmag`, etc.
""")


if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)