#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Install the core dependencies of this notebook export.
# The bare ``pip install ...`` line produced by the notebook is not valid
# Python and makes the whole script fail to parse, so the same command is
# run through the current interpreter's pip module instead.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "--user", "torch", "faiss-cpu", "transformers"]
)


# In[2]:


import json
import pandas as pd
from pathlib import Path

# === Dataset locations ===
# Absolute Windows paths to the training split, the test split and the
# knowledge-base file ("7.json"); adjust them when running on another machine.
TRAIN_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json")
TEST_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\test.json")
KNOWLEDGE_BASE_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\7.json")

# === BASE FUNCTIONS ===

def load_jsonl(filepath):
    """Load a JSON Lines or plain JSON file.

    Args:
        filepath: ``str`` or ``pathlib.Path`` of the file to read. A ``.jsonl``
            suffix selects line-by-line parsing; any other suffix is parsed as
            one JSON document.

    Returns:
        For ``.jsonl`` files, a list with one parsed object per non-blank line;
        otherwise whatever ``json.load`` returns for the document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    filepath = Path(filepath)  # accept plain strings as well as Path objects
    with open(filepath, "r", encoding="utf-8") as f:
        if filepath.suffix == ".jsonl":
            # Blank lines (e.g. a trailing newline) are not valid JSON: skip them.
            return [json.loads(line) for line in f if line.strip()]
        return json.load(f)

def extract_claims(dataset):
    """Flatten claim records into a pandas DataFrame.

    Each record must provide ``"claim"``; ``"speaker"``, ``"claim_date"`` and
    ``"reporting_source"`` are optional and default to ``None``. When a record
    has no ``"claim_id"`` its position in ``dataset`` is used instead.

    Returns:
        A DataFrame with the columns ``claim_id``, ``claim``, ``speaker``,
        ``claim_date`` and ``source`` (one row per record).
    """
    rows = []
    for position, record in enumerate(dataset):
        row = {
            "claim_id": record.get("claim_id", position),
            "claim": record["claim"],
            "speaker": record.get("speaker"),
            "claim_date": record.get("claim_date"),
            "source": record.get("reporting_source"),
        }
        rows.append(row)
    return pd.DataFrame(rows)

def load_knowledge_base_jsonl(path):
    """Read a JSON Lines knowledge base and index its entries by claim id.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout and skipped. Parsed entries without a ``"claim_id"`` key are
    dropped; when several entries share an id, the last one wins.

    Returns:
        A dict mapping ``claim_id`` to the parsed entry.
    """
    parsed_entries = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Error parsing line: {e}")
            else:
                parsed_entries.append(parsed)

    by_claim_id = {}
    for record in parsed_entries:
        if "claim_id" in record:
            by_claim_id[record["claim_id"]] = record
    return by_claim_id

# === EXECUTION ===

# Load the training and test splits (load_jsonl picks the parser from the
# file suffix; both paths end in ".json", so each file is read as one JSON
# document)
train_data = load_jsonl(TRAIN_PATH)
test_data = load_jsonl(TEST_PATH)

print(f" Train samples: {len(train_data)}")
print(f" Test samples: {len(test_data)}")

# Build a DataFrame of the test claims and preview the first rows
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter built-in, so this line fails in a plain Python run - confirm.
test_claims_df = extract_claims(test_data)
display(test_claims_df.head())

# Load the knowledge base, indexed by claim id, only if the file exists;
# otherwise `knowledge_base` is left undefined by this cell
if KNOWLEDGE_BASE_PATH.exists():
    knowledge_base = load_knowledge_base_jsonl(KNOWLEDGE_BASE_PATH)
    print(f" Knowledge base loaded. Claim IDs: {list(knowledge_base.keys())}")


# In[3]:


import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and encoder of the sentence-embedding checkpoint
MODEL_NAME = 'sentence-transformers/multi-qa-mpnet-base-dot-v1' # Model optimized for producing text embeddings that are compared using the dot product as similarity score
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the model to the GPU if one is available, otherwise keep it on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# In[6]:


# averitec_model_loader.py

import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel

def load_model(model_name='sentence-transformers/multi-qa-mpnet-base-dot-v1'):
    """
    Load the Transformers tokenizer and model used for sentence embeddings.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    evaluation mode before it is returned.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    print(f"📦 Caricamento modello: {model_name} ...")

    # Pick the target device first; it does not depend on the checkpoint.
    device_kind = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_kind)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()  # inference only: disables dropout

    print(f"✅ Modello '{model_name}' caricato su {device}")
    return tokenizer, model, device


# In[12]:


# STEP 1 - FAISS index creation (no terminal needed)

import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer

# === CONFIGURATION ===
MODEL_NAME = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
KNOWLEDGE_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\7.json")
TOP_K = 3

# === Load the SentenceTransformer model (forced onto the CPU) ===
print(f" Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME, device='cpu')

# === Load and parse the knowledge base ===
# The file is read as JSON Lines: one entry per non-blank line. Each entry
# contributes one document (its "url2text" pieces joined by spaces) and the
# matching source URL; entries without usable text are skipped.
documents = []
sources = []

with open(KNOWLEDGE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            # Best effort: report the malformed line and keep going.
            print(f" JSON error in line: {e}")
            continue
        url2text = entry.get("url2text", [])
        if isinstance(url2text, list) and url2text:
            full_text = " ".join(url2text).strip()
            if full_text:
                documents.append(full_text)
                sources.append(entry.get("url", ""))

print(f" Total documents indexed: {len(documents)}")

# Without documents the embedding matrix has no second dimension and the
# index cannot be sized, so fail early with an explicit message.
if not documents:
    raise ValueError(f"No usable documents found in {KNOWLEDGE_PATH}; cannot build the FAISS index.")

# === Compute embeddings ===
# Embeddings are L2-normalized, so the inner product used below equals the
# cosine similarity.
print(" Encoding documents...")
doc_embeddings = model.encode(documents, convert_to_numpy=True, normalize_embeddings=True)

# === Create FAISS index (exact inner-product search) ===
index = faiss.IndexFlatIP(doc_embeddings.shape[1])
index.add(doc_embeddings)
print(" FAISS index ready!")

# === Keep the results in memory for the following cells ===
faiss_index = index
faiss_docs = documents
faiss_sources = sources


# In[ ]:


import torch
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
import faiss
import pandas as pd

# === Configuration ===
CLAIM = "Trump paid only $750 in federal income taxes in 2017."  # claim to verify
TOP_K = 3  # Maximum number of documents to retrieve

# === Load single knowledge base entry ===
with open("7.json", "r", encoding="utf-8") as f:
    entry = json.load(f)

documents = [" ".join(entry["url2text"]).strip()]
sources = [entry["url"]]

# === Embedding and FAISS index creation ===
# Normalized embeddings + inner-product index = cosine similarity search.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
doc_embeddings = model.encode(documents, convert_to_numpy=True, normalize_embeddings=True)

# Create FAISS index in memory
index = faiss.IndexFlatIP(doc_embeddings.shape[1])
index.add(doc_embeddings)

# === Encode claim and retrieve top documents ===
claim_embedding = model.encode(CLAIM, convert_to_numpy=True, normalize_embeddings=True)
# FAISS pads the result with index -1 when k exceeds the number of stored
# vectors; -1 would silently select documents[-1], so never ask for more
# neighbours than there are documents.
k = min(TOP_K, len(documents))
scores, indices = index.search(np.expand_dims(claim_embedding, axis=0), k)

# === Retrieve most relevant documents ===
evidence = []
for idx in indices[0]:
    if idx < 0:  # defensive: padding entry, no real document behind it
        continue
    evidence.append({
        "question": "",  # optional
        "answer": documents[idx],
        "url": sources[idx],
        "scraped_text": documents[idx]
    })

# === Simple classification using pre-trained model ===
from transformers import BartForSequenceClassification, BartTokenizer

# Load NLI model
nli_model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
nli_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

# Join all retrieved evidence into one premise text
joined_evidence = " ".join([e["answer"] for e in evidence])

# Tokenize for NLI as a (premise, hypothesis) pair: the evidence is the
# premise and the claim is the hypothesis being judged. The original code
# referenced an undefined name `claim` here; the constant is CLAIM.
inputs = nli_tokenizer(joined_evidence, CLAIM, return_tensors="pt", truncation=True, padding=True)

with torch.no_grad():
    logits = nli_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()

# Map class ids (bart-large-mnli order: contradiction, neutral, entailment)
label_map = {0: "Contradicted", 1: "Neutral", 2: "Supported"}
predicted_label = label_map[predicted_class]

# === Final result ===
result = {
    "claim": CLAIM,
    "pred_label": predicted_label,
    "evidence": evidence
}

print(json.dumps(result, indent=2, ensure_ascii=False))


# In[ ]:


# STEP 4
import subprocess
import json

# Function to generate a question using DeepSeek R1:8B
def generate_question_with_deepseek(claim, model="deepseek-r1:8b-llama-distill-q8_0"):
    """Ask a local Ollama model to turn a claim into a verification question.

    The original command (``ollama generate --model ... --input ... --output
    json``) is not part of the Ollama CLI; the model is invoked with
    ``ollama run <model> <prompt>``, which prints the completion to stdout.

    Args:
        claim: The claim text to be turned into a question.
        model: Ollama model tag to run.

    Returns:
        The generated question as a string (empty if the model printed nothing).

    Raises:
        RuntimeError: if the ``ollama`` process exits with a non-zero status.
        FileNotFoundError: if the ``ollama`` executable is not on PATH.
    """
    import re

    prompt = (
        "Rewrite the following claim as one question that a fact-checker would "
        "need to answer in order to verify it. Reply with the question only.\n\n"
        f"Claim: {claim}"
    )

    # Argument list (no shell) so the claim text cannot be interpreted as
    # shell syntax.
    command = ["ollama", "run", model, prompt]
    result = subprocess.run(command, capture_output=True, text=True, encoding="utf-8")

    if result.returncode != 0:
        raise RuntimeError(f"Error during model inference: {result.stderr}")

    # NOTE(review): DeepSeek-R1 models usually print their reasoning inside
    # <think>...</think> before the answer; strip it if present - confirm
    # against the installed Ollama version.
    question = re.sub(r"<think>.*?</think>", "", result.stdout, flags=re.DOTALL)
    return question.strip()

# Example: generate a question for one claim and print it
# (runs the external `ollama` command through generate_question_with_deepseek)
claim = "Donald Trump paid only $750 in federal income taxes in 2017."
question = generate_question_with_deepseek(claim)
print(f"Generated question: {question}")



# In[ ]:


# Install the packages needed by the following cells.
# The bare ``pip install ...`` line produced by the notebook is not valid
# Python, so the same command is run through the current interpreter's pip
# module instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "torch"])


# In[ ]:


import torch
from transformers import BartForSequenceClassification, BartTokenizer
import numpy as np

# Load the BART sequence-classification model and its tokenizer.
# The checkpoint named here is the stock "facebook/bart-large-mnli"; point
# model_name at a fine-tuned checkpoint to use one instead.
model_name = "facebook/bart-large-mnli"  # You can replace this with your fine-tuned model
model = BartForSequenceClassification.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to classify a claim using the BART model
def classify_claim_with_bart(claim, evidence_text):
    """Classify a claim against evidence with the module-level MNLI model.

    The evidence is encoded as the NLI premise and the claim as the
    hypothesis, i.e. as a sentence pair rather than one concatenated string.

    Args:
        claim: The claim to verify.
        evidence_text: Evidence text the claim is judged against.

    Returns:
        ``"Supported"``, ``"Refuted"`` or ``"Insufficient Evidence"``.
    """
    # Sentence-pair encoding: premise first, hypothesis second.
    inputs = tokenizer(evidence_text, claim, return_tensors="pt", truncation=True, padding=True)

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()

    # Class order of facebook/bart-large-mnli is
    # 0 = contradiction, 1 = neutral, 2 = entailment.
    # Adjust this mapping if `model` is replaced by a different checkpoint.
    label_map = {0: "Refuted", 1: "Insufficient Evidence", 2: "Supported"}

    return label_map[predicted_class]

# Example: classify one claim against a hand-written evidence sentence
claim = "Donald Trump paid only $750 in federal income taxes in 2017."
evidence_text = "Various detailed reports show the figures and the breakdown of the $750 paid in federal income taxes in 2017, as provided by Trump's tax return data."

# Run the classifier and print the predicted label
predicted_label = classify_claim_with_bart(claim, evidence_text)
print(f"Predicted label: {predicted_label}")



# In[ ]:


import torch 
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
import faiss
import pandas as pd
from transformers import BartForSequenceClassification, BartTokenizer

# === Configurations ===
TRAIN_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json")
OUTPUT_PATH = Path("submission_dev.csv")
TOP_K = 5  # Number of documents to retrieve

# === Initialize the models ===
# The query encoder must be the one used to build `faiss_index` (the
# multi-qa-mpnet model of the index-creation cell). A different encoder such
# as paraphrase-MiniLM-L6-v2 yields vectors of another dimension, which the
# index cannot search.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
# Load the BART MNLI classifier and its tokenizer (stock checkpoint)
bart_model_name = "facebook/bart-large-mnli"
bart_model = BartForSequenceClassification.from_pretrained(bart_model_name)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)

# === Load the training data ===
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# === Function to classify claims with BART ===
def classify_claim_with_bart(claim, evidence_text):
    """Classify a claim against evidence with the module-level BART MNLI model.

    The evidence is encoded as the NLI premise and the claim as the
    hypothesis, i.e. as a sentence pair rather than one concatenated string.

    Args:
        claim: The claim to verify.
        evidence_text: Evidence text the claim is judged against.

    Returns:
        ``"Supported"``, ``"Refuted"`` or ``"Insufficient Evidence"``.
    """
    # Sentence-pair encoding: premise first, hypothesis second.
    inputs = bart_tokenizer(evidence_text, claim, return_tensors="pt", truncation=True, padding=True)

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = bart_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()

    # Class order of facebook/bart-large-mnli is
    # 0 = contradiction, 1 = neutral, 2 = entailment.
    label_map = {0: "Refuted", 1: "Insufficient Evidence", 2: "Supported"}

    return label_map[predicted_class]

# === Loop through each item in the dataset ===
results = []
for position, item in enumerate(train_data):
    claim = item['claim']
    # Fall back to the item's position when no id is given. The previous
    # fallback, hash(claim), changes between interpreter runs (string hashing
    # is salted) and `or` also discarded a legitimate id of 0.
    claim_id = item.get('claim_id')
    if claim_id is None:
        claim_id = position
    claim_embedding = model.encode(claim, convert_to_numpy=True, normalize_embeddings=True)

    # === Retrieve the top-K documents using FAISS ===
    scores, indices = faiss_index.search(np.expand_dims(claim_embedding, axis=0), TOP_K)
    evidence = [
        {
            "question": "",  # Optional if not generating QA
            "answer": faiss_docs[idx],  # FAISS retrieved document
            "url": faiss_sources[idx],  # Corresponding URL
            "scraped_text": faiss_docs[idx]  # Document text
        }
        for idx in indices[0]
        if idx >= 0  # FAISS pads with -1 when the index holds fewer than TOP_K vectors
    ]

    # === Classify the claim using the BART model ===
    # All retrieved documents are joined into one evidence text
    joined_evidence = " ".join(e['answer'] for e in evidence)
    label = classify_claim_with_bart(claim, joined_evidence)

    # Add the results
    results.append({
        "claim_id": claim_id,
        "claim": claim,
        "pred_label": label,
        "evidence": evidence
    })

# === Write results to CSV ===
# The evidence list is serialized as JSON so the column can be parsed back;
# writing the raw list would store Python's repr of it.
df = pd.DataFrame([
    {**row, "evidence": json.dumps(row["evidence"], ensure_ascii=False)}
    for row in results
])
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ File saved: {OUTPUT_PATH}")



# In[ ]:


import torch 
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
import faiss
import pandas as pd
from transformers import BartForSequenceClassification, BartTokenizer

# === Configurations ===
TRAIN_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json")
OUTPUT_PATH = Path("submission_dev.csv")
TOP_K = 5  # Number of documents to retrieve

# === Initialize the models ===
# The query encoder must be the one used to build `faiss_index` (the
# multi-qa-mpnet model of the index-creation cell). A different encoder such
# as paraphrase-MiniLM-L6-v2 yields vectors of another dimension, which the
# index cannot search.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
# Load the BART MNLI classifier and its tokenizer (stock checkpoint)
bart_model_name = "facebook/bart-large-mnli"
bart_model = BartForSequenceClassification.from_pretrained(bart_model_name)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)

# === Load the training data ===
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# === Function to classify claims with BART ===
def classify_claim_with_bart(claim, evidence_text):
    """Classify a claim against evidence with the module-level BART MNLI model.

    The evidence is encoded as the NLI premise and the claim as the
    hypothesis, i.e. as a sentence pair rather than one concatenated string.

    Args:
        claim: The claim to verify.
        evidence_text: Evidence text the claim is judged against.

    Returns:
        ``"Supported"``, ``"Refuted"`` or ``"Insufficient Evidence"``.
    """
    # Sentence-pair encoding: premise first, hypothesis second.
    inputs = bart_tokenizer(evidence_text, claim, return_tensors="pt", truncation=True, padding=True)

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = bart_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()

    # Class order of facebook/bart-large-mnli is
    # 0 = contradiction, 1 = neutral, 2 = entailment.
    # The stock checkpoint has only these three classes, so a fourth
    # "Contradictory" class can only come from a fine-tuned 4-class model.
    label_map = {0: "Refuted", 1: "Insufficient Evidence", 2: "Supported"}

    return label_map[predicted_class]

# === Loop through each item in the dataset ===
results = []
for position, item in enumerate(train_data):
    claim = item['claim']
    # Fall back to the item's position when no id is given. The previous
    # fallback, hash(claim), changes between interpreter runs (string hashing
    # is salted) and `or` also discarded a legitimate id of 0.
    claim_id = item.get('claim_id')
    if claim_id is None:
        claim_id = position
    claim_embedding = model.encode(claim, convert_to_numpy=True, normalize_embeddings=True)

    # === Retrieve the top-K documents using FAISS ===
    scores, indices = faiss_index.search(np.expand_dims(claim_embedding, axis=0), TOP_K)
    evidence = [
        {
            "question": "",  # Optional if not generating QA
            "answer": faiss_docs[idx],  # FAISS retrieved document
            "url": faiss_sources[idx],  # Corresponding URL
            "scraped_text": faiss_docs[idx]  # Document text
        }
        for idx in indices[0]
        if idx >= 0  # FAISS pads with -1 when the index holds fewer than TOP_K vectors
    ]

    # === Classify the claim using the BART model ===
    # All retrieved documents are joined into one evidence text
    joined_evidence = " ".join(e['answer'] for e in evidence)
    label = classify_claim_with_bart(claim, joined_evidence)

    # Add the results
    results.append({
        "claim_id": claim_id,
        "claim": claim,
        "pred_label": label,
        "evidence": evidence
    })

# === Write results to CSV ===
# The evidence list is serialized as JSON so the column can be parsed back;
# writing the raw list would store Python's repr of it.
df = pd.DataFrame([
    {**row, "evidence": json.dumps(row["evidence"], ensure_ascii=False)}
    for row in results
])
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ File saved: {OUTPUT_PATH}")




# In[ ]:


# Fine-tuning the model with the additional "Contradictory" label
import torch
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from pathlib import Path
import json

# === Configurations ===
TRAIN_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json")
OUTPUT_DIR = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\fine_tuned_bart")
BART_MODEL_NAME = "facebook/bart-large-mnli"  # Pre-trained BART with a 3-class MNLI head
NUM_LABELS = 4  # Adds the "Contradictory" class
BATCH_SIZE = 8
EPOCHS = 3
EVAL_FRACTION = 0.1  # Share of the training file held out for per-epoch evaluation
SEED = 42

# Label names -> class ids. The id order mirrors the label_map of the
# inference cells (Supported, Refuted, Insufficient Evidence, Contradictory).
# NOTE(review): the strings are assumed to be the AVeriTeC verdict names used
# in the training file - confirm against the data.
LABEL2ID = {
    "Supported": 0,
    "Refuted": 1,
    "Not Enough Evidence": 2,
    "Conflicting Evidence/Cherrypicking": 3,
}
ID2LABEL = {idx: name for name, idx in LABEL2ID.items()}

# === Load the dataset ===
# Load the training set (assumed to be a single JSON document)
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)


def process_data(data):
    """Reduce each item to its claim text and an integer class id.

    Integer labels are kept as they are; string labels are translated through
    LABEL2ID, because the Trainer needs numeric labels to compute the loss.

    Raises:
        ValueError: if a string label is not listed in LABEL2ID.
    """
    rows = []
    for item in data:
        label = item['label']
        if not isinstance(label, int):
            if label not in LABEL2ID:
                raise ValueError(f"Unknown label {label!r}; expected one of {sorted(LABEL2ID)}")
            label = LABEL2ID[label]
        rows.append({'claim': item['claim'], 'label': label})
    return rows


train_data_processed = process_data(train_data)

# evaluation_strategy="epoch" and load_best_model_at_end both need an
# evaluation set, so hold out part of the training data for it.
train_rows, eval_rows = train_test_split(
    train_data_processed, test_size=EVAL_FRACTION, random_state=SEED
)

# === Preprocessing ===
tokenizer = BartTokenizer.from_pretrained(BART_MODEL_NAME)


def tokenize_function(examples):
    """Tokenize a batch of claims (padded to the longest claim in the batch)."""
    return tokenizer(examples['claim'], padding=True, truncation=True)


# Use Hugging Face `datasets` for the processing
from datasets import Dataset
train_dataset = Dataset.from_list(train_rows).map(tokenize_function, batched=True)
eval_dataset = Dataset.from_list(eval_rows).map(tokenize_function, batched=True)

# === Prepare model ===
# The checkpoint ships a 3-class head; ignore_mismatched_sizes lets
# from_pretrained replace it with a freshly initialized 4-class head instead
# of failing on the shape mismatch.
model = BartForSequenceClassification.from_pretrained(
    BART_MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    ignore_mismatched_sizes=True,
)

# === Training arguments ===
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - adjust to the installed version.
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    seed=SEED,
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# === Fine-tuning ===
trainer.train()

# === Save the fine-tuned model ===
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuned model saved at: {OUTPUT_DIR}")


# In[ ]:


# Inference with the fine-tuned model
import torch
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from pathlib import Path
import faiss
import pandas as pd
from transformers import BartForSequenceClassification, BartTokenizer

# === Configurations ===
TRAIN_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json")
TEST_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\test.json")
KNOWLEDGE_BASE_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\7.json")
OUTPUT_PATH = Path(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\submission_dev.csv")
TOP_K = 5  # Number of documents to retrieve

# === Initialize models ===
# The query encoder must be the one used to build `faiss_index` (the
# multi-qa-mpnet model of the index-creation cell). A different encoder such
# as paraphrase-MiniLM-L6-v2 yields vectors of another dimension, which the
# index cannot search.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

# Load the fine-tuned BART model and tokenizer saved by the fine-tuning cell
bart_model_path = r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\fine_tuned_bart"
bart_model = BartForSequenceClassification.from_pretrained(bart_model_path)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_path)

# === Load the knowledge base (read as a single JSON document) ===
with open(KNOWLEDGE_BASE_PATH, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

# === Function to classify claims with BART ===
def classify_claim_with_bart(claim, evidence_text):
    """Predict a verdict for a claim with the module-level fine-tuned BART model.

    Claim and evidence are joined with a single space and classified as one
    sequence.

    Returns:
        One of ``"Supported"``, ``"Refuted"``, ``"Insufficient Evidence"`` or
        ``"Contradictory"`` (class ids 0-3, in that order).
    """
    # Class id -> verdict name
    label_map = dict(enumerate(
        ["Supported", "Refuted", "Insufficient Evidence", "Contradictory"]
    ))

    combined_text = claim + " " + evidence_text
    encoded = bart_tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass without gradient tracking
    with torch.no_grad():
        output = bart_model(**encoded)

    best_class = torch.argmax(output.logits, dim=-1).item()
    return label_map[best_class]

# === Function to retrieve evidence using FAISS ===
def retrieve_evidence(claim_embedding, faiss_index, faiss_docs, faiss_sources, top_k=None):
    """Return the documents closest to a claim embedding.

    Args:
        claim_embedding: 1-D embedding of the claim; a batch axis is added
            before searching.
        faiss_index: Object with a FAISS-style ``search(queries, k)`` method.
        faiss_docs: Document texts, aligned with the vectors in the index.
        faiss_sources: Source URLs, aligned with ``faiss_docs``.
        top_k: Number of neighbours to request; defaults to the module-level
            ``TOP_K`` when omitted.

    Returns:
        A list of evidence dicts (``question``, ``answer``, ``url``,
        ``scraped_text``) in the order returned by the index. It may hold
        fewer than ``top_k`` items when the index is smaller than that.
    """
    k = TOP_K if top_k is None else top_k
    scores, indices = faiss_index.search(np.expand_dims(claim_embedding, axis=0), k)
    evidence = []
    for idx in indices[0]:
        # FAISS pads missing neighbours with -1, which would otherwise be
        # read as "last document" by Python's negative indexing.
        if idx < 0:
            continue
        idx = int(idx)
        evidence.append({
            "question": "",  # Optional if not generating QA
            "answer": faiss_docs[idx],
            "url": faiss_sources[idx],
            "scraped_text": faiss_docs[idx]
        })
    return evidence

# === Main loop to process test claims ===
# The claims to verify come from the test file. The knowledge base holds
# evidence documents, not claims, so iterating it (as before) cannot provide
# item['claim'].
with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

results = []
for position, item in enumerate(test_data):
    claim = item['claim']
    # Fall back to the item's position when no id is given. The previous
    # fallback, hash(claim), changes between interpreter runs (string hashing
    # is salted) and `or` also discarded a legitimate id of 0.
    claim_id = item.get('claim_id')
    if claim_id is None:
        claim_id = position
    claim_embedding = model.encode(claim, convert_to_numpy=True, normalize_embeddings=True)

    # === Retrieve evidence from FAISS (faiss_index is built by an earlier cell) ===
    evidence = retrieve_evidence(claim_embedding, faiss_index, faiss_docs, faiss_sources)

    # === Classify the claim with the fine-tuned BART model ===
    joined_evidence = " ".join(e['answer'] for e in evidence)
    label = classify_claim_with_bart(claim, joined_evidence)

    # Collect the result
    results.append({
        "claim_id": claim_id,
        "claim": claim,
        "pred_label": label,
        "evidence": evidence
    })

# === Write results to CSV ===
# The evidence list is serialized as JSON so the column can be parsed back;
# writing the raw list would store Python's repr of it.
df = pd.DataFrame([
    {**row, "evidence": json.dumps(row["evidence"], ensure_ascii=False)}
    for row in results
])
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ File saved: {OUTPUT_PATH}")


# In[ ]:


#Adding similarity threshold 
import torch
import numpy as np
import faiss
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    pipeline
)
from sentence_transformers import CrossEncoder
from typing import List, Dict

# Configuration
SIMILARITY_THRESHOLD = 0.6  # Minimum retrieval score for a document to be kept (see filter_by_similarity)
NLI_THRESHOLD = 0.8  # Minimum probability for an evidence text to count as a contradiction (see detect_contradiction)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialization of the models
print("Loading models...")

# 1. Embedding model (used to build the FAISS index)
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=DEVICE)

# 2. NLI model for contradiction detection
nli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli").to(DEVICE)

# 3. Cross-encoder for reranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=DEVICE)

# Function to filter for similarity
def filter_by_similarity(claim: str, documents: List[str], sources: List[str], threshold: float = SIMILARITY_THRESHOLD) -> List[Dict]:
    """Keep the documents whose cosine similarity to the claim exceeds ``threshold``.

    Embeddings are L2-normalized before indexing, so the inner product
    computed by the FAISS index is the cosine similarity. The raw dot
    products of this dot-product model are unbounded, so a fixed threshold
    such as 0.6 is only meaningful on normalized vectors.

    Args:
        claim: The claim text.
        documents: Candidate evidence texts.
        sources: URLs aligned with ``documents``.
        threshold: Minimum similarity (exclusive) for a document to be kept.

    Returns:
        A list of ``{"text", "url", "score"}`` dicts, ordered by decreasing
        similarity; empty when ``documents`` is empty.
    """
    # Nothing to index or search.
    if not documents:
        return []

    # Compute normalized embeddings
    claim_embedding = embedding_model.encode(claim, convert_to_numpy=True, normalize_embeddings=True)
    doc_embeddings = embedding_model.encode(documents, convert_to_numpy=True, normalize_embeddings=True)

    # Create FAISS index (exact inner-product search)
    index = faiss.IndexFlatIP(doc_embeddings.shape[1])
    index.add(doc_embeddings)

    # Score every document against the claim
    scores, indices = index.search(claim_embedding.reshape(1, -1), len(documents))

    # Keep only the documents above the threshold
    return [
        {
            "text": documents[idx],
            "url": sources[idx],
            "score": float(score),
        }
        for idx, score in zip(indices[0], scores[0])
        if idx >= 0 and score > threshold  # idx -1 marks a padding entry
    ]

# Function NLI to find contraddiction
def detect_contradiction(claim: str, evidence_text: str) -> bool:
    """Use the module-level RoBERTa MNLI model to detect whether the evidence contradicts the claim.

    The evidence is encoded as the NLI premise and the claim as the
    hypothesis.

    Returns:
        ``True`` when the contradiction probability exceeds ``NLI_THRESHOLD``.
    """
    # Sentence-pair encoding: premise first, hypothesis second.
    inputs = nli_tokenizer(evidence_text, claim, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
    with torch.no_grad():
        outputs = nli_model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Look the class index up in the model config instead of hard-coding it:
    # for roberta-large-mnli contradiction is class 0 (class 2 is entailment).
    contradiction_idx = next(
        (
            int(idx)
            for idx, name in nli_model.config.id2label.items()
            if str(name).lower() == "contradiction"
        ),
        0,
    )
    contradiction_prob = probs[0][contradiction_idx].item()

    return contradiction_prob > NLI_THRESHOLD

# Function for reranking with Cross-Encoder
def rerank_evidence(claim: str, evidence_list: List[Dict]) -> List[Dict]:
    """Order evidence by relevance to the claim using the cross-encoder.

    Each evidence dict gets a ``"rerank_score"`` entry (the input dicts are
    modified in place).

    Returns:
        A new list with the same dicts sorted by decreasing ``rerank_score``;
        an empty list when there is no evidence.
    """
    if not evidence_list:
        return []

    # One (claim, evidence text) pair per candidate
    pairs = []
    for candidate in evidence_list:
        pairs.append((claim, candidate["text"]))

    relevance = cross_encoder.predict(pairs)

    # Attach the score to each candidate
    for candidate, value in zip(evidence_list, relevance):
        candidate["rerank_score"] = float(value)

    # Highest score first (stable sort)
    ranked = list(evidence_list)
    ranked.sort(key=lambda candidate: candidate["rerank_score"], reverse=True)
    return ranked

# Pipeline completed
def verify_claim(claim: str, documents: List[str], sources: List[str]) -> Dict:
    """Full pipeline: similarity filtering -> reranking -> contradiction analysis.

    Every reranked document flagged by ``detect_contradiction`` is counted as
    refuting evidence; all remaining documents are counted as supporting.

    Returns:
        A dict with the claim, up to three supporting and three refuting
        documents (best reranked first), and a ``"conflicting"`` flag that is
        true when both groups are non-empty.
    """
    # Steps 1 and 2: filter by similarity, then rerank the survivors
    candidates = rerank_evidence(claim, filter_by_similarity(claim, documents, sources))

    # Step 3: split by the contradiction verdict, keeping the reranked order
    buckets = {True: [], False: []}
    for candidate in candidates:
        contradicts = bool(detect_contradiction(claim, candidate["text"]))
        buckets[contradicts].append(candidate)

    refuted = buckets[True]
    supported = buckets[False]

    return {
        "claim": claim,
        "supported_evidence": supported[:3],  # Top 3 supporting documents
        "refuted_evidence": refuted[:3],      # Top 3 refuting documents
        "conflicting": bool(supported) and bool(refuted)
    }

# Usage 
# Usage example: verify one claim against three hand-written documents
if __name__ == "__main__":
    claim = "Donald Trump paid only $750 in federal income taxes in 2017."

    # Documents
    example_docs = [
        "Trump's tax returns show he paid $750 in federal income taxes in 2017.",
        "Financial records indicate Trump paid over $1 million in taxes in 2017.",
        "The New York Times reported Trump paid only $750 in 2016 and 2017."
    ]
    example_sources = [
        "https://example.com/source1",
        "https://example.com/source2",
        "https://example.com/source3"
    ]

    # Run the pipeline
    result = verify_claim(claim, example_docs, example_sources)

    # Print the summary. The two count lines were missing the closing
    # parenthesis of len(...), which made the whole script a SyntaxError.
    print("\n🔍 Risultati:")
    print(f"Claim: {result['claim']}")
    print(f"Prove a supporto: {len(result['supported_evidence'])}")
    print(f"Prove a confutazione: {len(result['refuted_evidence'])}")
    print(f"Evidenza contrastante: {'Sì' if result['conflicting'] else 'No'}")

    # Print the best supporting evidence
    print("\nTOP PROVE A SUPPORTO:")
    for idx, doc in enumerate(result['supported_evidence'][:3]):
        print(f"{idx+1}. Score: {doc['rerank_score']:.3f}")
        print(f"   Text: {doc['text'][:200]}...")
        print(f"   URL: {doc['url']}\n")


# In[ ]:


#Generating questions 
"""
AVERITEC EVIDENCE QA GENERATOR
This script generates questions from claims and extracts answers from evidence documents
using state-of-the-art QA models.
"""

from transformers import pipeline
from typing import List, Dict
import torch

class EvidenceQA:
    """
    Turn a claim into a question and pull answers to it out of evidence texts.

    Two Hugging Face pipelines are held on the instance: ``qg_model``
    (text2text generation) and ``qa_model`` (extractive question answering).
    """

    def __init__(self):
        """
        Load both pipelines on the first GPU when CUDA is available, else on CPU.
        """
        print("⚙️ Loading QA models...")

        # Pipeline device index: 0 = first GPU, -1 = CPU
        self.device = 0 if torch.cuda.is_available() else -1

        # Question generation (T5-based checkpoint)
        self.qg_model = pipeline(
            "text2text-generation",
            model="valhalla/t5-small-qg-hl",
            device=self.device
        )

        # Answer extraction (RoBERTa-based checkpoint)
        self.qa_model = pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            device=self.device
        )

    def generate_question(self, claim: str) -> str:
        """
        Produce a question for a claim with the question-generation pipeline.

        Args:
            claim: The factual claim to be verified

        Returns:
            The generated text, stripped, and terminated with '?' if it was not already
        """
        generated = self.qg_model(
            f"generate question: {claim}",
            max_length=64,
            num_beams=4,
            early_stopping=True
        )

        # First candidate only; make sure it reads as a question
        text = generated[0]['generated_text'].strip()
        return text if text.endswith('?') else text + '?'

    def extract_answer(self, question: str, context: str) -> Dict:
        """
        Extract the best answer span for a question from one evidence text.

        Args:
            question: The generated question
            context: Evidence text to search for answers

        Returns:
            Dictionary with the answer text, its confidence score and its
            start/end offsets; an empty answer with score 0.0 when the
            context is blank
        """
        # Blank context: nothing to search, skip the model call
        if not context.strip():
            return {"answer": "", "score": 0.0, "start": 0, "end": 0}

        prediction = self.qa_model(
            question=question,
            context=context,
            top_k=1,  # best answer only
            handle_impossible_answer=True
        )

        extracted = {}
        extracted["answer"] = prediction['answer']
        extracted["score"] = float(prediction['score'])
        extracted["start"] = prediction['start']
        extracted["end"] = prediction['end']
        return extracted

    def generate_qa_pairs(self, claim: str, evidence_texts: List[str]) -> List[Dict]:
        """
        Build question/answer pairs for a claim over several evidence texts.

        One question is generated from the claim and answered against every
        evidence text.

        Args:
            claim: The claim to verify
            evidence_texts: List of evidence documents

        Returns:
            List of QA pairs sorted by decreasing answer confidence; evidence
            longer than 300 characters is shortened to 300 plus "..."
        """
        question = self.generate_question(claim)
        print(f"🔍 Generated question: {question}")

        def build_pair(text):
            # Answer the shared question against a single evidence text
            found = self.extract_answer(question, text)
            preview = text[:300] + "..." if len(text) > 300 else text
            return {
                "question": question,
                "answer": found['answer'],
                "confidence": found['score'],
                "evidence": preview,
                "answer_start": found['start'],
                "answer_end": found['end']
            }

        qa_pairs = [build_pair(text) for text in evidence_texts]

        # Most confident answers first (stable sort)
        qa_pairs.sort(key=lambda pair: pair['confidence'], reverse=True)
        return qa_pairs


# Example Usage
if __name__ == "__main__":
    # Build the QA generator (loads both pipelines)
    qa_generator = EvidenceQA()

    # Demo input: one claim and four evidence passages
    test_claim = "Donald Trump paid only $750 in federal income taxes in 2017."

    test_evidence = [
        "According to tax returns obtained by the New York Times, Donald Trump paid $750 in federal income taxes in 2017. The documents show this was his total federal tax payment for that year.",
        "Financial records from 2017 indicate Trump paid approximately $1.2 million in various taxes, including $750 in federal income tax and other state and local taxes.",
        "The IRS has no public records showing Trump's exact 2017 tax payments, but journalists estimate it was around $750 based on confidential sources.",
        "In 2017, Donald Trump claimed large business losses that reduced his taxable income, resulting in only $750 of federal income tax due."
    ]

    # QA pairs come back sorted by answer confidence
    results = qa_generator.generate_qa_pairs(test_claim, test_evidence)

    # Report header
    header_lines = ["\n📊 EVIDENCE QA RESULTS", f"Original claim: {test_claim}"]
    for header_line in header_lines:
        print(header_line)

    # Details of the three most confident pairs
    for i, pair in enumerate(results[:3]):
        print(f"\n🔍 QA Pair {i+1}:")
        for detail in (
            f"Question: {pair['question']}",
            f"Answer: {pair['answer']}",
            f"Confidence: {pair['confidence']:.3f}",
            f"Evidence: {pair['evidence']}",
        ):
            print(detail)


# In[ ]:


# Convert the results to the required submission format
import csv
import json

# Build the (question, answer) pair stored with a claim's evidence
def generate_question_answer(claim):
    """Return a true/false question about *claim* and a placeholder answer.

    The answer is a fixed "SUPPORTED" string; it stands in for a real
    classification result and does not depend on the claim.
    """
    # Placeholder verdict, e.g. 'SUPPORTED', 'REFUTED', etc. -- replace with real evidence extraction
    placeholder_answer = "SUPPORTED"
    return f"Is the following statement true or false: {claim}", placeholder_answer

# Function to generate the CSV output in the required format
def generate_csv_output(claim_data, filename="output.csv"):
    """Write one CSV row per claim with a JSON-encoded evidence column.

    Args:
        claim_data: Iterable of dicts. The keys read are "claim_id", "claim",
            "pred_label", "url" and "url2text" (a list of scraped sentences);
            all of them are optional.
        filename: Path of the CSV file to create (overwritten if it exists).

    The evidence column holds a JSON list with a single question/answer entry
    built by generate_question_answer; its "scraped_text" is the first
    sentence of "url2text", or "" when that list is missing or empty.
    """
    fieldnames = ["claim_id", "claim", "pred_label", "evidence"]

    # Explicit UTF-8 so non-ASCII claims do not depend on the platform default
    # encoding (which is not UTF-8 on Windows).
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for claim in claim_data:
            claim_text = claim.get("claim")

            # The first scraped sentence stands in for the "scraped_text";
            # a record without scraped sentences must not abort the export.
            scraped_sentences = claim.get("url2text") or []
            scraped_text = scraped_sentences[0] if scraped_sentences else ""

            # Generate the question and answer
            question, answer = generate_question_answer(claim_text)

            # Format the evidence
            evidence = [{
                "question": question,
                "answer": answer,
                "url": claim.get("url"),
                "scraped_text": scraped_text
            }]

            # Write the data to the CSV
            writer.writerow({
                "claim_id": claim.get("claim_id"),
                "claim": claim_text,
                "pred_label": claim.get("pred_label"),
                "evidence": json.dumps(evidence)  # Convert evidence to JSON format
            })

# Example data (you can replace this with your real data)
# Each record carries the keys generate_csv_output reads: claim_id, claim,
# pred_label, url and url2text (a list of scraped sentences).
claim_data = [
    {
        "claim_id": "7",
        "claim": "Trump paid only $750 in federal income taxes in 2017.",
        "pred_label": "SUPPORTED",  # Classification prediction
        "url": "https://web.archive.org/web/20210212182804/https://www.nytimes.com/2020/09/29/us/trump-750-taxes.html",
        "url2text": [
            "Trump Paid $750 in Federal Income Taxes in 2017. Here’s the Math.",
            "Figures drawn from President Trump’s tax-return data show how that number was calculated.",
            "The small amount of federal income taxes President Trump paid in both 2016 and 2017 — just $750 each year — has become the focus of much attention since it was revealed in a New York Times investigation.",
            "..."
        ]
    }
]

# Run the function to generate the CSV
# (executes as soon as this cell runs and writes the default "output.csv")
generate_csv_output(claim_data)


# In[ ]:


# Optimize the model for GPU execution while respecting the time limits
"""
AVERITEC GPU-OPTIMIZED VERIFICATION PIPELINE
This script implements a time-constrained claim verification system optimized for:
- Maximum GPU utilization
- Strict 1-minute time limit per claim (AVeriTeC requirement)
- Efficient batch processing
"""

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    AutoModelForQuestionAnswering
)
from sentence_transformers import CrossEncoder, SentenceTransformer
from typing import List, Dict, Optional
import time
import logging
from concurrent.futures import ThreadPoolExecutor

# Runtime settings for the verification pipeline
MAX_TIME_PER_CLAIM = 55  # seconds; leaves a 5s buffer under the 1-minute limit
BATCH_SIZE = 8  # Optimal for A10G GPU memory
LOG_LEVEL = logging.INFO

# Configure logging once, then grab this module's logger
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=LOG_LEVEL)
logger = logging.getLogger(__name__)

class OptimizedClaimVerifier:
    """
    GPU-optimized claim verifier with strict time constraints.

    For each claim the candidate documents are reranked with a cross-encoder.
    The best ranked documents are then passed through an extractive-QA model
    and an NLI model, and the verdict is derived from the NLI contradiction
    probabilities.
    """

    # Number of reranked documents to verify when the module-level
    # TOP_K_EVIDENCE setting is not defined.
    DEFAULT_TOP_K_EVIDENCE = 5
    # Evidence whose contradiction probability reaches this value counts as
    # refuting the claim.
    CONTRADICTION_THRESHOLD = 0.5

    def __init__(self):
        """Initialize models with GPU optimization."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"🚀 Initializing models on {self.device}")

        # Half precision is only used when a GPU is available
        self.amp_enabled = self.device.type == "cuda"

        # Load models with optimal settings
        self._load_models()

        # Warmup models
        self._warmup()

    def _load_models(self):
        """Load models with GPU-specific optimizations."""
        start_time = time.time()

        # 1. Embedding model with half-precision.
        #    NOTE(review): process_claim no longer encodes with this model
        #    because the embeddings were never consumed; it stays loaded so
        #    the attribute remains available to callers.
        self.embedding_model = SentenceTransformer(
            'multi-qa-mpnet-base-dot-v1',
            device=self.device
        )
        if self.amp_enabled:
            self.embedding_model = self.embedding_model.half()

        # 2. NLI model. torchscript=True makes the model return plain tuples
        #    rather than ModelOutput objects, so logits are read by position.
        self.nli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(
            "roberta-large-mnli",
            torchscript=True
        ).to(self.device)
        self.nli_model.eval()
        # Resolve the contradiction class from the model config instead of
        # hard-coding an index (roberta-large-mnli lists CONTRADICTION first).
        label2id = {
            str(label).lower(): int(index)
            for label, index in self.nli_model.config.label2id.items()
        }
        self.nli_contradiction_index = label2id.get("contradiction", 0)

        # 3. Cross-encoder with dynamic batching
        self.cross_encoder = CrossEncoder(
            'cross-encoder/ms-marco-MiniLM-L-6-v2',
            device=str(self.device),
            max_length=512
        )

        # 4. QA model for evidence extraction
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained(
            "deepset/roberta-base-squad2"
        ).to(self.device)
        self.qa_model.eval()
        self.qa_tokenizer = AutoTokenizer.from_pretrained(
            "deepset/roberta-base-squad2"
        )

        logger.info(f"⏱ Models loaded in {time.time() - start_time:.2f}s")

    def _warmup(self):
        """Run warmup inferences to optimize GPU performance."""
        logger.info("🔥 Warming up models...")
        warmup_text = "This is a warmup run."

        with torch.no_grad():
            # Warmup embedding model
            self.embedding_model.encode([warmup_text] * 2)

            # Warmup NLI model
            inputs = self.nli_tokenizer(
                warmup_text,
                warmup_text,
                return_tensors="pt"
            ).to(self.device)
            self.nli_model(**inputs)

            # Warmup QA model
            qa_inputs = self.qa_tokenizer(
                warmup_text,
                warmup_text,
                return_tensors="pt"
            ).to(self.device)
            self.qa_model(**qa_inputs)

    def _enforce_time_limit(self, start_time: float):
        """Raise TimeoutError once more than MAX_TIME_PER_CLAIM seconds have elapsed."""
        elapsed = time.time() - start_time
        if elapsed > MAX_TIME_PER_CLAIM:
            raise TimeoutError(
                f"⏰ Time limit exceeded: {elapsed:.2f}s > {MAX_TIME_PER_CLAIM}s"
            )

    def _evidence_limit(self) -> int:
        """Return how many top-ranked documents are verified per claim."""
        try:
            return int(TOP_K_EVIDENCE)
        except NameError:
            # The setting is optional; fall back to the class default.
            return self.DEFAULT_TOP_K_EVIDENCE

    def _assess_document(self, claim: str, document: str) -> Dict:
        """Run the QA and NLI models on one (claim, document) pair.

        Returns a dict with "answer_start"/"answer_end" (argmax positions of
        the QA start/end logits, i.e. token indices into the encoded input)
        and "contradiction_prob" (softmax probability of the NLI
        contradiction class).
        """
        # QA extraction: the claim is the question, the document the context
        qa_inputs = self.qa_tokenizer(
            claim,
            document,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        # NLI classification: the document is the premise and the claim the
        # hypothesis, so the score answers "does this evidence contradict
        # the claim?".
        nli_inputs = self.nli_tokenizer(
            document,
            claim,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            qa_outputs = self.qa_model(**qa_inputs)
            nli_outputs = self.nli_model(**nli_inputs)

        # Position 0 holds the logits for both tuple and ModelOutput returns
        nli_probs = torch.softmax(nli_outputs[0], dim=1)[0]

        return {
            "answer_start": int(qa_outputs.start_logits.argmax()),
            "answer_end": int(qa_outputs.end_logits.argmax()),
            "contradiction_prob": float(nli_probs[self.nli_contradiction_index]),
        }

    @staticmethod
    def _decide_verdict(evidence: List[Dict], threshold: float = 0.5) -> str:
        """Aggregate per-document contradiction probabilities into a verdict."""
        if not evidence:
            return "Insufficient Evidence"

        refuted = any(e["contradiction_prob"] >= threshold for e in evidence)
        supported = any(e["contradiction_prob"] < threshold for e in evidence)

        if supported and refuted:
            return "Conflicting Evidence"
        return "Supported" if supported else "Refuted"

    def process_claim(
        self,
        claim: str,
        documents: List[str],
        sources: List[str]
    ) -> Optional[Dict]:
        """
        Process a single claim within strict time constraints.

        Args:
            claim: The claim text to verify
            documents: List of evidence documents
            sources: List of source URLs, aligned with ``documents``; a
                document without a matching source gets ``url=None``

        Returns:
            Dict with "claim", "verdict", "evidence", "error" and
            "processing_time". On timeout or failure "error" is set to
            "timeout" / "processing_error" and "verdict" stays None.
        """
        start_time = time.time()
        results = {
            "claim": claim,
            "error": None,
            "verdict": None,
            "evidence": []
        }

        try:
            # Nothing to rank: report it instead of calling the models on
            # an empty batch.
            if not documents:
                results["verdict"] = "Insufficient Evidence"
                return results

            # Phase 1: Reranking. One batched call keeps the GPU busy and
            # avoids sharing the tokenizer between worker threads.
            pairs = [(claim, doc) for doc in documents]
            rerank_scores = self.cross_encoder.predict(pairs, batch_size=BATCH_SIZE)
            self._enforce_time_limit(start_time)

            # Phase 2: Verification of the best ranked documents
            ranked = sorted(
                enumerate(rerank_scores),
                key=lambda item: item[1],
                reverse=True
            )[:self._evidence_limit()]

            evidence = []
            for idx, score in ranked:
                self._enforce_time_limit(start_time)

                item = {
                    "text": documents[idx],
                    "url": sources[idx] if idx < len(sources) else None,
                    "score": float(score),
                }
                item.update(self._assess_document(claim, documents[idx]))
                evidence.append(item)

            # Determine verdict
            results["verdict"] = self._decide_verdict(
                evidence, self.CONTRADICTION_THRESHOLD
            )
            results["evidence"] = evidence

        except TimeoutError as e:
            logger.warning(str(e))
            results["error"] = "timeout"
        except Exception as e:
            # Boundary handler: keep the batch running, but log the traceback
            logger.exception(f"Unexpected error: {str(e)}")
            results["error"] = "processing_error"
        finally:
            results["processing_time"] = time.time() - start_time

        return results


# Demo: verify one claim against four short documents
if __name__ == "__main__":
    verifier = OptimizedClaimVerifier()

    # Test data: documents and their source URLs are aligned by position
    test_claim = "COVID-19 vaccines cause autism."
    test_docs = [
        "Multiple peer-reviewed studies have found no link between vaccines and autism.",
        "The CDC confirms vaccines are safe and effective with no connection to autism.",
        "A debunked 1998 study falsely claimed a vaccine-autism link.",
        "Vaccine ingredients have been thoroughly tested for safety.",
    ]
    test_sources = [
        "https://pubmed.ncbi.nlm.nih.gov/123456",
        "https://www.cdc.gov/vaccinesafety",
        "https://retractionwatch.com/fake-study",
        "https://who.int/vaccine-facts",
    ]

    # Wall-clock time around the whole call
    start = time.time()
    result = verifier.process_claim(test_claim, test_docs, test_sources)
    elapsed = time.time() - start

    # Summary
    print(f"\n⏱ Total processing time: {elapsed:.2f}s")
    print(f"📋 Verdict: {result['verdict']}")

    # Up to three evidence entries, numbered from 1
    print("\n🔍 Top Evidence:")
    top_evidence = result['evidence'][:3]
    for rank, ev in enumerate(top_evidence, start=1):
        print(f"{rank}. Score: {ev['score']:.3f}")
        print(f"   Contradiction Prob: {ev['contradiction_prob']:.3f}")
        print(f"   Source: {ev['url']}")
        print(f"   Excerpt: {ev['text'][:100]}...\n")


# In[ ]:


# Docker packaging for the submission.
# This cell held raw Dockerfile and shell text, which made the script exported
# from the notebook unparseable. The Dockerfile is therefore kept as a string:
# save its contents to a file named `Dockerfile` in the project root.
DOCKERFILE = r"""# Base image with GPU support and Python 3.10
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime

# Avoid interactive prompts during installation
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    curl \
    wget \
    unzip \
    libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

# Create the working directory
WORKDIR /app

# Copy the project files
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir \
    torch torchvision torchaudio \
    pandas \
    numpy \
    sentence-transformers \
    faiss-cpu \
    transformers \
    scikit-learn

# If you use FAISS on the GPU, replace `faiss-cpu` with `faiss-gpu`

# Entry point: run the script produced by `jupyter nbconvert --to script`
# (the Python interpreter cannot execute a notebook file directly)
CMD ["python", "UFFICIALE-NLP-VISUAL.py"]
# or, if you have an equivalent standalone script:
# CMD ["python", "inference_script.py"]
"""

# Shell commands to convert the notebook, then build and run the image:
#   jupyter nbconvert --to script UFFICIALE-NLP-VISUAL.ipynb
#   docker build -t averitec-submission .
#   docker run --gpus all --rm averitec-submission


# In[ ]:


# Build a FAISS index over every document in the knowledge-base folder
import os
import torch
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
from tqdm import tqdm

# Config
TOP_K = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

# Load embedding model
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1', device=DEVICE)

# Path to the knowledge base folder (replace this with the path to your folder)
kb_folder_path = r"C:\Users\Gaetano\Desktop\NLP-PROJECT-everything\dev_knowledge_store"

# Load the knowledge base (all JSONL files in the folder).
# Files are visited in sorted order so the index layout is reproducible.
kb_data = []
for file_name in sorted(os.listdir(kb_folder_path)):
    if file_name.endswith('.jsonl'):
        file_path = os.path.join(kb_folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # tolerate blank lines between records
                    kb_data.append(json.loads(line))

# Prepare documents (texts + metadata)
kb_passages = []
kb_metadata = []
for doc in kb_data:
    # Skip records without scraped sentences: they would add empty passages
    if not doc.get("url2text"):
        continue
    # Concatenate all the sentences into one big text
    text = " ".join(doc["url2text"])
    kb_passages.append(text)
    kb_metadata.append({
        "url": doc.get("url", ""),
        "scraped_text": text
    })

if not kb_passages:
    raise ValueError(f"No passages with 'url2text' found in {kb_folder_path}")

# Compute embeddings for all KB passages
print("Computing embeddings for knowledge base...")
kb_embeddings = model.encode(kb_passages, show_progress_bar=True, convert_to_numpy=True)
# FAISS expects contiguous float32 matrices
kb_embeddings = np.ascontiguousarray(kb_embeddings, dtype=np.float32)

# Build FAISS index
dimension = kb_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(kb_embeddings)  # normalize for cosine similarity
index.add(kb_embeddings)

# Load claims (raw string: "\U" in a normal literal is an invalid escape)
with open(r"C:\Users\Gaetano\Desktop\DATASET_AVERITEC\train.txt.json", encoding="utf-8") as f:
    claims_data = json.load(f)

# Prepare predictions
predictions = []

print("Retrieving top-k evidence for each claim...")
for position, example in enumerate(tqdm(claims_data)):
    # Fall back to the record's position when the dataset has no explicit id
    claim_id = example.get("claim_id", position)
    claim = example["claim"]

    # Encode the claim
    claim_embedding = model.encode([claim], convert_to_numpy=True)
    claim_embedding = np.ascontiguousarray(claim_embedding, dtype=np.float32)
    faiss.normalize_L2(claim_embedding)

    # Search top-k
    D, I = index.search(claim_embedding, TOP_K)

    evidence = []
    for idx in I[0]:
        # FAISS pads the result with -1 when the index holds fewer than
        # TOP_K vectors; -1 must not be used as a (negative) list index.
        if idx < 0:
            continue
        passage = kb_metadata[idx]
        evidence.append({
            "question": claim,
            "answer": passage["scraped_text"],
            "url": passage["url"],
            "scraped_text": passage["scraped_text"]
        })

    predictions.append({
        "claim_id": claim_id,
        "claim": claim,
        "pred_label": "Not Enough Evidence",  # placeholder
        "evidence": evidence
    })

# Save output as JSONL (json.dumps escapes non-ASCII, so the file is plain ASCII)
with open("retrieved_predictions.jsonl", "w", encoding="utf-8") as fout:
    for item in predictions:
        fout.write(json.dumps(item) + "\n")

print("✅ Retrieval complete. Evidence stored in retrieved_predictions.jsonl")




# In[ ]:


# A simple rule-based classifier (needed to produce a label for every claim)
import json
from tqdm import tqdm

# Simple heuristic-based classifier function
def classify_claim(claim, evidences):
    """
    A basic rule-based classifier using keyword heuristics.

    Args:
        claim: Claim text.
        evidences: List of dicts; the "answer" value of each is the evidence
            text. Entries without an "answer" (or with None) are ignored.

    Returns:
        "Supported", "Refuted" or "Not Enough Evidence".

    Keywords are matched as whole words/phrases, so "is" does not fire on
    "this" and "false" does not fire on "falsely".
    """
    import re  # local import keeps this cell self-contained

    def mentions_any(text, phrases):
        # \b anchors restrict each phrase to whole-word matches
        return any(re.search(rf"\b{re.escape(p)}\b", text) for p in phrases)

    claim_lower = claim.lower()
    combined_evidence = " ".join((e.get("answer") or "").lower() for e in evidences)

    # Example rules (extend or refine as needed)
    if mentions_any(combined_evidence, ["confirms", "proves", "shows that"]) and \
       mentions_any(claim_lower, ["is", "are", "was", "were"]):
        return "Supported"
    elif mentions_any(combined_evidence, ["denies", "contradicts", "refutes", "false"]):
        return "Refuted"
    else:
        return "Not Enough Evidence"  # fallback default

# Input: retrieval output; output: the same records with a predicted label
input_file = "retrieved_predictions.jsonl"
output_file = "classified_predictions.jsonl"

classified_predictions = []

# Read one JSON record per line and label it with the rule-based classifier
with open(input_file, "r") as fin:
    for line in tqdm(fin, desc="Classifying claims"):
        example = json.loads(line)
        claim = example["claim"]
        evidences = example.get("evidence", [])

        label = classify_claim(claim, evidences)
        example["pred_label"] = label
        classified_predictions.append(example)

# Write the labelled records back out, one JSON object per line
with open(output_file, "w") as fout:
    fout.writelines(json.dumps(record) + "\n" for record in classified_predictions)

print(f" Classification complete. Results saved to: {output_file}")


# In[ ]:


# The submission requires the evidence field to be a list of QA pairs
import json
from tqdm import tqdm

# Input: classified records; output: records whose evidence is a QA-pair list
input_file = "classified_predictions.jsonl"
output_file = "formatted_submission.jsonl"

formatted_predictions = []

# Rebuild every evidence entry as a question/answer/url/scraped_text dict
with open(input_file, "r") as fin:
    for line in tqdm(fin, desc="Formatting QA pairs"):
        example = json.loads(line)
        claim = example["claim"]
        evidences = example.get("evidence", [])

        # The claim doubles as the question for now; the scraped text is
        # used both as the answer and as the supporting passage.
        qa_pairs = [
            {
                "question": claim,
                "answer": ev["scraped_text"],
                "url": ev.get("url", ""),
                "scraped_text": ev["scraped_text"],
            }
            for ev in evidences
        ]

        example["evidence"] = qa_pairs
        formatted_predictions.append(example)

# Save updated output, one JSON object per line
with open(output_file, "w") as fout:
    fout.writelines(json.dumps(record) + "\n" for record in formatted_predictions)

print(f"✅ Evidence formatting complete. Output saved to: {output_file}")


# In[ ]:


# Create evidence in a structured format
import csv
import json
from typing import List, Dict

# Simulated input claim (normally this would be read from a .jsonl file)
# Keys used further down in this cell: "claim_id", "url" and "url2text"
# (a list of scraped sentences). "type" and "query" are not read here.
input_claim = {
    "claim_id": "7",
    "type": "gold",
    "query": "_",
    "url": "https://web.archive.org/web/20210212182804/https://www.nytimes.com/2020/09/29/us/trump-750-taxes.html",
    "url2text": [
        "Trump Paid $750 in Federal Income Taxes in 2017. Here’s the Math.",
        "Figures drawn from President Trump’s tax-return data show how that number was calculated.",
        "The small amount of federal income taxes President Trump paid in both 2016 and 2017 — just $750 each year — has become the focus of much attention since it was revealed in a New York Times investigation.",
        "The figures below, drawn from Mr. Trump’s tax-return data for 2017, show how his accountants arrived at that figure for one of those years.",
        # (other sentences omitted for brevity)
        "But on the Form 3800 for the General Business Credit, his accountants subtracted $750 from his allowable credit.",
        "Why they did that is not clear. But the result was a total federal income tax liability of $750."
    ]
}

# Turn raw sentences into a full submission record with QA-style evidence
def build_structured_evidence(claim_id: str, claim_text: str, url: str, sentences: List[str]) -> Dict:
    """Build one prediction record whose evidence has one QA item per sentence.

    Every item shares the same templated question about *claim_text*; the
    sentence is used both as the answer and as the scraped text. The record's
    "pred_label" is a fixed "SUPPORTED" placeholder.
    """
    # Simplified question generation: one template for all sentences
    shared_question = f"What evidence supports or refutes the claim: '{claim_text}'?"

    qa_items = [
        {
            "question": shared_question,
            "answer": sentence,
            "url": url,
            "scraped_text": sentence,
        }
        for sentence in sentences
    ]

    return {
        "claim_id": claim_id,
        "claim": claim_text,
        "pred_label": "SUPPORTED",  # This should be predicted by your model
        "evidence": qa_items,
    }

# Build the structured record for the simulated claim
claim_text = "Donald Trump paid only $750 in federal income taxes in 2017."  # Hypothetical claim
structured_data = build_structured_evidence(
    input_claim["claim_id"],
    claim_text,
    input_claim["url"],
    input_claim["url2text"],
)

# Write prediction records to a CSV file in AVeriTeC format
def save_submission_to_csv(data: List[Dict], output_file: str = "submission.csv"):
    """Write *data* to *output_file* as UTF-8 CSV.

    Columns are claim_id, claim, pred_label and evidence; the evidence list
    is stored as a JSON string (non-ASCII characters kept as-is).
    """
    columns = ["claim_id", "claim", "pred_label", "evidence"]

    with open(output_file, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        for item in data:
            # Evidence must be serialized as JSON inside the CSV field
            row = {key: item[key] for key in columns[:3]}
            row["evidence"] = json.dumps(item["evidence"], ensure_ascii=False)
            writer.writerow(row)

# Save our one example
# (no output path is passed, so the function's default "submission.csv" is written)
save_submission_to_csv([structured_data])


# In[ ]:


#automatic classification of the claim based on the evidence 
import csv
import json
from typing import List, Dict

# Simulated input data
# Keys used further down in this cell: "claim_id", "url" and "url2text"
# (a list of scraped sentences). "type" and "query" are not read here.
input_claim = {
    "claim_id": "7",
    "type": "gold",
    "query": "_",
    "url": "https://web.archive.org/web/20210212182804/https://www.nytimes.com/2020/09/29/us/trump-750-taxes.html",
    "url2text": [
        "Trump Paid $750 in Federal Income Taxes in 2017. Here’s the Math.",
        "Figures drawn from President Trump’s tax-return data show how that number was calculated.",
        "The small amount of federal income taxes President Trump paid in both 2016 and 2017 — just $750 each year — has become the focus of much attention since it was revealed in a New York Times investigation.",
        "The figures below, drawn from Mr. Trump’s tax-return data for 2017, show how his accountants arrived at that figure for one of those years.",
        "But on the Form 3800 for the General Business Credit, his accountants subtracted $750 from his allowable credit.",
        "Why they did that is not clear. But the result was a total federal income tax liability of $750."
    ]
}

# Sample claim text (in real scenario, this would come from your dataset)
# The record above has no "claim" key, so the claim is supplied separately.
claim_text = "Donald Trump paid only $750 in federal income taxes in 2017."

# Step 1: Wrap each scraped sentence as a QA-style evidence item
def build_structured_evidence(claim_id: str, claim_text: str, url: str, sentences: List[str]) -> List[Dict]:
    """Return one evidence dict per sentence, all sharing one templated question.

    *claim_id* is accepted for signature compatibility but not used.
    """
    template_question = f"What evidence supports or refutes the claim: '{claim_text}'?"

    def as_item(sentence):
        # The sentence serves as both the answer and the scraped text
        return {
            "question": template_question,
            "answer": sentence,
            "url": url,
            "scraped_text": sentence,
        }

    return [as_item(sentence) for sentence in sentences]

# Step 2: Label the claim from keyword matches in its evidence
def classify_claim(claim: str, evidence_sentences: List[str]) -> str:
    """Return "SUPPORTED", "REFUTED" or "NOT ENOUGH EVIDENCE".

    Simple heuristic rules (can be replaced with a transformer or LLM model):
    the evidence is lower-cased and joined, then checked for fixed phrases.
    """
    evidence_blob = " ".join(evidence_sentences).lower()

    # Rule 1: the evidence states the $750 payment
    support_markers = ("paid $750", "$750 in federal income taxes")
    if any(marker in evidence_blob for marker in support_markers):
        return "SUPPORTED"

    # Rule 2: the claim denies any payment but the evidence mentions one
    if "did not pay any taxes" in claim.lower() and "paid" in evidence_blob:
        return "REFUTED"

    # Everything else, including explicitly uncertain evidence
    return "NOT ENOUGH EVIDENCE"

# Step 3: Compose one structured record
def process_claim_record(claim_data: Dict) -> Dict:
    """Build the submission record (label + QA evidence) for one claim.

    Args:
        claim_data: Dict with "claim_id", "url" and "url2text". An optional
            "claim" key supplies the claim text; when it is absent the
            module-level ``claim_text`` is used, as before.

    Returns:
        Dict with "claim_id", "claim", "pred_label" and "evidence".
    """
    # Prefer the record's own claim so several records can be processed
    # without rebinding a global between calls.
    claim = claim_data["claim"] if "claim" in claim_data else claim_text

    evidence = build_structured_evidence(
        claim_id=claim_data["claim_id"],
        claim_text=claim,
        url=claim_data["url"],
        sentences=claim_data["url2text"]
    )
    predicted_label = classify_claim(claim, claim_data["url2text"])

    return {
        "claim_id": claim_data["claim_id"],
        "claim": claim,
        "pred_label": predicted_label,
        "evidence": evidence
    }

# Step 4: Export the structured predictions as CSV
def save_submission_to_csv(data: List[Dict], output_file: str = "submission.csv"):
    """Write each record of *data* as one UTF-8 CSV row.

    The evidence list of every record is JSON-encoded (without ASCII
    escaping) so it fits into a single CSV field.
    """
    header = ["claim_id", "claim", "pred_label", "evidence"]

    def to_row(record):
        # Only the four submission columns are exported
        return {
            "claim_id": record["claim_id"],
            "claim": record["claim"],
            "pred_label": record["pred_label"],
            "evidence": json.dumps(record["evidence"], ensure_ascii=False),
        }

    with open(output_file, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for record in data:
            writer.writerow(to_row(record))

# Run full pipeline on one claim
# (builds the record for input_claim and writes it to the default "submission.csv")
structured_claim = process_claim_record(input_claim)
save_submission_to_csv([structured_claim])


# In[ ]:


# csv export format
import json
import csv
from tqdm import tqdm

# === Configuration ===
INPUT_JSONL_PATH = "retrieved_predictions.jsonl"  # input JSONL file with claim predictions
OUTPUT_CSV_PATH = "submission.csv"                # output CSV for Hugging Face leaderboard

# === Load JSONL predictions ===
# NOTE(review): evidence is flattened into one space-joined string here,
# whereas the other export cells store it as a JSON list -- confirm which
# format the leaderboard expects.
rows = []
with open(INPUT_JSONL_PATH, "r", encoding="utf-8") as infile:
    for line in tqdm(infile, desc="Reading JSONL predictions"):
        example = json.loads(line)

        # Collect the answer text of every evidence item
        answers = [ev.get("answer", "") for ev in example.get("evidence", [])]

        rows.append({
            "claim_id": example.get("claim_id", ""),
            "claim": example.get("claim", ""),
            "pred_label": example.get("pred_label", "Not Enough Evidence"),  # default fallback
            "evidence": " ".join(answers),
        })

# === Write to CSV ===
with open(OUTPUT_CSV_PATH, "w", encoding="utf-8", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["claim_id", "claim", "pred_label", "evidence"])
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"✅ Export completed: {OUTPUT_CSV_PATH} is ready for the Hugging Face leaderboard.")



# In[ ]:


# Measure the inference time per claim
import time
import json

# Stand-in for the real model: waits, then returns a canned result
# In practice, replace this with your actual model or processing logic
def process_claim(claim):
    """Sleep for half a second and return a fixed result for *claim*."""
    simulated_latency = 0.5  # seconds of pretend inference work
    time.sleep(simulated_latency)

    dummy_result = {
        "claim_id": claim['claim_id'],
        "label": "Supported",
        "evidence": "Some evidence",
    }
    return dummy_result

# Load your dataset (replace with the actual loading logic)
# Accepts both JSONL (one claim per line) and a single JSON document.
def load_data(file_path):
    """Load claims from *file_path* and return them as a list.

    The file is read as UTF-8. If its whole content is one JSON document, a
    top-level list is returned as-is and any other value is wrapped in a
    one-element list. Otherwise the file is treated as JSONL: every
    non-blank line is parsed as its own JSON value.

    Raises:
        json.JSONDecodeError: If a JSONL line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        document = json.loads(content)
    except json.JSONDecodeError:
        # Not a single JSON document: parse line by line, skipping blank
        # lines (for example a trailing newline or separators).
        return [json.loads(line) for line in content.splitlines() if line.strip()]

    return document if isinstance(document, list) else [document]

# Time the processing of every claim and report it
def evaluate_inference_time(claims):
    """Run process_claim on each claim and print how long each call took."""
    for claim in claims:
        started_at = time.time()

        # Process the claim (replace with your actual model inference code);
        # the returned result is not used, only the duration matters here
        process_claim(claim)

        inference_time = time.time() - started_at

        # Print or log the results
        print(f"Claim ID: {claim['claim_id']} - Inference Time: {inference_time:.4f} seconds")
        # To persist the timings instead, append them to a file, e.g.:
        # with open('inference_times.csv', 'a') as f:
        #     f.write(f"{claim['claim_id']},{inference_time:.4f}\n")

# Main function to execute the evaluation
def main():
    """Load the dev claims and print the inference time of each one."""
    # Load your dataset (replace with the actual file path).
    # Raw string: in a normal literal "\U" starts a Unicode escape, which
    # makes this Windows path a syntax error.
    claims = load_data(r'C:\Users\Gaetano\Desktop\DATASET_AVERITEC\dev.json')

    # Evaluate the inference time for each claim
    evaluate_inference_time(claims)

# Run the main function
if __name__ == "__main__":
    main()


# In[ ]:


import time
import json
import numpy as np
from sklearn.metrics import recall_score
from collections import Counter

# Simulated knowledge base example
# Maps a claim id to its source URL and the sentences scraped from it.
# NOTE(review): this rebinds `knowledge_base`, which the data-loading cell at
# the top of the file may already have filled from disk.
knowledge_base = {
    "7": {
        "url": "https://web.archive.org/web/20210212182804/https://www.nytimes.com/2020/09/29/us/trump-750-taxes.html",
        "url2text": [
            "Trump Paid $750 in Federal Income Taxes in 2017. Here’s the Math.",
            "Figures drawn from President Trump’s tax-return data show how that number was calculated.",
            "The small amount of federal income taxes President Trump paid in both 2016 and 2017 — just $750 each year — has become the focus of much attention since it was revealed in a New York Times investigation."
            # Add more evidence here
        ]
    }
    # You would continue to add more claims with their evidence here
}

# Simulated example claim and evidence
# "gold_evidence" lists the annotated reference evidence that the retrieved
# sentences are compared against.
claims = [
    {
        "claim_id": "7",
        "claim": "Trump paid only $750 in federal income taxes in 2017.",
        "gold_evidence": [
            "Trump paid $750 in federal income taxes in 2017",
            "The small amount of federal income taxes President Trump paid in both 2016 and 2017"
            # You would continue with the rest of the annotated evidence
        ]
    }
]

def process_claim(claim, knowledge_base):
    """
    Look up the evidence stored for a claim in the knowledge base.

    Returns the "url2text" list of the entry whose key equals the claim's
    "claim_id". No verdict is computed; retrieval is simulated by this
    direct lookup.
    """
    kb_entry = knowledge_base[claim["claim_id"]]
    return kb_entry["url2text"]

def calculate_ev2r_recall(gold_evidence, retrieved_evidence):
    """
    Calculate a recall score by comparing retrieved evidence against gold evidence.

    Evidence items are compared as whole strings (exact match), with
    duplicates counted: a gold item that appears twice must be retrieved
    twice to be fully covered.
    NOTE(review): exact string matching is only a rough stand-in for the
    Ev2R metric -- confirm against the official scorer.

    Args:
        gold_evidence: List of annotated evidence strings.
        retrieved_evidence: List of retrieved evidence strings.

    Returns:
        Fraction of gold items found among the retrieved ones, in [0, 1];
        0.0 when there is no gold evidence.
    """
    gold_counter = Counter(gold_evidence)

    # Denominator: every gold item, duplicates included. Using the number of
    # distinct items here would let the score exceed 1 on repeated gold items.
    total_gold = sum(gold_counter.values())
    if total_gold == 0:
        return 0.0

    # Multiset intersection: each gold item is matched at most as often as
    # it was retrieved.
    matched = sum((gold_counter & Counter(retrieved_evidence)).values())
    return matched / total_gold

def evaluate_claims(claims, knowledge_base):
    """
    Evaluate the claims using the recall metric and print the results.

    For every claim the evidence is retrieved with ``process_claim`` and scored
    against the claim's "gold_evidence" with ``calculate_ev2r_recall``. The
    per-claim processing time and recall are printed, followed by the average
    recall over all claims (0.0 when ``claims`` is empty).

    Args:
        claims: Sized iterable of mappings with "claim_id" and "gold_evidence" keys.
        knowledge_base: Passed through unchanged to ``process_claim``.

    Returns:
        None. All results are written to stdout.
    """
    total_recall = 0
    total_claims = len(claims)

    for claim in claims:
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # short durations; time.time() is a wall clock that can be too coarse
        # (or jump) for sub-millisecond work like this lookup.
        start_time = time.perf_counter()

        # Retrieve evidence and score it against the annotated evidence
        retrieved_evidence = process_claim(claim, knowledge_base)
        recall = calculate_ev2r_recall(claim["gold_evidence"], retrieved_evidence)
        total_recall += recall

        # Print inference time and recall for each claim
        inference_time = time.perf_counter() - start_time
        print(f"Claim {claim['claim_id']} processed in {inference_time:.4f} seconds.")
        print(f"Ev2R Recall for claim {claim['claim_id']}: {recall:.4f}\n")

    # Calculate the average recall across all claims
    avg_recall = total_recall / total_claims if total_claims > 0 else 0.0
    print(f"Average Ev2R Recall: {avg_recall:.4f}")

# Run the evaluation on the example claims; per-claim timing and recall, plus
# the average recall, are printed to stdout.
evaluate_claims(claims, knowledge_base)


# In[ ]:


# Taking into account the whole knowledge base
import json

# Define the knowledge base
class KnowledgeBase:
    """
    In-memory store of knowledge-base documents keyed by claim id.

    Each stored document is expected to be a mapping with at least a 'url' key
    and a 'url2text' key holding the document text (a list of strings, or a
    single string).
    """

    def __init__(self):
        # Maps claim_id -> document mapping.
        self.claims = {}

    def add_claim(self, claim_id, claim_data):
        """Add a claim to the knowledge base, replacing any entry with the same id."""
        self.claims[claim_id] = claim_data

    def search_claims(self, query):
        """
        Search for claims whose text contains the query.

        The match is a case-insensitive substring test against every string in
        the document's 'url2text'.

        Args:
            query: Text to look for.

        Returns:
            A list with one dict per matching document, holding 'claim_id',
            'url' and 'claim_text' (the document's 'url2text' value, unchanged).

        Raises:
            KeyError: If a stored document lacks 'url2text', or a matching
                document lacks 'url'.
        """
        needle = query.lower()
        results = []
        for claim_id, claim_data in self.claims.items():
            texts = claim_data['url2text']
            # Accept a bare string as a one-element document so that iterating
            # below walks sentences, not characters.
            if isinstance(texts, str):
                texts = [texts]
            # Substring match inside each sentence; a plain `needle in texts`
            # on a list would only match a sentence equal to the whole query.
            if any(needle in text.lower() for text in texts):
                results.append({
                    'claim_id': claim_id,
                    'url': claim_data['url'],
                    'claim_text': claim_data['url2text']
                })
        return results

    def display_claim(self, claim_id):
        """
        Display a specific claim based on the claim_id.

        Prints the id, the URL and each text item of the stored document, or
        "Claim not found." when there is no (non-empty) entry for ``claim_id``.
        """
        claim = self.claims.get(claim_id)
        if not claim:
            print("Claim not found.")
            return

        print(f"Claim ID: {claim_id}")
        print(f"URL: {claim['url']}")
        print("Claim Text: ")
        for text in claim['url2text']:
            print(text)

# Initialize the knowledge base
# NOTE: this rebinds the module-level name `knowledge_base`. process_claim()
# subscripts its knowledge_base argument (knowledge_base[claim_id]), which the
# KnowledgeBase class does not support, so this object must not be passed to
# evaluate_claims()/process_claim().
knowledge_base = KnowledgeBase()

# Add documents to the knowledge base
# Each document has a "claim_id", a source "url" and the page text split into
# strings under "url2text". The "type" and "query" fields are stored as-is;
# the KnowledgeBase methods only read "url" and "url2text".
documents = [
    {
        "claim_id": "1",
        "type": "gold",
        "query": "_",
        "url": "https://www.tesla.com/about",
        "url2text": [
            "Tesla was founded in 2003 by engineers Martin Eberhard and Marc Tarpenning.",
            "The mission of Tesla is to accelerate the world's transition to sustainable energy."
        ]
    },
    {
        "claim_id": "2",
        "type": "gold",
        "query": "_",
        "url": "https://en.wikipedia.org/wiki/Elon_Musk",
        "url2text": [
            "Elon Musk was born on June 28, 1971, in Pretoria, South Africa.",
            "He is known for founding several technology companies including Tesla, SpaceX, and Neuralink."
        ]
    },
    {
        "claim_id": "3",
        "type": "gold",
        "query": "_",
        "url": "https://www.familytreedna.com/groups/nadar/about/background",
        "url2text": [
            "Nadar (also referred as Nadan, Shanar) is a caste of Tamil Nadu & Kerala, South India.",
            "The Nadars are an entrepreneurial south Indian caste and constitute 12% of Tamil Nadu's population."
        ]
    },
    {
        "claim_id": "4",
        "type": "gold",
        "query": "_",
        "url": "https://erau.edu/gaetz-aerospace-institute/about/don-gaetz",
        "url2text": [
            "Florida State Sen. Donald J. Gaetz (R-Destin) represents District 1 in the Florida Panhandle.",
            "Gaetz recently completed a two-year term as president of the Florida Senate."
        ]
    },
    {
        "claim_id": "5",
        "type": "gold",
        "query": "_",
        "url": "https://web.archive.org/web/20210302193538/https://www.theguardian.com/world/2020/apr/06/report-set-to-blame-syria-chemical-attacks-on-bashar-al-assad",
        "url2text": [
            "The UN’s chemical weapons watchdog is expected to release its first report explicitly blaming Bashar al-Assad for sarin and chlorine gas attacks on civilians in Syria."
        ]
    },
    {
        "claim_id": "6",
        "type": "gold",
        "query": "_",
        "url": "https://injuryfacts.nsc.org/home-and-community/safety-topics/covid-19-cases-in-the-united-states/",
        "url2text": [
            "As of April 30, 2022 data on this page is no longer being updated.",
            "The COVID-19 pandemic has impacted health and safety in many ways beyond the immediate impact of infections."
        ]
    },
    {
        "claim_id": "7",
        "type": "gold",
        "query": "_",
        "url": "https://web.archive.org/web/20210212182804/https://www.nytimes.com/2020/09/29/us/trump-750-taxes.html",
        "url2text": [
            "Trump paid $750 in federal income taxes in 2017. Here’s the math.",
            "The figures below, drawn from Mr. Trump’s tax-return data for 2017, show how his accountants arrived at that figure."
        ]
    }
]

# Populate the knowledge base
# The whole document dict (including its own "claim_id") is stored under its id.
for document in documents:
    knowledge_base.add_claim(document['claim_id'], document)

# Function to display all claims
def display_all_claims():
    """Print every document held by the module-level ``knowledge_base``."""
    show = knowledge_base.display_claim
    for stored_id in knowledge_base.claims:
        show(stored_id)

# Function to search for claims based on a query
def search_and_display_claims(query):
    """
    Search the module-level ``knowledge_base`` for ``query`` and print the hits.

    Prints the id, URL and text items of every match, each followed by a blank
    separator, or a "no claims found" message when the search returns nothing.
    """
    matches = knowledge_base.search_claims(query)

    # Nothing matched: report it and stop.
    if not matches:
        print("No claims found for this query.")
        return

    for match in matches:
        print(f"Claim ID: {match['claim_id']}")
        print(f"URL: {match['url']}")
        print("Claim Text: ")
        for sentence in match['claim_text']:
            print(sentence)
        print("\n")

# Example usage of the functions
# First dump every stored document, then run a sample keyword search; both
# helpers print their output to stdout.
print("All claims in the knowledge base:")
display_all_claims()

print("\nSearching for claims related to 'taxes':")
search_and_display_claims("taxes")

