Spaces:

jsdosanj
/

SikhLibrarian

Sleeping

App Files Files Community

SikhLibrarian / app.py

jsdosanj

Update app.py

ecd38cd verified 25 days ago

raw

history blame contribute delete

90.2 kB

	#!/usr/bin/env python3
	"""
	app.py — AI Sikh Librarian v22
	================================
	"Midnight Archive" — iOS 26 Glassmorphism redesign
	Gradio 6 fixes applied:
	- theme + css passed to launch(), not gr.Blocks()
	- bubble_full_width removed from gr.Chatbot()
	- font= list uses plain strings only (no GoogleFont mix — causes AttributeError)
	"""

	import gc, html, json, os, pickle, re, sqlite3, tempfile, threading, time
	from collections import OrderedDict
	from pathlib import Path
	from typing import Dict, Generator, List, Optional, Tuple

	import bm25s as bm25s_lib
	import faiss
	import gradio as gr
	import numpy as np
	from huggingface_hub import InferenceClient, hf_hub_download
	from rank_bm25 import BM25Okapi
	from sentence_transformers import SentenceTransformer

	try:
	from langdetect import detect as _langdetect
	from langdetect.lang_detect_exception import LangDetectException
	_LANGDETECT_AVAILABLE = True
	except ImportError:
	_LANGDETECT_AVAILABLE = False


	# ══════════════════════════════════════════════════════════════════════════════
	# 1. CONFIGURATION
	# ══════════════════════════════════════════════════════════════════════════════

	# True only when the FREE_TIER environment variable is the string "true"
	# (case-insensitive). _mmr_rerank skips MMR re-ranking when this is set.
	FREE_TIER = os.environ.get("FREE_TIER", "false").lower() == "true"

	# Model identifiers and the Hub dataset repo/sub-directory holding the index files.
	LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"
	EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
	STORAGE_REPO = "jsdosanj/SikhLibrarian-storage"
	STORAGE_SUBDIR = "index_output_v2"

	# Research-mode retrieval knobs.
	# RELEVANCE_THRESHOLD is compared against the fused hit score in _research.
	RELEVANCE_THRESHOLD = 0.40
	# _research passes max(RESEARCH_FAISS_K, RESEARCH_BM25_K) as k_per_subquery.
	RESEARCH_FAISS_K = 600
	RESEARCH_BM25_K = 600
	MAX_RESEARCH_PASSAGES = 100
	# Maximum hits kept per display_name by _apply_source_cap.
	PER_SOURCE_CAP = 15
	# Default k constant in the 1 / (k + rank) term of _rrf_fuse.
	RRF_K = 60
	# Defaults for _mmr_rerank (relevance weight and number of hits selected).
	MMR_LAMBDA = 0.7
	MMR_TOP_K = 120

	# NOTE(review): the LEARN_* / MAX_CONTEXT_WORDS values are not referenced in
	# the part of the file reviewed here; presumably used by a "learn" mode.
	LEARN_FAISS_K = 20
	LEARN_MODE_K = 5
	MAX_CONTEXT_WORDS = 1_400

	# Window size (in words) and stride used by _best_snippet_window.
	SNIPPET_WORDS = 80
	SNIPPET_STEP = 5

	# MAX_QUERY_LEN truncates sanitised input; MIN_QUERY_LEN / MAX_SUBQUERIES
	# bound the sub-queries produced by _multi_query_search.
	MAX_QUERY_LEN = 500
	MIN_QUERY_LEN = 3
	MAX_SUBQUERIES = 3

	# NOTE(review): LLM_* settings are not used in the reviewed portion of the
	# file; presumably generation parameters and retry back-off (seconds).
	LLM_MAX_TOKENS = 2_048
	LLM_TEMPERATURE = 0.15
	LLM_TOP_P = 0.9
	LLM_MAX_RETRIES = 3
	LLM_RETRY_DELAYS = [5, 15, 30]

	# Default capacity of the _EmbedCache LRU cache.
	EMBED_CACHE_SIZE = 10_000

	# Index locations (checked in this order by _resolve_index_dir) and file names.
	BUCKET_DIR = Path("/data") / STORAGE_SUBDIR
	CACHE_DIR = Path("./index_cache")
	INDEX_FILE = "faiss.index"
	SQLITE_FILE = "doc_store.sqlite"
	META_FILE = "meta.json"
	BM25_FILE = "bm25.pkl"
	# SQLite file written by _log_query_async.
	ANALYTICS_DB = Path("/data/query_log.sqlite")

	# "All" is the sentinel the search functions treat as "no category filter".
	CATEGORY_OPTIONS = ["All", "Gurbani", "Granths", "Steeks", "Literature", "Research"]

	# Source display names (Gurmukhi, written as \u escapes) that are always
	# offered as source facets, even before the index metadata has been loaded;
	# _build_source_facets unions them with the names found in the metadata.
	KNOWN_SOURCES: List[str] = [
	"\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40 (112 \u0a2d\u0a3e\u0a38\u0a3c\u0a3e\u0a35\u0a3e\u0a02)",
	"\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25",
	"\u0a38\u0a4d\u0a30\u0a40 \u0a38\u0a30\u0a2c\u0a32\u0a4b\u0a39 \u0a17\u0a4d\u0a30\u0a70\u0a25",
	"\u0a2d\u0a3e\u0a08 \u0a17\u0a41\u0a30\u0a26\u0a3e\u0a38 \u0a1c\u0a40 \u0a26\u0a40\u0a06\u0a02 \u0a35\u0a3e\u0a30\u0a3e\u0a02",
	"\u0a2d\u0a3e\u0a08 \u0a17\u0a41\u0a30\u0a26\u0a3e\u0a38 \u0a38\u0a3f\u0a70\u0a18 \u0a1c\u0a40 \u0a26\u0a40\u0a06\u0a02 \u0a15\u0a2c\u0a3f\u0a71\u0a24 \u0a38\u0a35\u0a71\u0a0d\u0a0f",
	"\u0a2e\u0a39\u0a3e\u0a28 \u0a15\u0a4b\u0a38\u0a3c",
	"\u0a38\u0a42\u0a30\u0a1c \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
	"\u0a2a\u0a70\u0a25 \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
	"\u0a17\u0a41\u0a30 \u0a2a\u0a4d\u0a30\u0a24\u0a3e\u0a2a \u0a38\u0a42\u0a30\u0a1c \u0a17\u0a4d\u0a30\u0a70\u0a25",
	"\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30 \u0a38\u0a4b\u0a2d\u0a3e",
	"\u0a2c\u0a70\u0a38\u0a3e\u0a35\u0a32\u0a40\u0a28\u0a3e\u0a2e\u0a3e",
	"\u0a2e\u0a39\u0a3f\u0a2e\u0a3e \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
	"\u0a38\u0a3c\u0a2c\u0a26\u0a3e\u0a30\u0a25 \u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40",
	"\u0a2b\u0a30\u0a40\u0a26\u0a15\u0a4b\u0a1f \u0a35\u0a3e\u0a32\u0a3e \u0a1f\u0a40\u0a15\u0a3e",
	"\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a26\u0a30\u0a2a\u0a23",
	"\u0a38\u0a39\u0a3f\u0a1c \u0a2a\u0a3e\u0a20 \u0a26\u0a40 \u0a38\u0a70\u0a25\u0a3f\u0a06",
	"\u0a1c\u0a28\u0a2e \u0a38\u0a3e\u0a16\u0a40 \u0a2d\u0a3e\u0a08 \u0a2e\u0a28\u0a40 \u0a38\u0a3f\u0a70\u0a18",
	"\u0a1c\u0a28\u0a2e \u0a38\u0a3e\u0a16\u0a40 \u0a35\u0a3e\u0a32\u0a40 \u0a35\u0a3e\u0a32\u0a3e",
	"\u0a30\u0a39\u0a3f\u0a24\u0a28\u0a3e\u0a2e\u0a47",
	"\u0a1c\u0a70\u0a17\u0a28\u0a3e\u0a2e\u0a47",
	"\u0a38\u0a3f\u0a71\u0a16 \u0a10\u0a28\u0a38\u0a3e\u0a08\u0a15\u0a32\u0a4b\u0a2a\u0a40\u0a21\u0a40\u0a06",
	"\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e \u0a21\u0a47\u0a1f\u0a3e\u0a2c\u0a47\u0a38",
	]


	# ══════════════════════════════════════════════════════════════════════════════
	# 2. GLOBAL STATE
	# ══════════════════════════════════════════════════════════════════════════════
	# Module-level singletons. The index-related ones are populated by
	# _load_index_background; the clients by _init_embedder / _init_llm_client.
	_vector_index: Optional[faiss.IndexFlatIP] = None
	# One metadata dict per chunk, indexed by chunk id (see the search functions).
	_metadata_store: List[Dict] = []
	# Path of doc_store.sqlite, or None when the file was not found.
	_sqlite_path: Optional[str] = None
	_embedder: Optional[SentenceTransformer] = None
	_llm_client: Optional[InferenceClient] = None

	# BM25 state set by _build_bm25_index. _bm25_type is "bm25s" when loaded from
	# the pickle, "rank-bm25" when rebuilt from SQLite, "" when unavailable.
	_bm25_index: object = None
	# Maps a BM25 result position to a chunk id.
	_bm25_ids: List[int] = []
	_bm25_vocab: Dict[str, int] = {}
	_bm25_type: str = ""

	# Source filter choices; replaced under the lock by _build_source_facets.
	_source_facets = ["All Sources"] + sorted(KNOWN_SOURCES)
	_source_facets_lock = threading.Lock()

	# Set by _load_index_background when loading finishes, on success or on error;
	# _index_status carries the current state, a progress message and any error text.
	_index_ready = threading.Event()
	_index_status = {"state": "starting", "progress": "", "error": None}


	# ══════════════════════════════════════════════════════════════════════════════
	# 3. LRU EMBED CACHE
	# ══════════════════════════════════════════════════════════════════════════════

	class _EmbedCache:
	    """Lock-protected LRU cache mapping a query string to its embedding array.

	    Arrays are copied on the way in and on the way out, so callers can never
	    mutate a cached entry. Hit/miss counters feed the ``stats`` property.
	    """

	    def __init__(self, maxsize: int = EMBED_CACHE_SIZE) -> None:
	        self._lock = threading.Lock()
	        self._maxsize = maxsize
	        self._cache = OrderedDict()
	        self._hits = 0
	        self._misses = 0

	    def get(self, key: str) -> Optional[np.ndarray]:
	        """Return a copy of the cached array for *key*, or None on a miss."""
	        with self._lock:
	            if key not in self._cache:
	                self._misses += 1
	                return None
	            self._hits += 1
	            # Mark the entry as most recently used.
	            self._cache.move_to_end(key)
	            return self._cache[key].copy()

	    def put(self, key: str, value: np.ndarray) -> None:
	        """Store a copy of *value*, evicting the oldest entry when full."""
	        with self._lock:
	            if key in self._cache:
	                self._cache.move_to_end(key)
	            elif len(self._cache) >= self._maxsize:
	                # Drop the least recently used entry to make room.
	                self._cache.popitem(last=False)
	            self._cache[key] = value.copy()

	    @property
	    def stats(self) -> Dict:
	        """Hit rate (as a percentage string), current size and capacity."""
	        with self._lock:
	            lookups = self._hits + self._misses
	            hit_rate = round(self._hits / lookups * 100, 1) if lookups else 0.0
	            return {
	                "hit_rate": f"{hit_rate}%",
	                "size": len(self._cache),
	                "capacity": self._maxsize,
	            }


	_embed_cache = _EmbedCache(maxsize=EMBED_CACHE_SIZE)


	# ══════════════════════════════════════════════════════════════════════════════
	# 4. LANGUAGE DETECTION
	# ══════════════════════════════════════════════════════════════════════════════

	# Language code -> English display name. _detect_language looks up the code
	# returned by langdetect here and falls back to the upper-cased code when
	# the code is not listed.
	_LANG_NAMES: Dict[str, str] = {
	"en": "English", "pa": "Punjabi", "hi": "Hindi", "ur": "Urdu",
	"ro": "Romanian", "fr": "French", "de": "German", "es": "Spanish",
	"it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
	"ru": "Russian", "tr": "Turkish", "ar": "Arabic", "fa": "Persian",
	"zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
	"ja": "Japanese", "ko": "Korean", "sv": "Swedish", "no": "Norwegian",
	"da": "Danish", "fi": "Finnish", "cs": "Czech", "sk": "Slovak",
	"hu": "Hungarian", "bg": "Bulgarian", "hr": "Croatian", "sr": "Serbian",
	"uk": "Ukrainian", "el": "Greek", "he": "Hebrew", "th": "Thai",
	"bn": "Bengali", "ta": "Tamil", "te": "Telugu", "mr": "Marathi",
	"gu": "Gujarati", "kn": "Kannada", "ml": "Malayalam",
	}
	_GURMUKHI_RE = re.compile(r"[\u0A00-\u0A7F]")


	def _detect_language(text: str) -> str:
	    """Return a display name for the language of *text*, or "" if unknown.

	    Text shorter than 10 characters (after stripping) is not classified.
	    Text in which more than a quarter of all characters fall in the Gurmukhi
	    block is reported as "Punjabi (Gurmukhi)" without consulting langdetect.
	    Otherwise langdetect is run on the first 300 characters when available;
	    any failure yields "".
	    """
	    if not text or len(text.strip()) < 10:
	        return ""

	    gurmukhi_count = sum(1 for _ in _GURMUKHI_RE.finditer(text))
	    if gurmukhi_count / max(len(text), 1) > 0.25:
	        return "Punjabi (Gurmukhi)"

	    if _LANGDETECT_AVAILABLE:
	        try:
	            code = _langdetect(text[:300])
	            # Unlisted codes are shown upper-cased rather than dropped.
	            return _LANG_NAMES.get(code, code.upper())
	        except Exception:
	            return ""
	    return ""


	# ══════════════════════════════════════════════════════════════════════════════
	# 5. INPUT SANITISATION
	# ══════════════════════════════════════════════════════════════════════════════

	# Matches every character that is NOT allowed in a query: anything other than
	# word characters, whitespace, basic punctuation, en/em dashes, "/", and the
	# Gurmukhi (U+0A00-0A7F), Devanagari (U+0900-097F) and Latin-extended
	# (U+00C0-024F) ranges. _sanitize replaces each match with a space.
	_ALLOWED_RE = re.compile(
	r"[^\w\s.,;:!?()\'\"\-\u2013\u2014/\u0A00-\u0A7F\u0900-\u097F\u00C0-\u024F]",
	re.UNICODE,
	)
	# The patterns below all match the Gurmukhi word "\u0a05\u0a70\u0a17" followed
	# by optional whitespace/NBSP and digits. _ANG_FULL_RE captures the whole
	# phrase; the other two capture only the digits.
	# NOTE(review): _ANG_NUM_RE and _ANG_IN_TEXT_RE are identical patterns.
	_ANG_FULL_RE = re.compile(r"(\u0a05\u0a70\u0a17[\s\u00A0]*\d+)", re.UNICODE)
	_ANG_NUM_RE = re.compile(r"\u0a05\u0a70\u0a17[\s\u00A0]*(\d+)", re.UNICODE)
	_ANG_IN_TEXT_RE = re.compile(r"\u0a05\u0a70\u0a17[\s\u00A0]*(\d+)", re.UNICODE)


	def _sanitize(text: str) -> str:
	    """Normalise user input into a safe, bounded query string.

	    Non-string input yields "". Otherwise HTML entities are decoded, every
	    character matched by ``_ALLOWED_RE`` becomes a space, whitespace runs are
	    collapsed to single spaces, the result is stripped, and finally cut to
	    ``MAX_QUERY_LEN`` characters.
	    """
	    if not isinstance(text, str):
	        return ""
	    decoded = html.unescape(text)
	    filtered = _ALLOWED_RE.sub(" ", decoded)
	    collapsed = re.sub(r"\s+", " ", filtered).strip()
	    return collapsed[:MAX_QUERY_LEN]


	# ══════════════════════════════════════════════════════════════════════════════
	# 6. QUERY EXPANSION
	# ══════════════════════════════════════════════════════════════════════════════

	# Trigger phrase (lower-case, matched as a substring of the lower-cased query)
	# -> extra search terms appended to the query by _expand_query.
	_QUERY_EXPANSIONS: Dict[str, List[str]] = {
	    "waheguru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
	    "wahiguru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
	    "waheguruji": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
	    "wahe guru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
	    "guru granth": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40", "\u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25"],
	    "guru granth sahib": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40"],
	    "sggs": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40"],
	    "japji": ["\u0a1c\u0a2a\u0a41 \u0a1c\u0a40 \u0a38\u0a3e\u0a39\u0a3f\u0a2c", "\u0a1c\u0a2a\u0a41\u0a1c\u0a40"],
	    "japji sahib": ["\u0a1c\u0a2a\u0a41 \u0a1c\u0a40 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
	    "naam simran": ["\u0a28\u0a3e\u0a2e \u0a38\u0a3f\u0a2e\u0a30\u0a28"],
	    "nam simran": ["\u0a28\u0a3e\u0a2e \u0a38\u0a3f\u0a2e\u0a30\u0a28"],
	    "naam": ["\u0a28\u0a3e\u0a2e"],
	    "simran": ["\u0a38\u0a3f\u0a2e\u0a30\u0a28"],
	    "seva": ["\u0a38\u0a47\u0a35\u0a3e"],
	    "sewa": ["\u0a38\u0a47\u0a35\u0a3e"],
	    "ardas": ["\u0a05\u0a30\u0a26\u0a3e\u0a38"],
	    "amritvela": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24 \u0a35\u0a47\u0a32\u0a3e"],
	    "amrit vela": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24 \u0a35\u0a47\u0a32\u0a3e"],
	    "amrit": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24"],
	    "sant sipahi": ["\u0a38\u0a70\u0a24-\u0a38\u0a3f\u0a2a\u0a3e\u0a39\u0a40"],
	    "sant-sipahi": ["\u0a38\u0a70\u0a24-\u0a38\u0a3f\u0a2a\u0a3e\u0a39\u0a40"],
	    "chardi kala": ["\u0a1a\u0a5c\u0a4d\u0a39\u0a26\u0a40 \u0a15\u0a32\u0a3e"],
	    "khalsa": ["\u0a16\u0a3c\u0a3e\u0a32\u0a38\u0a3e", "\u0a16\u0a3e\u0a32\u0a38\u0a3e"],
	    "akal purakh": ["\u0a05\u0a15\u0a3e\u0a32 \u0a2a\u0a41\u0a30\u0a16"],
	    "akaal purakh": ["\u0a05\u0a15\u0a3e\u0a32 \u0a2a\u0a41\u0a30\u0a16"],
	    "gurbani": ["\u0a17\u0a41\u0a30\u0a2c\u0a3e\u0a23\u0a40"],
	    "shabad": ["\u0a38\u0a3c\u0a2c\u0a26", "\u0a38\u0a2c\u0a26"],
	    "shabads": ["\u0a38\u0a3c\u0a2c\u0a26", "\u0a38\u0a2c\u0a26"],
	    "dasam granth": ["\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25"],
	    "dasam": ["\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25"],
	    "sarbloh": ["\u0a38\u0a4d\u0a30\u0a40 \u0a38\u0a30\u0a2c\u0a32\u0a4b\u0a39 \u0a17\u0a4d\u0a30\u0a70\u0a25"],
	    "nitnem": ["\u0a28\u0a3f\u0a24\u0a28\u0a47\u0a2e"],
	    "rehras": ["\u0a30\u0a39\u0a3f\u0a30\u0a3e\u0a38 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
	    "kirtan sohila": ["\u0a15\u0a40\u0a30\u0a24\u0a28 \u0a38\u0a4b\u0a39\u0a3f\u0a32\u0a3e"],
	    "anand sahib": ["\u0a06\u0a28\u0a70\u0a26 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
	    "mool mantar": ["\u0a2e\u0a42\u0a32 \u0a2e\u0a70\u0a24\u0a30"],
	    "mool mantra": ["\u0a2e\u0a42\u0a32 \u0a2e\u0a70\u0a24\u0a30"],
	    "ik onkar": ["\u0a74", "\u0a07\u0a71\u0a15 \u0a13\u0a05\u0a70\u0a15\u0a3e\u0a30"],
	    "haumai": ["\u0a39\u0a09\u0a2e\u0a48"],
	    "hukam": ["\u0a39\u0a41\u0a15\u0a2e"],
	    "nadar": ["\u0a28\u0a26\u0a30\u0a3f"],
	    "kirpa": ["\u0a15\u0a3f\u0a30\u0a2a\u0a3e"],
	    "grace": ["\u0a15\u0a3f\u0a30\u0a2a\u0a3e", "\u0a28\u0a26\u0a30\u0a3f"],
	    "death": ["mortality", "passing", "\u0a2e\u0a4c\u0a24"],
	    "fear": ["anxiety", "dread", "\u0a21\u0a30"],
	    "mental health": ["inner peace", "wellbeing", "\u0a2e\u0a28"],
	    "courage": ["bravery", "fearlessness", "\u0a39\u0a3f\u0a70\u0a2e\u0a24"],
	    "love": ["devotion", "bhakti", "\u0a2a\u0a4d\u0a30\u0a47\u0a2e", "\u0a2d\u0a17\u0a24\u0a40"],
	    "meditation": ["contemplation", "\u0a38\u0a3f\u0a2e\u0a30\u0a28", "\u0a27\u0a3f\u0a06\u0a28"],
	    "prayer": ["supplication", "\u0a05\u0a30\u0a26\u0a3e\u0a38", "\u0a2a\u0a4d\u0a30\u0a3e\u0a30\u0a25\u0a28\u0a3e"],
	    "equality": ["justice", "equity", "\u0a38\u0a2e\u0a3e\u0a28\u0a24\u0a3e"],
	    "martyrdom": ["sacrifice", "shaheedi", "\u0a38\u0a3c\u0a39\u0a40\u0a26\u0a40"],
	    "sovereignty": ["miri piri", "\u0a2e\u0a40\u0a30\u0a40 \u0a2a\u0a40\u0a30\u0a40"],
	    "langar": ["\u0a32\u0a70\u0a17\u0a30", "community kitchen"],
	    "gurdwara": ["\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e"],
	    "gurdwaras": ["\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a47", "\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e"],
	}


	def _expand_query(query: str) -> str:
	    """Append expansion terms for every trigger phrase found in *query*.

	    Triggers are matched as substrings of the lower-cased query. Each
	    expansion term is appended at most once, in first-seen order, and terms
	    that already appear as a whitespace-separated token of the query are
	    skipped. The query is returned unchanged when nothing new applies.
	    """
	    q_lower = query.lower()
	    existing = set(query.split())
	    # dict.fromkeys de-duplicates while keeping order: overlapping triggers
	    # such as "guru granth" and "guru granth sahib" map to the same term,
	    # which would otherwise be appended twice and over-weight it in BM25.
	    candidates = dict.fromkeys(
	        term
	        for trigger, terms in _QUERY_EXPANSIONS.items()
	        if trigger in q_lower
	        for term in terms
	    )
	    new_terms = [term for term in candidates if term not in existing]
	    if not new_terms:
	        return query
	    return query + " " + " ".join(new_terms)


	# ══════════════════════════════════════════════════════════════════════════════
	# 7. CLIENT INITIALISATION
	# ══════════════════════════════════════════════════════════════════════════════

	def _init_llm_client() -> None:
	    """Create the global InferenceClient for ``LLM_MODEL``.

	    Raises:
	        EnvironmentError: if the HF_TOKEN environment variable is unset or empty.
	    """
	    global _llm_client
	    hf_token = os.environ.get("HF_TOKEN")
	    if hf_token:
	        _llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
	        print(f"\u2705 LLM client ready \u2014 {LLM_MODEL}")
	        return
	    raise EnvironmentError("HF_TOKEN not set.")


	def _init_embedder() -> None:
	    """Load the sentence-transformer on CPU into the global ``_embedder``.

	    One throw-away string is encoded straight after loading so the first
	    real query does not pay the warm-up cost.
	    """
	    global _embedder
	    print(f"Loading embedding model ({EMBEDDING_MODEL})...")
	    model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
	    _embedder = model
	    model.encode(["warmup"], convert_to_numpy=True)
	    print("\u2705 Embedding model ready")


	# ══════════════════════════════════════════════════════════════════════════════
	# 8. INDEX LOADER
	# ══════════════════════════════════════════════════════════════════════════════

	def _resolve_index_dir() -> Optional[Path]:
	    """Return the first local directory that already holds the index.

	    ``BUCKET_DIR`` is checked before ``CACHE_DIR``; a directory qualifies when
	    both the FAISS index file and the metadata file exist in it. Returns None
	    when neither directory qualifies.
	    """
	    for directory in (BUCKET_DIR, CACHE_DIR):
	        index_present = (directory / INDEX_FILE).exists()
	        if not (index_present and (directory / META_FILE).exists()):
	            continue
	        print(f"\U0001f4c1 Index found at: {directory}")
	        return directory
	    return None


	def _download_from_hub(token: str) -> Path:
	    """Download the index files from the Hub dataset repo into ``CACHE_DIR``.

	    Each file is requested as ``STORAGE_SUBDIR/<name>``; when it lands in the
	    matching sub-directory of ``CACHE_DIR`` it is moved up one level unless a
	    file of that name is already there. A failed download is reported and
	    skipped, so the returned directory may be incomplete.
	    """
	    CACHE_DIR.mkdir(parents=True, exist_ok=True)
	    print(f"Downloading from {STORAGE_REPO}/{STORAGE_SUBDIR}...")
	    nested_dir = CACHE_DIR / STORAGE_SUBDIR
	    for fname in (INDEX_FILE, SQLITE_FILE, META_FILE, BM25_FILE):
	        _index_status["progress"] = f"Downloading {fname}..."
	        try:
	            hf_hub_download(
	                repo_id=STORAGE_REPO,
	                filename=f"{STORAGE_SUBDIR}/{fname}",
	                repo_type="dataset",
	                local_dir=str(CACHE_DIR),
	                token=token,
	            )
	            downloaded = nested_dir / fname
	            flattened = CACHE_DIR / fname
	            if downloaded.exists() and not flattened.exists():
	                downloaded.rename(flattened)
	            print(f" \u2705 {fname}")
	        except Exception as e:
	            print(f" \u26a0\ufe0f Could not download {fname}: {e}")
	    return CACHE_DIR


	def _build_bm25_index(idx_dir: Path) -> None:
	    """Populate the global BM25 state, preferring the pre-built pickle.

	    If ``BM25_FILE`` exists in *idx_dir* it is unpickled and used as a
	    "bm25s" retriever. If it is missing or fails to load, a "rank-bm25"
	    index is built from the ``chunks`` table of the SQLite doc store. When
	    neither source is usable the BM25 globals are left untouched.
	    """
	    global _bm25_index, _bm25_ids, _bm25_vocab, _bm25_type
	    pkl_path = idx_dir / BM25_FILE
	    if pkl_path.exists():
	        size_mb = pkl_path.stat().st_size / 1_048_576
	        _index_status["progress"] = "Loading pre-built BM25 index..."
	        print(f"\U0001f4e6 Loading pre-built BM25 index ({size_mb:.0f} MB)...")
	        t0 = time.time()
	        try:
	            # SECURITY NOTE: pickle.load can execute arbitrary code. This file
	            # comes from the storage repo / data volume, so it must only ever
	            # be produced by a trusted indexing job.
	            with open(pkl_path, "rb") as f:
	                payload = pickle.load(f)
	            # Read every field before touching the globals so a malformed
	            # payload cannot leave them half-assigned.
	            retriever = payload["retriever"]
	            ids = payload["ids"]
	            vocab = payload.get("vocab", {})
	        except Exception as e:
	            print(f"\u26a0\ufe0f bm25.pkl load failed ({e}) \u2014 rebuilding from SQLite")
	        else:
	            _bm25_index = retriever
	            _bm25_ids = ids
	            _bm25_vocab = vocab
	            _bm25_type = "bm25s"
	            print(f"\u2705 BM25 loaded in {time.time()-t0:.1f}s \u2014 {len(_bm25_ids):,} docs")
	            return

	    if _sqlite_path is None:
	        print("\u26a0\ufe0f BM25 skipped \u2014 no pkl and no SQLite.")
	        return

	    _index_status["progress"] = "Building BM25 from SQLite..."
	    print("\U0001f4e6 Building BM25 from SQLite...")
	    t0 = time.time()
	    try:
	        conn = sqlite3.connect(_sqlite_path, check_same_thread=False)
	        try:
	            rows = conn.execute("SELECT id, text FROM chunks ORDER BY id").fetchall()
	        finally:
	            # Close even when the query raises (e.g. missing table).
	            conn.close()
	        if not rows:
	            # NOTE(review): rank_bm25 appears to divide by the corpus size,
	            # so an empty corpus is rejected here instead of being built.
	            raise ValueError("chunks table is empty")
	        # A NULL text column is treated as an empty document.
	        index = BM25Okapi([(r[1] or "").split() for r in rows])
	    except Exception as e:
	        print(f"\u26a0\ufe0f BM25 build failed: {e}")
	        return
	    _bm25_ids = [r[0] for r in rows]
	    _bm25_index = index
	    _bm25_vocab = {}
	    _bm25_type = "rank-bm25"
	    print(f"\u2705 BM25 (rank-bm25) built in {time.time()-t0:.1f}s")


	def _build_source_facets() -> None:
	    """Rebuild the global source-filter list from the loaded metadata.

	    The result is "All Sources" followed by the sorted union of every
	    non-empty ``display_name`` in ``_metadata_store`` and ``KNOWN_SOURCES``.
	    The global is swapped while holding ``_source_facets_lock``.
	    """
	    global _source_facets
	    names = (m.get("display_name", "") for m in _metadata_store)
	    seen = {name for name in names if name}
	    # Plain set union; the extracted source had an escaped pipe ("\|") here,
	    # which is not valid Python.
	    all_sources = seen | set(KNOWN_SOURCES)
	    with _source_facets_lock:
	        _source_facets = ["All Sources"] + sorted(all_sources)
	    print(f"\u2705 Source facets ready \u2014 {len(_source_facets)-1} sources")


	def _load_index_background() -> None:
	    """Load the embedder and every index, recording progress as it goes.

	    Steps: embedding model, index directory (local, else downloaded from the
	    Hub), FAISS index, metadata JSON, SQLite doc store path, BM25 index and
	    source facets. ``_index_status`` is updated along the way and
	    ``_index_ready`` is set at the end whether loading succeeded or failed,
	    so waiters are always released.
	    """
	    global _vector_index, _metadata_store, _sqlite_path
	    hf_token = os.environ.get("HF_TOKEN", "")
	    _index_status["state"] = "loading"
	    try:
	        _index_status["progress"] = "Loading embedding model..."
	        _init_embedder()
	        index_dir = _resolve_index_dir() or _download_from_hub(hf_token)

	        _index_status["progress"] = "Loading FAISS index..."
	        print("\U0001f4e6 Loading FAISS index...")
	        _vector_index = faiss.read_index(str(index_dir / INDEX_FILE))

	        _index_status["progress"] = "Loading metadata..."
	        with open(index_dir / META_FILE, "r", encoding="utf-8") as meta_file:
	            _metadata_store = json.load(meta_file)

	        doc_store = index_dir / SQLITE_FILE
	        if not doc_store.exists():
	            _sqlite_path = None
	            print("\u26a0\ufe0f doc_store.sqlite not found")
	        else:
	            _sqlite_path = str(doc_store)
	            print(f"\u2705 SQLite ready: {_sqlite_path}")

	        _build_bm25_index(index_dir)
	        _build_source_facets()

	        chunk_count = len(_metadata_store)
	        gc.collect()
	        print(f"\u2705 All indexes ready \u2014 {chunk_count:,} chunks (FREE_TIER={FREE_TIER}, BM25={_bm25_type})")
	        _index_status["state"] = "ready"
	        _index_status["progress"] = f"{chunk_count:,} chunks indexed"
	        _index_ready.set()

	    except Exception as exc:
	        import traceback

	        _index_status["state"] = "error"
	        _index_status["error"] = str(exc)
	        print(f"\u274c Index load failed: {exc}")
	        traceback.print_exc()
	        # Release waiters on failure too; they must inspect _index_status.
	        _index_ready.set()


	# ══════════════════════════════════════════════════════════════════════════════
	# 9. SQLITE ACCESS
	# ══════════════════════════════════════════════════════════════════════════════

	def _fetch_chunks_batch(ids: List[int]) -> Dict[int, str]:
	    """Fetch the text of the given chunk ids from the SQLite doc store.

	    Returns a mapping of chunk id to text for the ids found. Returns ``{}``
	    when *ids* is empty, when no doc store is available, or when the query
	    fails (the failure is printed, not raised).
	    """
	    if not ids or _sqlite_path is None:
	        return {}
	    # Coerce to built-in int: sqlite3 cannot bind NumPy integer types, and
	    # ids may originate from NumPy-backed search results.
	    params = [int(i) for i in ids]
	    placeholders = ",".join("?" * len(params))
	    try:
	        conn = sqlite3.connect(_sqlite_path, check_same_thread=False)
	        try:
	            rows = conn.execute(
	                f"SELECT id, text FROM chunks WHERE id IN ({placeholders})", params
	            ).fetchall()
	        finally:
	            # Close even when the query raises, so the handle is not leaked.
	            conn.close()
	    except Exception as e:
	        print(f"\u26a0\ufe0f SQLite fetch: {e}")
	        return {}
	    return {row[0]: row[1] for row in rows}


	# ══════════════════════════════════════════════════════════════════════════════
	# 10. EMBEDDING
	# ══════════════════════════════════════════════════════════════════════════════

	def _embed(query: str) -> np.ndarray:
	    """Return the normalised float32 embedding of *query*, using the LRU cache.

	    On a miss the query is encoded as a one-element batch, cast to float32,
	    stored in the cache and returned.
	    """
	    vector = _embed_cache.get(query)
	    if vector is None:
	        vector = _embedder.encode(
	            [query], convert_to_numpy=True, normalize_embeddings=True
	        ).astype(np.float32)
	        _embed_cache.put(query, vector)
	    return vector


	# ══════════════════════════════════════════════════════════════════════════════
	# 11. BM25 SEARCH
	# ══════════════════════════════════════════════════════════════════════════════

	def _bm25_search(
	    query: str, category_filter: str, source_filter: str, k: int = RESEARCH_BM25_K,
	) -> List[Dict]:
	    """Run a BM25 keyword search and return the hits that pass the filters.

	    Supports both back-ends set up by ``_build_bm25_index`` ("bm25s" and
	    "rank-bm25"). Hits are dicts with ``idx``, ``score`` and ``meta``; hits
	    with a non-positive score, or whose chunk id lies beyond the metadata
	    store, are dropped. Any exception is printed and yields ``[]``.
	    """
	    if _bm25_index is None or not _bm25_ids:
	        return []

	    def _to_hit(position, score) -> Optional[Dict]:
	        # Map a result position to a hit, or None when it is filtered out.
	        chunk_id = _bm25_ids[int(position)]
	        if chunk_id >= len(_metadata_store):
	            return None
	        meta = _metadata_store[chunk_id]
	        if category_filter != "All" and meta.get("category") != category_filter:
	            return None
	        if source_filter != "All Sources" and meta.get("display_name") != source_filter:
	            return None
	        return {"idx": chunk_id, "score": float(score), "meta": meta}

	    try:
	        if _bm25_type == "bm25s":
	            query_tokens = bm25s_lib.tokenize(
	                [query], stopwords=None, stemmer=None, show_progress=False,
	            )
	            results, scores = _bm25_index.retrieve(
	                query_tokens, k=min(k, len(_bm25_ids)), show_progress=False,
	            )
	            found: List[Dict] = []
	            for position, score in zip(results[0], scores[0]):
	                if score <= 0:
	                    continue
	                hit = _to_hit(position, score)
	                if hit is not None:
	                    found.append(hit)
	            return found

	        # rank-bm25 back-end: score every document, then take the top k.
	        scores_arr = _bm25_index.get_scores(query.split())
	        if len(scores_arr) <= k:
	            ranked = np.argsort(scores_arr)[::-1]
	        else:
	            ranked = np.argpartition(scores_arr, -k)[-k:]
	            ranked = ranked[np.argsort(scores_arr[ranked])[::-1]]
	        found = []
	        for position in ranked:
	            # Positions are in descending score order, so stop at the first
	            # non-positive score.
	            if scores_arr[position] <= 0:
	                break
	            hit = _to_hit(position, scores_arr[position])
	            if hit is not None:
	                found.append(hit)
	        return found

	    except Exception as e:
	        print(f"\u26a0\ufe0f BM25 search: {e}")
	        return []


	# ══════════════════════════════════════════════════════════════════════════════
	# 12. FAISS VECTOR SEARCH
	# ══════════════════════════════════════════════════════════════════════════════

	def _faiss_search(
	    query_vec: np.ndarray, category_filter: str, source_filter: str, k: int,
	) -> List[Dict]:
	    """Search the FAISS index and return the hits that pass the filters.

	    At most ``min(k, ntotal)`` neighbours are requested. Negative indices
	    returned by the index are skipped. An index beyond the metadata store
	    gets an empty metadata dict, which only survives when no filter is set.
	    """
	    limit = min(k, _vector_index.ntotal)
	    scores, indices = _vector_index.search(query_vec, limit)
	    filter_category = category_filter != "All"
	    filter_source = source_filter != "All Sources"
	    found: List[Dict] = []
	    for row, similarity in zip(indices[0], scores[0]):
	        if row < 0:
	            continue
	        meta = _metadata_store[row] if row < len(_metadata_store) else {}
	        if filter_category and meta.get("category") != category_filter:
	            continue
	        if filter_source and meta.get("display_name") != source_filter:
	            continue
	        found.append({"idx": int(row), "score": float(similarity), "meta": meta})
	    return found


	# ══════════════════════════════════════════════════════════════════════════════
	# 13. RECIPROCAL RANK FUSION
	# ══════════════════════════════════════════════════════════════════════════════

	def _rrf_fuse(list_a: List[Dict], list_b: List[Dict], k: int = RRF_K) -> List[Dict]:
	    """Merge two ranked hit lists with reciprocal rank fusion.

	    Each hit contributes ``1 / (k + rank)`` (rank starting at 1) to the fused
	    score of its ``idx``. When an ``idx`` occurs more than once, the metadata
	    of its last occurrence (``list_b`` after ``list_a``) is kept. The result
	    is sorted by fused score, highest first.
	    """
	    fused_scores: Dict[int, float] = {}
	    metas: Dict[int, Dict] = {}
	    for ranking in (list_a, list_b):
	        for rank, hit in enumerate(ranking, 1):
	            key = hit["idx"]
	            fused_scores[key] = fused_scores.get(key, 0.0) + 1.0 / (k + rank)
	            metas[key] = hit["meta"]
	    merged = [
	        {"idx": key, "score": total, "meta": metas[key]}
	        for key, total in fused_scores.items()
	    ]
	    merged.sort(key=lambda hit: hit["score"], reverse=True)
	    return merged


	# ══════════════════════════════════════════════════════════════════════════════
	# 14. MULTI-QUERY DECOMPOSITION
	# ══════════════════════════════════════════════════════════════════════════════

	# Connective words/phrases on which a compound query is split into
	# sub-queries. The alternatives are separated by a bare "|": an escaped
	# pipe ("\|") would match a literal "|" and the pattern would never split.
	_SPLIT_RE = re.compile(
	    r"\b(?:and|vs\.?|versus|compared to|in relation to|"
	    r"as well as|along with|both|also|additionally)\b",
	    re.IGNORECASE,
	)


	def _multi_query_search(
	    query: str, category_filter: str, source_filter: str,
	    k_per_subquery: int = 200,
	) -> List[Dict]:
	    """Hybrid (vector + BM25) search with optional query decomposition.

	    The query is split on ``_SPLIT_RE``; pieces whose stripped length is not
	    greater than ``MIN_QUERY_LEN`` are discarded. With at most one piece left
	    the whole query is searched once. Otherwise the first ``MAX_SUBQUERIES``
	    pieces are searched separately and their results fused one after another
	    with reciprocal rank fusion.
	    """

	    def _hybrid(text: str) -> List[Dict]:
	        # Expand, then fuse the vector and keyword rankings for one query.
	        expanded = _expand_query(text)
	        vector_hits = _faiss_search(
	            _embed(expanded), category_filter, source_filter, k_per_subquery
	        )
	        keyword_hits = _bm25_search(
	            expanded, category_filter, source_filter, k_per_subquery
	        )
	        return _rrf_fuse(vector_hits, keyword_hits)

	    parts = []
	    for piece in _SPLIT_RE.split(query):
	        piece = piece.strip()
	        if len(piece) > MIN_QUERY_LEN:
	            parts.append(piece)

	    if len(parts) <= 1:
	        return _hybrid(query)

	    print(f"[multi-query] {len(parts)} sub-queries")
	    fused: List[Dict] = []
	    for part in parts[:MAX_SUBQUERIES]:
	        sub_hits = _hybrid(part)
	        fused = _rrf_fuse(fused, sub_hits) if fused else sub_hits
	    return fused


	# ══════════════════════════════════════════════════════════════════════════════
	# 15. MAXIMAL MARGINAL RELEVANCE
	# ══════════════════════════════════════════════════════════════════════════════

	def _mmr_rerank(
	    query_vec: np.ndarray, hits: List[Dict],
	    top_k: int = MMR_TOP_K, lambda_: float = MMR_LAMBDA,
	) -> List[Dict]:
	    """Re-order *hits* by maximal marginal relevance and keep up to *top_k*.

	    Skipped (plain truncation) when ``FREE_TIER`` is set, when there are
	    fewer than two hits, or when reconstructing vectors from the FAISS index
	    fails. The first pick is always the first candidate; every later pick
	    maximises ``lambda_ * relevance - (1 - lambda_) * redundancy``, where
	    redundancy is the largest dot product with an already selected vector.
	    Hits whose idx is not below ``ntotal`` are not considered.
	    """
	    if FREE_TIER or not hits or len(hits) <= 1:
	        return hits[:top_k]

	    vectors: Dict[int, np.ndarray] = {}
	    try:
	        for hit in hits:
	            key = hit["idx"]
	            if key < _vector_index.ntotal:
	                buf = np.zeros(_vector_index.d, dtype=np.float32)
	                _vector_index.reconstruct(key, buf)
	                vectors[key] = buf
	    except Exception as e:
	        print(f"\u26a0\ufe0f MMR skipped: {e}")
	        return hits[:top_k]

	    query_flat = query_vec.flatten()
	    pool = [hit for hit in hits if hit["idx"] in vectors]
	    chosen: List[Dict] = []

	    while pool and len(chosen) < top_k:
	        pick = 0
	        if chosen:
	            chosen_matrix = np.vstack([vectors[c["idx"]] for c in chosen])
	            top_score = -np.inf
	            for position, hit in enumerate(pool):
	                vec = vectors[hit["idx"]]
	                relevance = float(np.dot(query_flat, vec))
	                redundancy = float(np.max(chosen_matrix @ vec))
	                mmr_score = lambda_ * relevance - (1.0 - lambda_) * redundancy
	                # Strict comparison keeps the earliest candidate on ties.
	                if mmr_score > top_score:
	                    top_score = mmr_score
	                    pick = position
	        chosen.append(pool.pop(pick))
	    return chosen


	# ══════════════════════════════════════════════════════════════════════════════
	# 16. SCORE CALIBRATION
	# ══════════════════════════════════════════════════════════════════════════════

	def _calibrate_scores(hits: List[Dict]) -> List[Dict]:
	    """Annotate each hit in place with ``relevance_pct`` and ``tier``.

	    ``relevance_pct`` is the hit's score min-max scaled to 1..100 within
	    this result set; when every score is equal (including a single hit) all
	    hits get 100. ``tier`` is the quartile band of the score within the set.
	    Returns the same list object.
	    """
	    if not hits:
	        return hits
	    # float64 on purpose: h["score"] is a Python float, and comparing it with
	    # percentiles computed from float32-rounded copies can push the top hit
	    # just below its own percentile and into the wrong tier.
	    scores = np.array([h["score"] for h in hits], dtype=np.float64)
	    max_s = float(scores.max())
	    min_s = float(scores.min())
	    p25, p50, p75 = (float(p) for p in np.percentile(scores, [25, 50, 75]))
	    span = max_s - min_s
	    for h in hits:
	        s = h["score"]
	        if span > 0:
	            h["relevance_pct"] = max(1, round(((s - min_s) / span) * 100))
	        else:
	            # All scores tie, so every hit is "the most relevant".
	            h["relevance_pct"] = 100
	        if s >= p75:
	            h["tier"] = "\U0001f7e2 Highly Relevant"
	        elif s >= p50:
	            h["tier"] = "\U0001f535 Relevant"
	        elif s >= p25:
	            h["tier"] = "\U0001f7e1 Contextual"
	        else:
	            h["tier"] = "\u26aa Peripheral"
	    return hits


	# ══════════════════════════════════════════════════════════════════════════════
	# 17. PER-SOURCE DIVERSITY CAP
	# ══════════════════════════════════════════════════════════════════════════════

	def _apply_source_cap(hits: List[Dict], source_filter: str) -> List[Dict]:
	    """Limit how many hits any single source may contribute.

	    When a specific source is selected the list is returned untouched.
	    Otherwise hits are kept in order, but at most ``PER_SOURCE_CAP`` per
	    ``display_name`` (hits without one share the name "unknown").
	    """
	    if source_filter != "All Sources":
	        return hits
	    kept: List[Dict] = []
	    per_source: Dict[str, int] = {}
	    for hit in hits:
	        source = hit["meta"].get("display_name", "unknown")
	        used = per_source.get(source, 0)
	        if used >= PER_SOURCE_CAP:
	            continue
	        per_source[source] = used + 1
	        kept.append(hit)
	    return kept


	# ══════════════════════════════════════════════════════════════════════════════
	# 18. KEYWORD-DENSITY SNIPPET
	# ══════════════════════════════════════════════════════════════════════════════

	def _best_snippet_window(text: str, query: str, window_words: int = SNIPPET_WORDS) -> str:
	    """Return the *window_words*-word window of *text* richest in query terms.

	    Text that already fits in one window is returned unchanged. Otherwise
	    windows are tried every ``SNIPPET_STEP`` words, plus the final window,
	    and scored by how many of their words (lower-cased) are query tokens;
	    the earliest best-scoring window wins. "..." marks a cut at either end.
	    """
	    query_tokens = set(query.lower().split())
	    words = text.split()
	    if len(words) <= window_words:
	        return text

	    last_start = len(words) - window_words
	    # The range end is inclusive of last_start: stopping one short would
	    # keep the trailing words out of every candidate window.
	    starts = list(range(0, last_start + 1, SNIPPET_STEP))
	    if starts[-1] != last_start:
	        # The stride may step over the final window; always consider it.
	        starts.append(last_start)

	    best_score, best_start = -1, 0
	    for start in starts:
	        score = sum(
	            1 for w in words[start:start + window_words] if w.lower() in query_tokens
	        )
	        if score > best_score:
	            best_score, best_start = score, start

	    snippet = " ".join(words[best_start:best_start + window_words])
	    if best_start > 0:
	        snippet = "..." + snippet
	    if best_start + window_words < len(words):
	        snippet += "..."
	    return snippet


	# ══════════════════════════════════════════════════════════════════════════════
	# 19. ਅੰਗ DEEP-LINK
	# ══════════════════════════════════════════════════════════════════════════════

	def _ang_deep_link(text: str, category: str) -> Optional[str]:
	    """Build a SikhiToTheMax link for the first ਅੰਗ number found in *text*.

	    Only Gurbani passages (English or Gurmukhi category name) are linked;
	    returns None for any other category or when no ਅੰਗ number is present.
	    """
	    if category in ("Gurbani", "\u0a17\u0a41\u0a30\u0a2c\u0a3e\u0a23\u0a40"):
	        found = _ANG_IN_TEXT_RE.search(text)
	        if found:
	            return f"https://www.sikhitothemax.org/ang?ang={found.group(1)}&source=G"
	    return None


	# ══════════════════════════════════════════════════════════════════════════════
	# 20. STRUCTURED PASSAGE EXTRACTOR
	# ══════════════════════════════════════════════════════════════════════════════

	# Field extractors for chunks stored as dict-like text, with either quote
	# style, e.g.  'gurmukhi': '...', 'translation': '...', 'ang': 5
	_RE_GURMUKHI_F = re.compile(r"['\"]gurmukhi['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
	_RE_PRONUN = re.compile(r"['\"]pronunciation['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
	_RE_TRANSLATION = re.compile(r"['\"]translation['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
	# The explanation may itself contain quotes, so the value is matched lazily
	# up to a quote followed by "," or "}". Whitespace around the value is
	# optional (\s*), like the sibling patterns: a bare \s would demand a space
	# before the comma and never match ordinary "..., 'key': ..." text.
	_RE_EXPLANATION = re.compile(r"['\"]explanation['\"]:\s*['\"](.{10,}?)['\"](?:\s*[,}])")
	_RE_ANG_FIELD = re.compile(r"['\"]ang['\"]:\s*(\d+)")


	def _extract_structured_passage(text: str, query: str) -> Dict:
	    """Turn a chunk into either a structured Gurbani line or a plain snippet.

	    Chunks without a quoted ``gurmukhi`` key, chunks where no gurmukhi value
	    can be extracted, and chunks that raise during parsing all fall back to
	    ``{"type": "plain", "snippet": ...}``. Otherwise the fields of the line
	    containing the most query tokens (first line on ties) are returned, with
	    the explanation capped at 500 characters.
	    """

	    def _plain() -> Dict:
	        return {"type": "plain", "snippet": _best_snippet_window(text, query)}

	    def _at(values: List[str], i: int) -> str:
	        # Field lists can be shorter than the gurmukhi list.
	        return values[i] if i < len(values) else ""

	    if not ("'gurmukhi'" in text or '"gurmukhi"' in text):
	        return _plain()
	    try:
	        gurmukhi = _RE_GURMUKHI_F.findall(text)
	        pronunciations = _RE_PRONUN.findall(text)
	        translations = _RE_TRANSLATION.findall(text)
	        explanations = _RE_EXPLANATION.findall(text)
	        angs = _RE_ANG_FIELD.findall(text)
	        if not gurmukhi:
	            return _plain()

	        total = len(gurmukhi)
	        tokens = set(query.lower().split())

	        def _overlap(i: int) -> int:
	            haystack = (gurmukhi[i] + " " + _at(pronunciations, i) + " " +
	                        _at(translations, i) + " " + _at(explanations, i)).lower()
	            return sum(1 for tok in tokens if tok in haystack)

	        # max() returns the first index among equally good lines.
	        best = max(range(total), key=_overlap)

	        explanation = _at(explanations, best)
	        if len(explanation) > 500:
	            explanation = explanation[:497] + "..."
	        translation = _at(translations, best)
	        pronunciation = _at(pronunciations, best)
	        return {
	            "type": "structured",
	            "gurmukhi": gurmukhi[best],
	            "pronunciation": pronunciation,
	            "pronunciation_lang": _detect_language(pronunciation),
	            "translation": translation,
	            "translation_lang": _detect_language(translation),
	            "explanation": explanation,
	            "explanation_lang": _detect_language(explanation),
	            "ang": angs[best] if best < len(angs) else "",
	            "total_lines": total,
	        }
	    except Exception:
	        return _plain()


	# ══════════════════════════════════════════════════════════════════════════════
	# 21. ANALYTICS
	# ══════════════════════════════════════════════════════════════════════════════

	def _log_query_async(
	    query: str, category: str, mode: str,
	    n_results: int, top_score: float, elapsed_ms: float,
	) -> None:
	    """Record one query in the analytics database on a daemon thread.

	    Only the query length is stored, never the query text. Logging is
	    best-effort: a failure is printed and otherwise ignored so it can never
	    affect the request being served.
	    """
	    def _write() -> None:
	        try:
	            ANALYTICS_DB.parent.mkdir(parents=True, exist_ok=True)
	            conn = sqlite3.connect(str(ANALYTICS_DB), check_same_thread=False)
	            try:
	                conn.execute("""CREATE TABLE IF NOT EXISTS queries (
	                    id INTEGER PRIMARY KEY AUTOINCREMENT,
	                    ts TEXT NOT NULL, query_len INTEGER NOT NULL,
	                    category TEXT NOT NULL, mode TEXT NOT NULL,
	                    n_results INTEGER NOT NULL, top_score REAL NOT NULL,
	                    elapsed_ms REAL NOT NULL)""")
	                conn.execute(
	                    "INSERT INTO queries VALUES (NULL,?,?,?,?,?,?,?)",
	                    # The "Z" suffix means UTC, so format from gmtime();
	                    # strftime alone would use the local time zone.
	                    (time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), len(query),
	                     category, mode, n_results, round(top_score, 4), round(elapsed_ms, 1)),
	                )
	                conn.commit()
	            finally:
	                # Close even when a statement fails, so the handle is not leaked.
	                conn.close()
	        except Exception as e:
	            print(f"\u26a0\ufe0f Analytics write skipped: {e}")
	    threading.Thread(target=_write, daemon=True).start()


	# ══════════════════════════════════════════════════════════════════════════════
	# 22. RESEARCH MODE
	# ══════════════════════════════════════════════════════════════════════════════

	def _research(query: str, category_filter: str, source_filter: str) -> Tuple[str, str]:
	    """Run a Research-mode search and render the hits as cited passages.

	    Steps: ``_multi_query_search`` -> ``_apply_source_cap`` -> relevance
	    threshold (falling back to the first ``MAX_RESEARCH_PASSAGES`` hits when
	    too few clear it) -> ``_mmr_rerank`` -> ``_calibrate_scores`` -> chunk
	    text lookup -> rendering. The query is logged to analytics before
	    returning.

	    Args:
	        query: Sanitised user query.
	        category_filter: Category filter forwarded to the search.
	        source_filter: Source filter forwarded to the search and source cap.

	    Returns:
	        ``(markdown, plain_text)``: the Markdown shown in the chat and a
	        plain-text rendering of the same passages. When nothing is found,
	        both elements are the same "no results" message.
	    """
	    def _label(name: str, lang: str) -> str:
	        # Drop the parenthesised language when detection returned nothing,
	        # so labels never read "Translation ()".
	        return f"{name} ({lang})" if lang else name

	    def _cell(value) -> str:
	        # A raw "|" inside a cell would split the Markdown table row.
	        return str(value).replace("|", "\\|")

	    t_start = time.time()

	    fused = _multi_query_search(
	        query, category_filter, source_filter,
	        k_per_subquery=max(RESEARCH_FAISS_K, RESEARCH_BM25_K),
	    )
	    total_candidates = len(fused)

	    if not fused:
	        msg = ("No relevant sources found.\n\n"
	               "_Try broader keywords, remove filters, or check Gurmukhi spelling._")
	        return msg, msg

	    fused = _apply_source_cap(fused, source_filter)
	    above = [h for h in fused if h["score"] >= RELEVANCE_THRESHOLD]
	    # Only trust the threshold when it still leaves a full page of results.
	    candidates = above if len(above) >= MAX_RESEARCH_PASSAGES else fused[:MAX_RESEARCH_PASSAGES]

	    anchor_vec = _embed(_expand_query(query))
	    candidates = _mmr_rerank(anchor_vec, candidates, top_k=MAX_RESEARCH_PASSAGES)
	    candidates = _calibrate_scores(candidates)

	    chunk_txts = _fetch_chunks_batch([h["idx"] for h in candidates])

	    accessed = time.strftime("%B %d, %Y")
	    tier_note = "" if FREE_TIER else " \u00b7 MMR diversity"
	    unique_sources = len({h["meta"].get("display_name", "") for h in candidates})

	    md: List[str] = [
	        "## Research Results",
	        # One balanced pair of "*" so the whole summary line renders in italics.
	        (f"*Showing {len(candidates)} passages from {unique_sources} sources "
	         f"\u00b7 {total_candidates:,} candidates \u00b7 hybrid BM25 + vector{tier_note}*"),
	        "",
	        ("> Relevance % = how this passage ranks within these results "
	         "(100% = most relevant passage in this query). "
	         "Signal = percentile tier."),
	        "", "---", "",
	    ]
	    plain: List[str] = [
	        "RESEARCH RESULTS \u2014 SikhLibrary Digital Archive",
	        "=" * 65,
	        f"Passages shown : {len(candidates)}",
	        f"Unique sources : {unique_sources}",
	        f"Total candidates: {total_candidates:,}",
	        f"Accessed : {accessed}",
	        "=" * 65, "",
	    ]

	    displayed = 0
	    for i, h in enumerate(candidates, 1):
	        idx = h["idx"]
	        text = chunk_txts.get(idx, "")
	        if not text:
	            continue  # chunk text missing from the store: nothing to show

	        meta = h["meta"]
	        display = meta.get("display_name", meta.get("file", "Unknown"))
	        category = meta.get("category", "General")
	        cat_disp = meta.get("category_display", category)
	        section = meta.get("chunk_idx", 0) + 1
	        relevance = h.get("relevance_pct", 0)
	        tier = h.get("tier", "")
	        passage = _extract_structured_passage(text, query)
	        structured = passage["type"] == "structured"

	        # Prefer the ang parsed from a structured passage; otherwise let the
	        # helper try to derive a link from the raw text and category.
	        if structured and passage.get("ang"):
	            ang_link = f"https://www.sikhitothemax.org/ang?ang={passage['ang']}&source=G"
	        else:
	            ang_link = _ang_deep_link(text, category)

	        chicago = (
	            f'"{display}." In SikhLibrary Digital Archive, '
	            f"section {section}, category: {cat_disp}. "
	            f"Hugging Face Datasets. Accessed {accessed}. "
	            f"https://huggingface.co/datasets/jsdosanj/SikhLibrary."
	        )

	        # Heading and metadata. Plain pipes are required for a Markdown
	        # table; the previous "\|" literals were invalid escape sequences
	        # and rendered as literal text instead of a table.
	        md += [f"### [{i}] {display} \u2014 Section {section}"]
	        md += ["| | |", "|---|---|",
	               f"| Source | {_cell(display)} |",
	               f"| Category | {_cell(cat_disp)} |",
	               f"| Section | {section} |",
	               f"| Relevance | {relevance}% |",
	               f"| Signal | {_cell(tier)} |", ""]
	        plain += [
	            f"[{i}] {display} \u2014 Section {section}",
	            f" Source : {display}",
	            f" Category : {cat_disp}",
	            f" Relevance : {relevance}%",
	            f" Signal : {tier}",
	        ]

	        if ang_link:
	            ang_num = passage.get("ang", "")
	            if not ang_num:
	                m = _ANG_IN_TEXT_RE.search(text)
	                ang_num = m.group(1) if m else ""
	            link_label = (f"View \u0a05\u0a70\u0a17 {ang_num} on SikhiToTheMax"
	                          if ang_num else "View on SikhiToTheMax")
	            md += [f"[{link_label}]({ang_link})", ""]
	            plain += [f" SikhiToTheMax: {ang_link}"]

	        if structured:
	            if passage["gurmukhi"]:
	                md += ["Gurmukhi (Punjabi \u2014 Gurmukhi script):",
	                       f"> {passage['gurmukhi']}", ""]
	                plain += [f" Gurmukhi: {passage['gurmukhi']}"]
	            for key, name in (("pronunciation", "Pronunciation"),
	                              ("translation", "Translation"),
	                              ("explanation", "Explanation")):
	                if passage[key]:
	                    lbl = _label(name, passage[f"{key}_lang"])
	                    md += [f"{lbl}:", f"> {passage[key]}", ""]
	                    plain += [f" {lbl}: {passage[key]}"]
	            if passage.get("total_lines", 1) > 1:
	                md += [f"({passage['total_lines']} lines in this section)", ""]
	        else:
	            # Detect the excerpt language once and reuse it for both outputs.
	            lbl = _label("Excerpt", _detect_language(passage["snippet"]))
	            md += [f"{lbl}:", f"> {passage['snippet']}", ""]
	            plain += [f" {lbl}: {passage['snippet']}"]

	        md += ["Chicago Manual of Style (17th ed.):",
	               f"> {chicago}", "", "---", ""]
	        plain += [f" {chicago.replace('*', '')}", ""]
	        displayed += 1

	    plain += ["=" * 65,
	              "Archive: https://huggingface.co/datasets/jsdosanj/SikhLibrary"]

	    elapsed_ms = (time.time() - t_start) * 1000
	    _log_query_async(query, category_filter, "Research",
	                     displayed, candidates[0]["score"] if candidates else 0.0, elapsed_ms)
	    print(f"[research] {displayed} passages / {unique_sources} sources "
	          f"in {elapsed_ms:.0f}ms BM25={_bm25_type}")
	    return "\n".join(md), "\n".join(plain)


	def _make_download_file(plain_text: str) -> Optional[str]:
	    """Write ``plain_text`` to a persistent UTF-8 temp file for download.

	    Args:
	        plain_text: Text to export. ``None``, empty or whitespace-only
	            input produces no file.

	    Returns:
	        Path of the created ``SikhLibrarian_research_*.txt`` file, or
	        ``None`` when there is nothing to write or the write fails.
	    """
	    if not plain_text or not plain_text.strip():
	        return None
	    path: Optional[str] = None
	    try:
	        # delete=False keeps the file after the handle closes; the ``with``
	        # block guarantees the handle is closed even if write() raises.
	        with tempfile.NamedTemporaryFile(
	            mode="w", suffix=".txt", delete=False, encoding="utf-8",
	            prefix="SikhLibrarian_research_",
	        ) as tmp:
	            path = tmp.name
	            tmp.write(plain_text)
	        return path
	    except (OSError, UnicodeError) as e:
	        print(f"\u26a0\ufe0f Temp file: {e}")
	        if path is not None:
	            # Do not leave a truncated export behind.
	            try:
	                os.unlink(path)
	            except OSError:
	                pass
	        return None


	# ══════════════════════════════════════════════════════════════════════════════
	# 23. HALLUCINATION VERIFICATION
	# ══════════════════════════════════════════════════════════════════════════════

	def _verify_ang_references(response: str, all_context: str) -> str:
	    """Strike through ang references in ``response`` that the context does not support.

	    Every match of ``_ANG_FULL_RE`` is replaced by its first capture group.
	    If ``_ANG_NUM_RE`` finds a number in that reference and the number does
	    not occur in ``all_context`` as a standalone number, the reference is
	    wrapped in ``~~...~~`` and followed by a warning sign.

	    Args:
	        response: The generated answer.
	        all_context: Concatenated text of the retrieved chunks.

	    Returns:
	        The response, with a footer note appended when anything changed.
	    """
	    def _check(match: re.Match) -> str:
	        ref = match.group(1)
	        num_m = _ANG_NUM_RE.search(ref)
	        if not num_m:
	            return ref
	        number = num_m.group(1)
	        if not number:
	            return ref
	        # Require digit boundaries: a plain substring test would accept
	        # "12" because the context happens to contain "1234" or "2012".
	        found = re.search(rf"(?<!\d){re.escape(number)}(?!\d)", all_context)
	        return ref if found else f"~~{ref}~~ \u26a0\ufe0f"

	    verified = _ANG_FULL_RE.sub(_check, response)
	    if verified != response:
	        verified += ("\n\n---\n> \u26a0\ufe0f *References shown as ~~struck through~~ could not "
	                     "be verified in the retrieved context. Please cross-check.*")
	    return verified


	# ══════════════════════════════════════════════════════════════════════════════
	# 24. LEARN MODE
	# ══════════════════════════════════════════════════════════════════════════════

	# System prompt for Learn mode. It is filled in with str.format(): {context}
	# receives the retrieved passages and {citation_block} the per-source citation
	# lines. Any literal brace added to this text must be doubled ({{ }}).
	_LEARN_SYSTEM_TEMPLATE = """\
	You are the AI Sikh Librarian — a world-class scholar of Sikh theology, history, and literature.
	Write at PhD academic depth in clear English that a motivated high-school student can follow.

	MANDATORY LANGUAGE RULES:
	- Main body: English only.
	- All key concepts / theological terms: Punjabi Unicode ONLY.
	Examples: ਸੰਤ-ਸਿਪਾਹੀ, ਨਾਮ ਸਿਮਰਨ, ਚੜ੍ਹਦੀ ਕਲਾ, ਵਾਹਿਗੁਰੂ, ਅਕਾਲ ਪੁਰਖ
	NEVER write English transliterations like "Waheguru", "Nam Simran", "Chardi Kala".
	- Cite Gurbani by ਅੰਗ number ONLY if that exact number appears verbatim in CONTEXT.

	ABSOLUTE PROHIBITIONS:
	- Do NOT quote any Gurbani not present verbatim in CONTEXT.
	- Do NOT fabricate ਅੰਗ numbers.
	- Do NOT cite sources not in CONTEXT.

	RESPONSE STRUCTURE:
	1. Introduction — 1-2 paragraphs
	2. Deep Analysis — cite every claim [Source N]; Punjabi Unicode for all key terms
	3. Cross-Source Synthesis
	4. Works Cited — Chicago 17th ed. (only sources listed below)

	RETRIEVED CONTEXT:
	{context}

	SOURCE METADATA:
	{citation_block}"""


	def _build_learn_context(hits: List[Dict], chunk_texts: Dict[int, str]) -> Tuple[str, str]:
	    """Assemble the LLM context and the citation list for Learn mode.

	    Hits whose chunk text is missing or blank are skipped, but they still
	    consume a rank, so source numbers follow the position in ``hits``.
	    Each chunk is cut to ``MAX_CONTEXT_WORDS`` words. A citation line is
	    emitted only the first time a display name is seen.

	    Args:
	        hits: Search hits, each with ``"meta"`` and ``"idx"`` entries.
	        chunk_texts: Chunk text keyed by hit index.

	    Returns:
	        ``(context, citations)``: context blocks joined by a ``---``
	        separator, and citation lines joined by newlines.
	    """
	    today = time.strftime("%B %d, %Y")
	    blocks: List[str] = []
	    citations: List[str] = []
	    already_cited: set = set()

	    for number, hit in enumerate(hits, 1):
	        info = hit["meta"]
	        body = chunk_texts.get(hit["idx"], "").strip()
	        if not body:
	            continue

	        clipped = " ".join(body.split()[:MAX_CONTEXT_WORDS])
	        title = info.get("display_name", info.get("file", "Unknown"))
	        category_label = info.get("category_display", info.get("category", "General"))
	        part = info.get("chunk_idx", 0) + 1

	        blocks.append(f"[Source {number}: {title}, Section {part}]\n{clipped}")

	        if title in already_cited:
	            continue
	        already_cited.add(title)
	        citations.append(
	            f'[Source {number}] "{title}." In SikhLibrary Digital Archive, '
	            f"section {part}, category: {category_label}. Hugging Face Datasets. "
	            f"Accessed {today}. https://huggingface.co/datasets/jsdosanj/SikhLibrary."
	        )

	    return "\n\n---\n\n".join(blocks), "\n".join(citations)


	# ══════════════════════════════════════════════════════════════════════════════
	# 25. LLM STREAMING WITH RETRY
	# ══════════════════════════════════════════════════════════════════════════════

	def _is_retryable(err: str) -> bool:
	    """Tell whether an LLM error message describes a failure worth retrying.

	    A message containing "402" is never retryable. Otherwise the message is
	    retryable when it contains "429", "503" or "502", or (case-insensitively)
	    "rate", "overloaded", "timeout" or "connection".
	    """
	    if "402" in err:
	        return False
	    lowered = err.lower()
	    has_status = any(code in err for code in ("429", "503", "502"))
	    has_keyword = any(
	        word in lowered for word in ("rate", "overloaded", "timeout", "connection")
	    )
	    return has_status or has_keyword


	def _stream_with_retry(messages: List[Dict], history: list) -> Generator:
	    """Stream an LLM reply into ``history[-1]``, retrying retryable failures.

	    Yields ``(history, done)`` pairs. ``done`` is False while a retry notice
	    or partial text is being shown and True exactly once, on the final
	    yield. ``history[-1]["content"]`` is overwritten in place.

	    Up to ``LLM_MAX_RETRIES`` retries are made, waiting the matching entry
	    of ``LLM_RETRY_DELAYS`` (the last entry is reused once the list runs
	    out). An error raised after some text has already streamed keeps that
	    text instead of retrying.

	    Args:
	        messages: Chat messages passed to ``_llm_client.chat_completion``.
	        history: Chat history whose last entry receives the reply.
	    """
	    final_error = ""
	    for attempt in range(LLM_MAX_RETRIES + 1):
	        streamed = ""
	        if attempt:
	            wait_s = LLM_RETRY_DELAYS[min(attempt - 1, len(LLM_RETRY_DELAYS) - 1)]
	            history[-1]["content"] = (
	                f"\u23f3 API busy (attempt {attempt}/{LLM_MAX_RETRIES}) \u2014 "
	                f"retrying in {wait_s}s...\n\n"
	                "Qwen2.5-72B free tier has limited concurrency."
	            )
	            yield history, False
	            time.sleep(wait_s)

	        try:
	            stream = _llm_client.chat_completion(
	                messages=messages, max_tokens=LLM_MAX_TOKENS,
	                stream=True, temperature=LLM_TEMPERATURE, top_p=LLM_TOP_P,
	            )
	            for chunk in stream:
	                if not chunk.choices:
	                    continue
	                piece = chunk.choices[0].delta.content
	                if not piece:
	                    continue
	                streamed += piece
	                history[-1]["content"] = streamed
	                yield history, False
	            yield history, True
	            return
	        except Exception as exc:
	            reason = str(exc)
	            if streamed.strip():
	                # Something usable already reached the user: keep it.
	                print(f"Post-stream exception (response preserved): {reason}")
	                yield history, True
	                return
	            final_error = reason
	            print(f"LLM attempt {attempt+1} failed: {reason}")
	            may_retry = _is_retryable(reason) and attempt < LLM_MAX_RETRIES
	            if not may_retry:
	                break

	    # Every attempt failed: translate the last error into a user-facing note.
	    if "402" in final_error:
	        notice = ("\U0001f4b3 Learn mode temporarily unavailable \u2014 monthly credits used up.\n\n"
	                  "Research mode is fully available for instant citations.")
	    elif "429" in final_error or "rate" in final_error.lower():
	        notice = (f"\u23f3 HF Inference API overloaded. Tried {LLM_MAX_RETRIES+1} times. "
	                  "Please wait 1-2 min or switch to Research mode.")
	    elif "401" in final_error or "403" in final_error:
	        notice = "\U0001f511 Auth error. Check `HF_TOKEN` in Space Settings > Secrets."
	    else:
	        notice = "\u26a0\ufe0f LLM unavailable after retries. Try again or use Research mode."
	        print(f"LLM final error: {final_error}")
	    history[-1]["content"] = notice
	    yield history, True


	# ══════════════════════════════════════════════════════════════════════════════
	# 26. GRADIO EVENT HANDLERS
	# ══════════════════════════════════════════════════════════════════════════════

	def _not_ready_msg() -> str:
	    """Build the chat reply shown while the index is not ready.

	    Reads ``state`` and ``progress`` from ``_index_status``, falling back
	    to ``'loading'`` and ``'...'`` when a key is absent.
	    """
	    state = _index_status.get('state', 'loading')
	    progress = _index_status.get('progress', '...')
	    parts = [
	        "\u23f3 Index still loading (happens once on first startup).\n\n",
	        f"Status: `{state}`\n",
	        f"Progress: {progress}\n\n",
	        "Please try again in a moment.",
	    ]
	    return "".join(parts)


	def get_source_choices() -> gr.update:
	    """Return a component update listing the available source filters.

	    Waits up to 300 seconds for ``_index_ready`` and then copies
	    ``_source_facets`` while holding ``_source_facets_lock``. The update
	    also resets the selected value to "All Sources".
	    """
	    _index_ready.wait(timeout=300)
	    with _source_facets_lock:
	        # Copy under the lock so the list handed to Gradio cannot change later.
	        snapshot = [*_source_facets]
	    return gr.update(choices=snapshot, value="All Sources")


	def submit_query(
	    message: str, history: list, mode: str, category: str, source_filter: str,
	) -> Generator:
	    """Handle one submitted query in Research or Learn mode.

	    This is a generator. Each yield is ``(history, plain_text, file_update)``:
	    the chat history in role/content message form, the plain-text export
	    (non-empty only for Research results) and a ``gr.update`` carrying the
	    visibility and value of the download output.

	    Research mode yields the formatted passages once. Learn mode retrieves
	    the top ``LEARN_MODE_K`` FAISS hits, builds a system prompt from them,
	    streams the LLM reply, then runs ``_verify_ang_references`` over the
	    final text unless it is a status or error notice.

	    Args:
	        message: Raw user input, shown in the chat as typed.
	        history: Chat history so far; ``None`` is treated as empty.
	        mode: "Research" for Research mode; any other value selects Learn.
	        category: Category filter.
	        source_filter: Source filter.
	    """
	    t_start = time.time()
	    history = history or []

	    def _with_reply(text: str) -> list:
	        # History extended with the user's turn and a canned assistant reply.
	        return history + [
	            {"role": "user", "content": message},
	            {"role": "assistant", "content": text},
	        ]

	    clean = _sanitize(message)
	    if len(clean) < MIN_QUERY_LEN:
	        yield (_with_reply(f"Please enter at least {MIN_QUERY_LEN} characters."),
	               "", gr.update(visible=False))
	        return

	    if not _index_ready.is_set():
	        yield _with_reply(_not_ready_msg()), "", gr.update(visible=False)
	        return

	    if _index_status["state"] == "error":
	        yield (_with_reply("\u274c Index unavailable \u2014 check Space logs."),
	               "", gr.update(visible=False))
	        return

	    # Show the user's message immediately, before any slow work starts.
	    history = history + [{"role": "user", "content": message}]
	    yield history, "", gr.update(visible=False)

	    if mode == "Research":
	        try:
	            md_out, plain_out = _research(clean, category, source_filter)
	        except Exception:
	            import traceback
	            traceback.print_exc()
	            md_out, plain_out = "\u26a0\ufe0f Retrieval error \u2014 please try again.", ""
	        history = history + [{"role": "assistant", "content": md_out}]
	        dl_path = _make_download_file(plain_out)
	        yield history, plain_out, gr.update(visible=bool(dl_path), value=dl_path)
	        return

	    if _sqlite_path is None:
	        history = history + [{"role": "assistant", "content":
	            "\u26a0\ufe0f Learn mode requires doc_store.sqlite \u2014 not found.\n\n"
	            "Research mode is fully available."}]
	        yield history, "", gr.update(visible=False)
	        return

	    try:
	        expanded = _expand_query(clean)
	        q_vec = _embed(expanded)
	        hits = _faiss_search(q_vec, category, source_filter, LEARN_FAISS_K)[:LEARN_MODE_K]
	        ids = [h["idx"] for h in hits]
	        chunk_txts = _fetch_chunks_batch(ids)
	        ctx_str, cite_block = _build_learn_context(hits, chunk_txts)
	        all_context = " ".join(chunk_txts.values())
	    except Exception:
	        import traceback
	        traceback.print_exc()
	        history = history + [{"role": "assistant", "content":
	            "\u26a0\ufe0f Retrieval error \u2014 please try again."}]
	        yield history, "", gr.update(visible=False)
	        return

	    if not ctx_str.strip():
	        history = history + [{"role": "assistant", "content":
	            "No relevant sources found.\n\n"
	            "_Try different keywords or switch to Research mode._"}]
	        yield history, "", gr.update(visible=False)
	        return

	    system_msg = _LEARN_SYSTEM_TEMPLATE.format(context=ctx_str, citation_block=cite_block)
	    # Up to six earlier messages give the model conversational memory; the
	    # current question is re-added below in its sanitised form.
	    recent = history[:-1][-6:]
	    messages = [{"role": "system", "content": system_msg}]
	    messages.extend(recent)
	    messages.append({"role": "user", "content": clean})

	    history = history + [{"role": "assistant", "content": "\u23f3 Thinking..."}]
	    final_history = history

	    for final_history, is_done in _stream_with_retry(messages, history):
	        yield final_history, "", gr.update(visible=False)
	        if is_done:
	            break

	    final_text = final_history[-1]["content"]
	    # Status and error notices start with one of these symbols; only a real
	    # answer is checked for unverifiable references.
	    if final_text and not final_text.startswith(("\u23f3", "\u26a0\ufe0f", "\U0001f511", "\U0001f4b3")):
	        final_history[-1]["content"] = _verify_ang_references(final_text, all_context)
	        yield final_history, "", gr.update(visible=False)

	    stats = _embed_cache.stats
	    print(f"[cache] {stats['hit_rate']} \u00b7 {stats['size']}/{stats['capacity']}")
	    # Record the real handling time; this used to be logged as a constant 0.0.
	    elapsed_ms = (time.time() - t_start) * 1000
	    _log_query_async(clean, category, "Learn",
	                     len(hits), hits[0]["score"] if hits else 0.0, elapsed_ms)


	def clear_conversation() -> Tuple:
	    """Return the values for a reset: empty history, empty text, and a hidden, value-less update."""
	    fresh_history: list = []
	    hidden_file = gr.update(visible=False, value=None)
	    return fresh_history, "", hidden_file


	def toggle_mode_ui(mode: str) -> Tuple:
	    """Compute the four component updates for the selected mode.

	    Returns, in order: a visibility update (shown only in Research mode),
	    an update that hides and clears a component, a placeholder update for
	    the query box, and a second Research-only visibility update.
	    NOTE(review): which components these map to depends on the event
	    wiring elsewhere in the file; confirm there.
	    """
	    research_on = mode == "Research"
	    if research_on:
	        hint = ("Search every passage across 758M+ words... "
	                "(e.g. 'naam simran', 'seva', 'Khalsa', 'waheguru')")
	    else:
	        hint = "Message Sikh Librarian\u2026"

	    first_panel = gr.update(visible=research_on)
	    cleared = gr.update(visible=False, value=None)
	    query_box = gr.update(placeholder=hint)
	    second_panel = gr.update(visible=research_on)
	    return first_panel, cleared, query_box, second_panel


	# ══════════════════════════════════════════════════════════════════════════════
	# 27. DESIGN SYSTEM — "Midnight Archive"
	# ══════════════════════════════════════════════════════════════════════════════

	# Stylesheet for the "Midnight Archive" dark theme. Design tokens live in the
	# :root block as --ma-* custom properties.
	# The "<selector> *" rules recolour every descendant of a bubble, table or
	# dropdown; without the universal selector they merely repeat the rule above
	# them, and the dropdown one would cancel the glass background on the dropdown
	# itself.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');

	@keyframes ma-slide-up {
	from { opacity: 0; transform: translateY(12px) scale(0.97); }
	to { opacity: 1; transform: translateY(0) scale(1); }
	}
	@keyframes ma-pulse-glow {
	0%, 100% { box-shadow: 0 0 6px rgba(52,199,89,0.45); }
	50% { box-shadow: 0 0 14px rgba(52,199,89,0.80); }
	}

	:root {
	--ma-bg: #0b0b0f;
	--ma-surface: #13141a;
	--ma-surface2: #1a1b22;
	--ma-surface3: #202129;
	--ma-border: rgba(255,255,255,0.08);
	--ma-border-focus: rgba(0,122,255,0.70);
	--ma-separator: #2c2c2e;
	--ma-text: #e2e2e7;
	--ma-text2: #98989f;
	--ma-text3: #5c5c61;
	--ma-blue: #007AFF;
	--ma-blue-dim: rgba(0,122,255,0.15);
	--ma-bubble-user: #007AFF;
	--ma-bubble-bot: #1c1d22;
	--ma-glass-bg: rgba(30,31,40,0.75);
	--ma-glass-bd: rgba(255,255,255,0.10);
	--ma-hero-bg: linear-gradient(160deg,#07080e 0%,#0e1124 55%,#060810 100%);
	--ma-r: 24px;
	--ma-r-sm: 14px;
	--ma-r-xs: 8px;
	--ma-r-pill: 999px;
	--ma-shadow: 0 4px 32px rgba(0,0,0,0.70);
	--ma-shadow-hero: 0 8px 56px rgba(0,0,0,0.85);
	--ma-shadow-card: 0 2px 24px rgba(0,0,0,0.60);
	--ma-shadow-input: 0 10px 30px -10px rgba(0,0,0,0.60);
	--ma-shadow-btn: 0 4px 18px rgba(0,122,255,0.45);
	}

	/* hide legacy toggles / Gradio footer */
	#theme-toggle,#sl-theme-btn,.show-api,.built-with,
	footer,.gradio-container>.footer,.svelte-1ipelgc { display:none !important; }

	html,body,.gradio-container,#root {
	background: var(--ma-bg) !important;
	color: var(--ma-text) !important;
	font-family: 'SF Pro Display','SF Pro Text',-apple-system,BlinkMacSystemFont,
	'Inter','Helvetica Neue',Arial,sans-serif !important;
	font-size: 16px; line-height: 1.5;
	-webkit-font-smoothing: antialiased;
	}
	.gradio-container {
	max-width: 860px !important;
	margin: 0 auto !important;
	padding: 20px 16px 56px !important;
	}
	.gradio-container * { color: var(--ma-text) !important; }

	/* ── Hero ─────────────────────────────────────────────────────── */
	.ma-hero {
	background: var(--ma-hero-bg) !important;
	border: 1px solid var(--ma-border); border-radius: var(--ma-r);
	padding: 52px 40px 40px; margin-bottom: 16px;
	text-align: center; box-shadow: var(--ma-shadow-hero);
	position: relative; overflow: hidden;
	}
	.ma-hero::before {
	content:''; position:absolute; inset:0;
	background: radial-gradient(ellipse 80% 50% at 50% -10%,rgba(0,122,255,0.22) 0%,transparent 65%);
	pointer-events: none;
	}
	.ma-hero h1 {
	font-size: 2.5rem !important; font-weight: 800 !important;
	letter-spacing: -0.03em !important; color: #ffffff !important;
	margin-bottom: 10px; position: relative;
	}
	.ma-desc {
	font-size: 0.97rem; color: rgba(226,226,231,0.65) !important;
	line-height: 1.75; margin-bottom: 26px; position: relative;
	}
	.ma-desc strong { color: #ffffff !important; }

	/* ── Badges ───────────────────────────────────────────────────── */
	.ma-badges { display:flex; gap:8px; justify-content:center; flex-wrap:wrap; position:relative; }
	.ma-badge {
	display:inline-flex; align-items:center; gap:6px;
	padding:6px 16px; border-radius:var(--ma-r-pill);
	font-size:0.68rem; font-weight:700; text-transform:uppercase; letter-spacing:0.10em;
	border:1px solid; backdrop-filter:blur(8px); -webkit-backdrop-filter:blur(8px);
	}
	.ma-badge-online {
	background:rgba(52,199,89,0.14) !important; color:#34c759 !important;
	border-color:rgba(52,199,89,0.30); animation:ma-pulse-glow 2.8s ease-in-out infinite;
	}
	.ma-badge-learn {
	background:rgba(0,122,255,0.14) !important; color:#5ac8fa !important;
	border-color:rgba(0,122,255,0.28);
	}
	.ma-badge-research {
	background:rgba(255,159,10,0.12) !important; color:#ff9f0a !important;
	border-color:rgba(255,159,10,0.28);
	}

	/* ── Cards ────────────────────────────────────────────────────── */
	.gradio-container .gradio-group,
	.gradio-container .gradio-box,
	.gradio-container .block,
	.gradio-container .panel,
	.gradio-container form {
	background:var(--ma-surface) !important;
	border:1px solid var(--ma-border) !important;
	border-radius:var(--ma-r) !important;
	box-shadow:var(--ma-shadow-card) !important;
	transition:background 0.25s;
	}
	.gradio-container .gradio-group .gradio-group,
	.gradio-container .block .block {
	background:var(--ma-surface2) !important; border-radius:var(--ma-r-sm) !important;
	}

	/* ── Chatbot ──────────────────────────────────────────────────── */
	.gradio-chatbot,.gradio-chatbot>div {
	background:var(--ma-surface) !important;
	border:1px solid var(--ma-border) !important;
	border-radius:var(--ma-r) !important;
	box-shadow:var(--ma-shadow-card) !important;
	}
	.gradio-chatbot .message-wrap,.gradio-chatbot .messages {
	display:flex !important; flex-direction:column !important;
	gap:4px !important; padding:16px !important;
	}
	.gradio-chatbot .message,[data-testid="user"],[data-testid="bot"] {
	animation:ma-slide-up 0.28s cubic-bezier(0.34,1.20,0.64,1) both !important;
	border:none !important; padding:12px 18px !important;
	font-size:16px !important; line-height:1.45 !important;
	margin-bottom:2px !important; max-width:85% !important;
	position:relative; word-break:break-word;
	}
	.gradio-chatbot .message.user,[data-testid="user"] {
	background:var(--ma-bubble-user) !important; color:#ffffff !important;
	border-radius:20px 20px 4px 20px !important;
	align-self:flex-end !important; margin-left:auto !important;
	box-shadow:0 2px 14px rgba(0,122,255,0.40) !important;
	}
	.gradio-chatbot .message.user *,[data-testid="user"] * { color:#ffffff !important; }
	.gradio-chatbot .message.bot,[data-testid="bot"] {
	background:var(--ma-bubble-bot) !important; color:var(--ma-text) !important;
	border-radius:20px 20px 20px 4px !important;
	align-self:flex-start !important; margin-right:auto !important;
	border:1px solid var(--ma-border) !important;
	box-shadow:0 2px 12px rgba(0,0,0,0.35) !important;
	}
	.gradio-chatbot .message.bot *,[data-testid="bot"] * { color:var(--ma-text) !important; }

	/* blockquotes */
	.gradio-chatbot .message.bot blockquote,[data-testid="bot"] blockquote {
	border-left:3px solid var(--ma-blue) !important;
	padding:10px 16px !important; margin:10px 0 !important;
	background:var(--ma-blue-dim) !important;
	border-radius:0 var(--ma-r-xs) var(--ma-r-xs) 0 !important;
	}
	.gradio-chatbot .message.bot blockquote *,[data-testid="bot"] blockquote * {
	color:var(--ma-text) !important;
	}

	/* tables — no vertical borders */
	.gradio-chatbot .message.bot table,[data-testid="bot"] table {
	border-collapse:collapse !important; width:100% !important;
	margin:12px 0 !important; border:none !important;
	}
	.gradio-chatbot .message.bot th,[data-testid="bot"] th {
	background:var(--ma-surface3) !important; color:var(--ma-text2) !important;
	padding:9px 14px !important; font-weight:600 !important; font-size:0.78em !important;
	text-transform:uppercase; letter-spacing:0.07em;
	border:none !important; border-bottom:1px solid var(--ma-separator) !important;
	}
	.gradio-chatbot .message.bot td,[data-testid="bot"] td {
	background:transparent !important; color:var(--ma-text) !important;
	padding:9px 14px !important;
	border:none !important; border-bottom:1px solid var(--ma-separator) !important;
	font-size:0.93em !important;
	}
	.gradio-chatbot .message.bot tr:last-child td,[data-testid="bot"] tr:last-child td {
	border-bottom:none !important;
	}
	.gradio-chatbot .message.bot th *,.gradio-chatbot .message.bot td *,
	[data-testid="bot"] th *,[data-testid="bot"] td * { color:var(--ma-text) !important; }

	/* code */
	.gradio-chatbot .message.bot code,[data-testid="bot"] code {
	background:var(--ma-surface3) !important; color:var(--ma-text) !important;
	padding:2px 7px !important; border-radius:var(--ma-r-xs) !important;
	font-size:0.85em !important; border:1px solid var(--ma-border) !important;
	}
	.gradio-chatbot .message.bot pre,[data-testid="bot"] pre {
	background:#0b0b0f !important; border:1px solid var(--ma-border) !important;
	border-radius:var(--ma-r-sm) !important; padding:16px !important; margin:10px 0 !important;
	}
	.gradio-chatbot .message.bot pre code,[data-testid="bot"] pre code {
	background:transparent !important; border:none !important;
	}
	.gradio-chatbot .message.bot a,[data-testid="bot"] a { color:#5ac8fa !important; }
	.gradio-chatbot .message.bot hr,[data-testid="bot"] hr {
	border:none !important; border-top:1px solid var(--ma-separator) !important; margin:14px 0 !important;
	}

	/* ── Input bar ────────────────────────────────────────────────── */
	textarea,input[type="text"],input[type="search"] {
	background:var(--ma-surface2) !important;
	border:1.5px solid var(--ma-border) !important;
	border-radius:var(--ma-r-pill) !important;
	color:var(--ma-text) !important;
	font-family:inherit !important; font-size:16px !important; padding:14px 24px !important;
	caret-color:var(--ma-blue) !important;
	box-shadow:var(--ma-shadow-input) !important;
	transition:border-color 0.25s cubic-bezier(0.4,0,0.2,1),
	box-shadow 0.25s cubic-bezier(0.4,0,0.2,1) !important;
	}
	textarea:focus,input[type="text"]:focus,input[type="search"]:focus {
	border-color:var(--ma-border-focus) !important;
	box-shadow:0 0 0 3px rgba(0,122,255,0.20),var(--ma-shadow-input) !important;
	outline:none !important;
	}
	textarea::placeholder,input::placeholder { color:var(--ma-text3) !important; }

	/* ── Labels ───────────────────────────────────────────────────── */
	label,.label-wrap,.block-label,.form-label,
	.gradio-radio label,.gradio-radio span {
	color:var(--ma-text2) !important;
	font-weight:600; font-size:0.82rem; text-transform:uppercase; letter-spacing:0.07em;
	}

	/* ── Dropdowns ────────────────────────────────────────────────── */
	.gradio-dropdown,[data-testid="dropdown"] {
	background:var(--ma-glass-bg) !important;
	border:1px solid var(--ma-glass-bd) !important;
	border-radius:var(--ma-r-pill) !important;
	backdrop-filter:blur(12px) !important; -webkit-backdrop-filter:blur(12px) !important;
	color:var(--ma-text) !important; transition:border-color 0.2s;
	}
	.gradio-dropdown:focus-within,[data-testid="dropdown"]:focus-within {
	border-color:var(--ma-border-focus) !important;
	box-shadow:0 0 0 3px rgba(0,122,255,0.15) !important;
	}
	.gradio-dropdown *,[data-testid="dropdown"] * {
	background:transparent !important; color:var(--ma-text) !important;
	}
	.gradio-dropdown>div,[data-testid="dropdown"]>div {
	background:var(--ma-glass-bg) !important; border-radius:var(--ma-r-pill) !important;
	}
	[role="listbox"],.gradio-dropdown ul,.gradio-dropdown .options {
	background:var(--ma-surface) !important;
	border:1px solid var(--ma-border) !important;
	border-radius:var(--ma-r-sm) !important;
	box-shadow:var(--ma-shadow) !important;
	backdrop-filter:blur(20px) !important; -webkit-backdrop-filter:blur(20px) !important;
	overflow:hidden;
	}
	[role="option"],.gradio-dropdown li {
	background:transparent !important; color:var(--ma-text) !important;
	padding:11px 20px !important; font-size:0.95rem; cursor:pointer;
	border-bottom:1px solid var(--ma-separator) !important; transition:background 0.12s;
	}
	[role="option"]:last-child,.gradio-dropdown li:last-child { border-bottom:none !important; }
	[role="option"]:hover,.gradio-dropdown li:hover,
	[role="option"][aria-selected="true"] { background:var(--ma-surface3) !important; }

	/* ── Radio ────────────────────────────────────────────────────── */
	.gradio-radio span,.gradio-radio label { color:var(--ma-text) !important; }

	/* ── Buttons ──────────────────────────────────────────────────── */
	button.primary,.gr-button-primary,[data-testid="submit-btn"] {
	border-radius:var(--ma-r-pill) !important; padding:14px 28px !important;
	background:var(--ma-blue) !important; border:none !important;
	color:#ffffff !important; font-family:inherit !important;
	font-weight:600 !important; font-size:16px !important;
	box-shadow:var(--ma-shadow-btn) !important;
	transition:transform 0.2s cubic-bezier(0.4,0,0.2,1),
	filter 0.2s cubic-bezier(0.4,0,0.2,1) !important;
	}
	button.primary:hover,.gr-button-primary:hover {
	transform:scale(1.02) !important; filter:brightness(1.10) !important;
	}
	button.primary:active { transform:scale(0.97) !important; }
	button.primary * { color:#ffffff !important; }

	button.secondary,.gr-button-secondary {
	border-radius:var(--ma-r-pill) !important; padding:12px 24px !important;
	background:var(--ma-glass-bg) !important;
	border:1px solid var(--ma-glass-bd) !important;
	color:var(--ma-text) !important; font-family:inherit !important; font-weight:500 !important;
	backdrop-filter:blur(12px) !important; -webkit-backdrop-filter:blur(12px) !important;
	transition:background 0.2s cubic-bezier(0.4,0,0.2,1),
	transform 0.2s cubic-bezier(0.4,0,0.2,1) !important;
	}
	button.secondary:hover {
	background:var(--ma-surface3) !important; transform:scale(1.02) !important;
	}
	button.secondary:active { transform:scale(0.98) !important; }
	button.secondary * { color:var(--ma-text) !important; }

	/* ── Examples table ───────────────────────────────────────────── */
	.gradio-dataframe,.gradio-dataset {
	background:var(--ma-surface) !important;
	border:1px solid var(--ma-border) !important;
	border-radius:var(--ma-r) !important;
	overflow:hidden !important; box-shadow:var(--ma-shadow-card) !important;
	}
	.gradio-dataframe label,.gradio-dataset label,
	.gradio-dataframe .label,.gradio-dataset .label {
	color:var(--ma-text2) !important; font-weight:700; font-size:0.78rem;
	text-transform:uppercase; letter-spacing:0.08em;
	padding:14px 20px 8px !important; display:block;
	}
	.gradio-dataframe table thead tr th,
	.gradio-dataset table thead tr th {
	background:var(--ma-surface2) !important; color:var(--ma-text2) !important;
	font-weight:600 !important; font-size:0.76rem !important;
	padding:10px 16px !important;
	border:none !important; border-bottom:1px solid var(--ma-separator) !important;
	text-transform:uppercase; letter-spacing:0.06em; white-space:nowrap;
	}
	.gradio-dataframe table tbody tr td,
	.gradio-dataset table tbody tr td {
	background:transparent !important; color:var(--ma-text) !important;
	padding:10px 16px !important;
	border:none !important; border-bottom:1px solid var(--ma-separator) !important;
	font-size:0.93rem !important;
	}
	.gradio-dataframe table tbody tr:last-child td,
	.gradio-dataset table tbody tr:last-child td { border-bottom:none !important; }
	.gradio-dataframe table tbody tr:hover td,
	.gradio-dataset table tbody tr:hover td {
	background:var(--ma-surface3) !important; cursor:pointer;
	}
	.gradio-dataframe table *,.gradio-dataset table * { color:var(--ma-text) !important; }
	.gradio-dataframe table,.gradio-dataset table {
	border-collapse:collapse !important; width:100% !important;
	}

	/* ── Mono export box ──────────────────────────────────────────── */
	.plain-text-box textarea {
	font-family:'SF Mono','Fira Code','JetBrains Mono',monospace !important;
	font-size:12px !important; line-height:1.65 !important;
	background:#0b0b0f !important; border-radius:var(--ma-r-sm) !important;
	color:var(--ma-text) !important; border-color:var(--ma-border) !important;
	}

	/* ── Markdown ─────────────────────────────────────────────────── */
	.gradio-markdown,.gradio-markdown * { color:var(--ma-text) !important; }
	.gradio-markdown a { color:#5ac8fa !important; }
	.gradio-markdown code {
	background:var(--ma-surface2) !important; color:var(--ma-text) !important;
	border:1px solid var(--ma-border) !important; padding:2px 7px; border-radius:var(--ma-r-xs);
	}
	.gradio-container .info,.gradio-container .description {
	color:var(--ma-text3) !important; font-size:0.83rem;
	}
	.gradio-container h3,.gradio-container h4 {
	font-weight:600 !important; color:var(--ma-text) !important; letter-spacing:-0.01em;
	}

	/* ── Footer ───────────────────────────────────────────────────── */
	.ma-footer {
	text-align:center; padding:24px 0 8px;
	font-size:0.78rem; color:var(--ma-text3) !important;
	}
	.ma-footer a { color:var(--ma-text2) !important; text-decoration:none; }
	.ma-footer a:hover { color:#5ac8fa !important; }

	/* ── Scrollbars ───────────────────────────────────────────────── */
	::-webkit-scrollbar { width:6px; height:6px; }
	::-webkit-scrollbar-track { background:transparent; }
	::-webkit-scrollbar-thumb { background:#3a3a3c; border-radius:10px; }
	::-webkit-scrollbar-thumb:hover { background:#636366; }
	"""


	# ══════════════════════════════════════════════════════════════════════════════
	# 28. GRADIO THEME
	# FIX: use plain strings only in font= list.
	# Mixing gr.themes.GoogleFont objects with plain strings triggers
	# AttributeError: 'str' object has no attribute 'name' in Gradio 6.
	# ══════════════════════════════════════════════════════════════════════════════

	# Dark theme derived from gr.themes.Soft. Font stacks are plain strings
	# only (see the note in the section header above).
	_GRADIO_THEME = gr.themes.Soft(
	    neutral_hue="slate",
	    secondary_hue="slate",
	    primary_hue="blue",
	    font=["SF Pro Display", "-apple-system", "BlinkMacSystemFont", "Inter", "sans-serif"],
	    font_mono=["SF Mono", "Fira Code", "monospace"],
	).set(
	    # Page body
	    body_background_fill="#0b0b0f",
	    body_text_color="#e2e2e7",
	    body_text_size="16px",
	    # Primary button
	    button_primary_background_fill="#007AFF",
	    button_primary_background_fill_hover="#0071e3",
	    button_primary_text_color="#ffffff",
	    # Inputs
	    input_background_fill="#1a1b22",
	    input_border_color="rgba(255,255,255,0.08)",
	    input_border_width="1.5px",
	    # Blocks / cards
	    block_background_fill="#13141a",
	    block_border_color="rgba(255,255,255,0.08)",
	    block_border_width="1px",
	    block_radius="24px",
	    block_shadow="0 4px 32px rgba(0,0,0,0.70)",
	    # Section headers
	    section_header_text_weight="600",
	)


	# ══════════════════════════════════════════════════════════════════════════════
	# 29. UI BUILDER
	# Gradio 6 requirements:
	# - theme + css passed to launch(), NOT gr.Blocks()
	# - bubble_full_width removed from gr.Chatbot()
	# - font list uses plain strings only (no GoogleFont mix)
	# ══════════════════════════════════════════════════════════════════════════════

	def _create_app() -> gr.Blocks:
	    """Build the Gradio UI and wire its event handlers.

	    The layout consists of a hero banner, the mode / category / source
	    selectors, the chatbot, the query row, a clear button, an initially
	    hidden export group, a set of example queries and a footer.

	    Theme and CSS are intentionally not passed to ``gr.Blocks`` here; they
	    are supplied to ``launch()`` by the startup code.

	    Returns:
	        The assembled ``gr.Blocks`` instance (not yet queued or launched).
	    """
	    with gr.Blocks(title="AI Sikh Librarian") as demo:

	        with gr.Column(elem_id="main-content"):

	            gr.HTML("""
	<div class="ma-hero">
	<h1>📚 AI Sikh Librarian</h1>
	<p class="ma-desc">
	Scholarly research tool  ·
	758M+ words of Sikh scriptures, history & manuscripts<br>
	<strong>Qwen2.5-72B</strong>  ·
	Hybrid BM25 + FAISS  ·
	Every passage  ·  Chicago citations
	</p>
	<div class="ma-badges">
	<span class="ma-badge ma-badge-online">●  Online</span>
	<span class="ma-badge ma-badge-learn">📖  Learn Mode</span>
	<span class="ma-badge ma-badge-research">🔍  Research Mode</span>
	</div>
	</div>
	""")

	            with gr.Row(equal_height=True):
	                mode_radio = gr.Radio(
	                    choices=["Learn", "Research"],
	                    value="Learn",
	                    label="Mode",
	                    # Plain "|" separator: the former "\|" was an invalid
	                    # string escape (SyntaxWarning on Python 3.12+).
	                    info="Learn: Qwen2.5-72B scholarly Q&A | Research: every passage, no LLM",
	                )
	                category_dd = gr.Dropdown(
	                    choices=CATEGORY_OPTIONS,
	                    value="All",
	                    label="Filter by Category",
	                )

	            # Snapshot the source facets under their lock; the dropdown is
	            # updated again on page load via demo.load() below.
	            with _source_facets_lock:
	                initial_sources = list(_source_facets)

	            source_dd = gr.Dropdown(
	                choices=initial_sources,
	                value="All Sources",
	                label="Filter by Source (granth / manuscript)",
	                visible=False,
	            )

	            chatbot = gr.Chatbot(
	                value=[],
	                height=640,
	                show_label=False,
	                render_markdown=True,
	            )

	            with gr.Row():
	                query_box = gr.Textbox(
	                    placeholder="Message Sikh Librarian\u2026",
	                    show_label=False,
	                    container=False,
	                    scale=9,
	                    lines=1,
	                    max_lines=5,
	                )
	                submit_btn = gr.Button("Send", variant="primary", scale=1, min_width=90)

	            clear_btn = gr.Button("Clear conversation", variant="secondary", size="sm")

	            # Export widgets start hidden; toggle_mode_ui controls them.
	            with gr.Group(visible=False) as export_group:
	                gr.Markdown("### Export Results")
	                plain_text_box = gr.Textbox(
	                    label="Plain text \u2014 copy or download",
	                    lines=12, interactive=False,
	                    elem_classes=["plain-text-box"],
	                )
	                download_btn = gr.DownloadButton(
	                    label="Download .txt", visible=False,
	                    variant="secondary", size="sm",
	                )

	            # Each example row is [mode, category, source, query], matching
	            # the order of ``inputs`` below.
	            gr.Examples(
	                examples=[
	                    ["Learn", "All", "All Sources",
	                     "What does Guru Granth Sahib teach about mental health and inner peace?"],
	                    ["Learn", "All", "All Sources",
	                     "Explain the concept of sant-sipahi in Sikh thought"],
	                    ["Learn", "Gurbani", "All Sources",
	                     "What is the significance of amrit vela in Gurbani?"],
	                    ["Research", "All", "All Sources", "naam simran"],
	                    ["Research", "Gurbani", "All Sources", "waheguru naam simran meditation"],
	                    ["Research", "Granths", "All Sources", "Khalsa identity sovereignty"],
	                    ["Research", "Steeks", "All Sources", "japji sahib commentary"],
	                    ["Research", "Literature", "All Sources", "martyrdom sacrifice shaheedi"],
	                ],
	                inputs=[mode_radio, category_dd, source_dd, query_box],
	                label="Example queries \u2014 click to load, then press Send",
	            )

	            gr.HTML("""
	<div class="ma-footer">
	<a href="https://huggingface.co/datasets/jsdosanj/SikhLibrary"
	target="_blank" rel="noopener noreferrer">SikhLibrary Digital Archive</a>
	·  Qwen2.5-72B-Instruct  ·
	Hybrid BM25 + FAISS  ·
	Chicago Manual of Style 17th ed.  ·  CC BY-NC-ND 4.0
	</div>
	""")

	        # ── Event wiring ────────────────────────────────────────────────
	        submit_inputs = [query_box, chatbot, mode_radio, category_dd, source_dd]
	        result_outputs = [chatbot, plain_text_box, download_btn]

	        # Enter in the textbox and the Send button share one pipeline:
	        # run the query, then empty the query box.
	        for trigger in (query_box.submit, submit_btn.click):
	            trigger(
	                fn=submit_query, inputs=submit_inputs, outputs=result_outputs,
	            ).then(fn=lambda: "", outputs=query_box)

	        clear_btn.click(
	            fn=clear_conversation,
	            outputs=result_outputs,
	        )

	        mode_radio.change(
	            fn=toggle_mode_ui,
	            inputs=mode_radio,
	            outputs=[export_group, download_btn, query_box, source_dd],
	        )

	        # On page load, send get_source_choices()'s result to the source
	        # dropdown.
	        demo.load(fn=get_source_choices, outputs=source_dd)

	    return demo


	# ══════════════════════════════════════════════════════════════════════════════
	# 30. STARTUP
	# ══════════════════════════════════════════════════════════════════════════════

	# Startup banner: a single print call, one configuration line per argument.
	print(
	    "\U0001f680 Starting AI Sikh Librarian v22 \u2014 Midnight Archive Edition",
	    f" FREE_TIER : {FREE_TIER}",
	    f" MMR reranking : {'enabled (CPU Upgrade)' if not FREE_TIER else 'disabled'}",
	    f" Per-source cap : {PER_SOURCE_CAP} passages",
	    f" langdetect : {'not installed' if not _LANGDETECT_AVAILABLE else 'available'}",
	    f" EMBED_CACHE_SIZE : {EMBED_CACHE_SIZE:,}",
	    sep="\n",
	)

	_init_llm_client()

	# Build the UI and enable the request queue; the queue is smaller on the
	# free tier.
	demo = _create_app()
	demo.queue(
	    default_concurrency_limit=1,
	    max_size=20 if FREE_TIER else 100,
	)

	# Load the index on a daemon thread, started at import time before any
	# launch() call.
	_index_thread = threading.Thread(daemon=True, target=_load_index_background)
	_index_thread.start()

	if __name__ == "__main__":
	    demo.launch(
	        theme=_GRADIO_THEME,  # Gradio 6: theme in launch(), not gr.Blocks()
	        css=CUSTOM_CSS,  # Gradio 6: css in launch(), not gr.Blocks()
	        server_name="0.0.0.0",
	        server_port=7860,
	        max_threads=8 if FREE_TIER else 40,
	        ssr_mode=False,
	        show_error=False,
	    )