Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| app.py — AI Sikh Librarian v22 | |
| ================================ | |
| "Midnight Archive" — iOS 26 Glassmorphism redesign | |
| Gradio 6 fixes applied: | |
| - theme + css passed to launch(), not gr.Blocks() | |
| - bubble_full_width removed from gr.Chatbot() | |
| - font= list uses plain strings only (no GoogleFont mix — causes AttributeError) | |
| """ | |
| import gc, html, json, os, pickle, re, sqlite3, tempfile, threading, time | |
| from collections import OrderedDict | |
| from pathlib import Path | |
| from typing import Dict, Generator, List, Optional, Tuple | |
| import bm25s as bm25s_lib | |
| import faiss | |
| import gradio as gr | |
| import numpy as np | |
| from huggingface_hub import InferenceClient, hf_hub_download | |
| from rank_bm25 import BM25Okapi | |
| from sentence_transformers import SentenceTransformer | |
| try: | |
| from langdetect import detect as _langdetect | |
| from langdetect.lang_detect_exception import LangDetectException | |
| _LANGDETECT_AVAILABLE = True | |
| except ImportError: | |
| _LANGDETECT_AVAILABLE = False | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 1. CONFIGURATION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# --- Deployment / model selection -------------------------------------------
# FREE_TIER is read from the environment; when true, _mmr_rerank() skips MMR.
FREE_TIER = os.environ.get("FREE_TIER", "false").lower() == "true"
LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"  # passed to InferenceClient
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # loaded by _init_embedder()
# Hub dataset repo and sub-directory the index files are downloaded from.
STORAGE_REPO = "jsdosanj/SikhLibrarian-storage"
STORAGE_SUBDIR = "index_output_v2"
# --- Retrieval tuning ---------------------------------------------------------
# NOTE(review): constants without a comment are not referenced in this section
# of the file; their consumers presumably live further down.
RELEVANCE_THRESHOLD = 0.40
RESEARCH_FAISS_K = 600
RESEARCH_BM25_K = 600  # default k of _bm25_search()
MAX_RESEARCH_PASSAGES = 100
PER_SOURCE_CAP = 15  # max hits per display_name kept by _apply_source_cap()
RRF_K = 60  # rank offset in the reciprocal-rank-fusion formula 1 / (k + rank)
MMR_LAMBDA = 0.7  # relevance weight in _mmr_rerank(); redundancy gets 1 - lambda
MMR_TOP_K = 120  # default number of hits _mmr_rerank() returns
LEARN_FAISS_K = 20
LEARN_MODE_K = 5
MAX_CONTEXT_WORDS = 1_400
SNIPPET_WORDS = 80  # default snippet window length (in words)
SNIPPET_STEP = 5  # stride (in words) between candidate snippet windows
# --- Query limits -------------------------------------------------------------
MAX_QUERY_LEN = 500  # _sanitize() truncates to this many characters
MIN_QUERY_LEN = 3  # sub-query parts must be longer than this (see _multi_query_search)
MAX_SUBQUERIES = 3  # at most this many sub-queries are searched
# --- LLM call parameters ------------------------------------------------------
LLM_MAX_TOKENS = 2_048
LLM_TEMPERATURE = 0.15
LLM_TOP_P = 0.9
LLM_MAX_RETRIES = 3
LLM_RETRY_DELAYS = [5, 15, 30]
EMBED_CACHE_SIZE = 10_000  # capacity of the module-level _EmbedCache
# --- Storage locations --------------------------------------------------------
# _resolve_index_dir() checks BUCKET_DIR first, then CACHE_DIR.
BUCKET_DIR = Path("/data") / STORAGE_SUBDIR
CACHE_DIR = Path("./index_cache")  # also the download target of _download_from_hub()
INDEX_FILE = "faiss.index"
SQLITE_FILE = "doc_store.sqlite"
META_FILE = "meta.json"
BM25_FILE = "bm25.pkl"
ANALYTICS_DB = Path("/data/query_log.sqlite")  # opened by _log_query_async()
CATEGORY_OPTIONS = ["All", "Gurbani", "Granths", "Steeks", "Literature", "Research"]
# Source display names (Gurmukhi text written as \u escapes). They seed the
# initial source-facet list and are always merged into it by
# _build_source_facets(), whether or not the loaded metadata mentions them.
KNOWN_SOURCES: List[str] = [
    "\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40 (112 \u0a2d\u0a3e\u0a38\u0a3c\u0a3e\u0a35\u0a3e\u0a02)",
    "\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25",
    "\u0a38\u0a4d\u0a30\u0a40 \u0a38\u0a30\u0a2c\u0a32\u0a4b\u0a39 \u0a17\u0a4d\u0a30\u0a70\u0a25",
    "\u0a2d\u0a3e\u0a08 \u0a17\u0a41\u0a30\u0a26\u0a3e\u0a38 \u0a1c\u0a40 \u0a26\u0a40\u0a06\u0a02 \u0a35\u0a3e\u0a30\u0a3e\u0a02",
    "\u0a2d\u0a3e\u0a08 \u0a17\u0a41\u0a30\u0a26\u0a3e\u0a38 \u0a38\u0a3f\u0a70\u0a18 \u0a1c\u0a40 \u0a26\u0a40\u0a06\u0a02 \u0a15\u0a2c\u0a3f\u0a71\u0a24 \u0a38\u0a35\u0a71\u0a0d\u0a0f",
    "\u0a2e\u0a39\u0a3e\u0a28 \u0a15\u0a4b\u0a38\u0a3c",
    "\u0a38\u0a42\u0a30\u0a1c \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
    "\u0a2a\u0a70\u0a25 \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
    "\u0a17\u0a41\u0a30 \u0a2a\u0a4d\u0a30\u0a24\u0a3e\u0a2a \u0a38\u0a42\u0a30\u0a1c \u0a17\u0a4d\u0a30\u0a70\u0a25",
    "\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30 \u0a38\u0a4b\u0a2d\u0a3e",
    "\u0a2c\u0a70\u0a38\u0a3e\u0a35\u0a32\u0a40\u0a28\u0a3e\u0a2e\u0a3e",
    "\u0a2e\u0a39\u0a3f\u0a2e\u0a3e \u0a2a\u0a4d\u0a30\u0a15\u0a3e\u0a38\u0a3c",
    "\u0a38\u0a3c\u0a2c\u0a26\u0a3e\u0a30\u0a25 \u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40",
    "\u0a2b\u0a30\u0a40\u0a26\u0a15\u0a4b\u0a1f \u0a35\u0a3e\u0a32\u0a3e \u0a1f\u0a40\u0a15\u0a3e",
    "\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a26\u0a30\u0a2a\u0a23",
    "\u0a38\u0a39\u0a3f\u0a1c \u0a2a\u0a3e\u0a20 \u0a26\u0a40 \u0a38\u0a70\u0a25\u0a3f\u0a06",
    "\u0a1c\u0a28\u0a2e \u0a38\u0a3e\u0a16\u0a40 \u0a2d\u0a3e\u0a08 \u0a2e\u0a28\u0a40 \u0a38\u0a3f\u0a70\u0a18",
    "\u0a1c\u0a28\u0a2e \u0a38\u0a3e\u0a16\u0a40 \u0a35\u0a3e\u0a32\u0a40 \u0a35\u0a3e\u0a32\u0a3e",
    "\u0a30\u0a39\u0a3f\u0a24\u0a28\u0a3e\u0a2e\u0a47",
    "\u0a1c\u0a70\u0a17\u0a28\u0a3e\u0a2e\u0a47",
    "\u0a38\u0a3f\u0a71\u0a16 \u0a10\u0a28\u0a38\u0a3e\u0a08\u0a15\u0a32\u0a4b\u0a2a\u0a40\u0a21\u0a40\u0a06",
    "\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e \u0a21\u0a47\u0a1f\u0a3e\u0a2c\u0a47\u0a38",
]
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 2. GLOBAL STATE | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Index state assigned by _load_index_background(); None / empty until then.
_vector_index: Optional[faiss.IndexFlatIP] = None
_metadata_store: List[Dict] = []  # indexed by chunk id in the search functions
_sqlite_path: Optional[str] = None  # path to the chunk-text database, if present
_embedder: Optional[SentenceTransformer] = None  # set by _init_embedder()
_llm_client: Optional[InferenceClient] = None  # set by _init_llm_client()
# BM25 state assigned by _build_bm25_index().
_bm25_index: object = None
_bm25_ids: List[int] = []  # maps a BM25 result position to a chunk id
_bm25_vocab: Dict[str, int] = {}
_bm25_type: str = ""  # "bm25s", "rank-bm25", or "" when no BM25 index is loaded
# Source facet list; rebuilt under _source_facets_lock by _build_source_facets().
_source_facets = ["All Sources"] + sorted(KNOWN_SOURCES)
_source_facets_lock = threading.Lock()
# Set once background loading has finished, whether it succeeded or failed;
# _index_status["state"] tells the two outcomes apart.
_index_ready = threading.Event()
_index_status = {"state": "starting", "progress": "", "error": None}
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 3. LRU EMBED CACHE | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
class _EmbedCache:
    """Lock-protected LRU cache from query string to embedding array.

    Arrays are copied both when stored and when returned, so callers never
    share memory with a cached entry. Hit and miss counts are tracked for
    :meth:`stats`.
    """

    def __init__(self, maxsize: int = EMBED_CACHE_SIZE) -> None:
        self._lock = threading.Lock()
        self._maxsize = maxsize
        self._cache = OrderedDict()
        self._hits = 0
        self._misses = 0

    def get(self, key: str) -> Optional[np.ndarray]:
        """Return a copy of the cached array for *key*, or None on a miss."""
        with self._lock:
            if key not in self._cache:
                self._misses += 1
                return None
            # A hit makes the entry the most recently used one.
            self._cache.move_to_end(key)
            self._hits += 1
            return self._cache[key].copy()

    def put(self, key: str, value: np.ndarray) -> None:
        """Store a copy of *value* under *key*, evicting the oldest entry if full."""
        with self._lock:
            store = self._cache
            if key in store:
                store.move_to_end(key)
            elif len(store) >= self._maxsize:
                # Only a brand-new key can push the cache over capacity.
                store.popitem(last=False)
            store[key] = value.copy()

    def stats(self) -> Dict:
        """Return hit rate (as a percentage string), current size and capacity."""
        with self._lock:
            hits = self._hits
            lookups = hits + self._misses
            size = len(self._cache)
        rate = round(hits / lookups * 100, 1) if lookups else 0.0
        return {"hit_rate": f"{rate}%", "size": size, "capacity": self._maxsize}


_embed_cache = _EmbedCache(maxsize=EMBED_CACHE_SIZE)
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 4. LANGUAGE DETECTION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Human-readable names for the language codes returned by langdetect;
# _detect_language() falls back to the upper-cased code for anything not listed.
_LANG_NAMES: Dict[str, str] = {
    "en": "English", "pa": "Punjabi", "hi": "Hindi", "ur": "Urdu",
    "ro": "Romanian", "fr": "French", "de": "German", "es": "Spanish",
    "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
    "ru": "Russian", "tr": "Turkish", "ar": "Arabic", "fa": "Persian",
    "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
    "ja": "Japanese", "ko": "Korean", "sv": "Swedish", "no": "Norwegian",
    "da": "Danish", "fi": "Finnish", "cs": "Czech", "sk": "Slovak",
    "hu": "Hungarian", "bg": "Bulgarian", "hr": "Croatian", "sr": "Serbian",
    "uk": "Ukrainian", "el": "Greek", "he": "Hebrew", "th": "Thai",
    "bn": "Bengali", "ta": "Tamil", "te": "Telugu", "mr": "Marathi",
    "gu": "Gujarati", "kn": "Kannada", "ml": "Malayalam",
}
# Matches a single character from the Gurmukhi Unicode block (U+0A00-U+0A7F).
_GURMUKHI_RE = re.compile(r"[\u0A00-\u0A7F]")
def _detect_language(text: str) -> str:
    """Return a display name for the language of *text*, or "" if unknown.

    Text whose stripped length is under 10 characters is not classified.
    When more than a quarter of the characters are Gurmukhi the answer is
    "Punjabi (Gurmukhi)" without consulting langdetect; otherwise the first
    300 characters go to langdetect (if installed) and the resulting code is
    mapped through ``_LANG_NAMES``, falling back to the upper-cased code.
    Any detection error yields "".
    """
    if not text:
        return ""
    if len(text.strip()) < 10:
        return ""

    gurmukhi_chars = sum(1 for _ in _GURMUKHI_RE.finditer(text))
    if gurmukhi_chars / max(len(text), 1) > 0.25:
        return "Punjabi (Gurmukhi)"

    if not _LANGDETECT_AVAILABLE:
        return ""

    try:
        code = _langdetect(text[:300])
        label = _LANG_NAMES.get(code, code.upper())
    except Exception:
        return ""
    return label
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 5. INPUT SANITISATION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Matches every character that is NOT allowed in a query: anything other than
# word characters, whitespace, basic punctuation, en/em dashes, "/", and the
# explicit ranges U+0A00-U+0A7F (Gurmukhi), U+0900-U+097F (Devanagari) and
# U+00C0-U+024F (accented Latin letters). _sanitize() replaces matches with spaces.
_ALLOWED_RE = re.compile(
    r"[^\w\s.,;:!?()\'\"\-\u2013\u2014/\u0A00-\u0A7F\u0900-\u097F\u00C0-\u024F]",
    re.UNICODE,
)
# Patterns for the Gurmukhi word "\u0a05\u0a70\u0a17" followed by optional
# (possibly non-breaking) whitespace and a number. _ANG_FULL_RE captures the
# whole reference; the other two capture only the number.
# NOTE(review): _ANG_NUM_RE and _ANG_IN_TEXT_RE are the same pattern.
_ANG_FULL_RE = re.compile(r"(\u0a05\u0a70\u0a17[\s\u00A0]*\d+)", re.UNICODE)
_ANG_NUM_RE = re.compile(r"\u0a05\u0a70\u0a17[\s\u00A0]*(\d+)", re.UNICODE)
_ANG_IN_TEXT_RE = re.compile(r"\u0a05\u0a70\u0a17[\s\u00A0]*(\d+)", re.UNICODE)
def _sanitize(text: str) -> str:
    """Normalise raw user input into a safe, bounded query string.

    HTML entities are decoded, every character matched by ``_ALLOWED_RE``
    (i.e. every disallowed character) is replaced by a space, whitespace runs
    are collapsed, and the result is limited to ``MAX_QUERY_LEN`` characters.

    Returns "" for any non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = _ALLOWED_RE.sub(" ", text)
    collapsed = re.sub(r"\s+", " ", text).strip()
    # Truncation can cut right after a space; strip again so the result never
    # ends in whitespace.
    return collapsed[:MAX_QUERY_LEN].rstrip()
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 6. QUERY EXPANSION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Lower-case trigger phrase -> extra search terms (Gurmukhi spellings written
# as \u escapes, plus some English synonyms). _expand_query() appends the terms
# of every trigger that occurs as a substring of the lower-cased query.
_QUERY_EXPANSIONS: Dict[str, List[str]] = {
    "waheguru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
    "wahiguru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
    "waheguruji": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
    "wahe guru": ["\u0a35\u0a3e\u0a39\u0a3f\u0a17\u0a41\u0a30\u0a42"],
    "guru granth": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40", "\u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25"],
    "guru granth sahib": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40"],
    "sggs": ["\u0a38\u0a4d\u0a30\u0a40 \u0a17\u0a41\u0a30\u0a42 \u0a17\u0a30\u0a70\u0a25 \u0a38\u0a3e\u0a39\u0a3f\u0a2c \u0a1c\u0a40"],
    "japji": ["\u0a1c\u0a2a\u0a41 \u0a1c\u0a40 \u0a38\u0a3e\u0a39\u0a3f\u0a2c", "\u0a1c\u0a2a\u0a41\u0a1c\u0a40"],
    "japji sahib": ["\u0a1c\u0a2a\u0a41 \u0a1c\u0a40 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
    "naam simran": ["\u0a28\u0a3e\u0a2e \u0a38\u0a3f\u0a2e\u0a30\u0a28"],
    "nam simran": ["\u0a28\u0a3e\u0a2e \u0a38\u0a3f\u0a2e\u0a30\u0a28"],
    "naam": ["\u0a28\u0a3e\u0a2e"],
    "simran": ["\u0a38\u0a3f\u0a2e\u0a30\u0a28"],
    "seva": ["\u0a38\u0a47\u0a35\u0a3e"],
    "sewa": ["\u0a38\u0a47\u0a35\u0a3e"],
    "ardas": ["\u0a05\u0a30\u0a26\u0a3e\u0a38"],
    "amritvela": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24 \u0a35\u0a47\u0a32\u0a3e"],
    "amrit vela": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24 \u0a35\u0a47\u0a32\u0a3e"],
    "amrit": ["\u0a05\u0a70\u0a2e\u0a4d\u0a30\u0a3f\u0a24"],
    "sant sipahi": ["\u0a38\u0a70\u0a24-\u0a38\u0a3f\u0a2a\u0a3e\u0a39\u0a40"],
    "sant-sipahi": ["\u0a38\u0a70\u0a24-\u0a38\u0a3f\u0a2a\u0a3e\u0a39\u0a40"],
    "chardi kala": ["\u0a1a\u0a5c\u0a4d\u0a39\u0a26\u0a40 \u0a15\u0a32\u0a3e"],
    "khalsa": ["\u0a16\u0a3c\u0a3e\u0a32\u0a38\u0a3e", "\u0a16\u0a3e\u0a32\u0a38\u0a3e"],
    "akal purakh": ["\u0a05\u0a15\u0a3e\u0a32 \u0a2a\u0a41\u0a30\u0a16"],
    "akaal purakh": ["\u0a05\u0a15\u0a3e\u0a32 \u0a2a\u0a41\u0a30\u0a16"],
    "gurbani": ["\u0a17\u0a41\u0a30\u0a2c\u0a3e\u0a23\u0a40"],
    "shabad": ["\u0a38\u0a3c\u0a2c\u0a26", "\u0a38\u0a2c\u0a26"],
    "shabads": ["\u0a38\u0a3c\u0a2c\u0a26", "\u0a38\u0a2c\u0a26"],
    "dasam granth": ["\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25"],
    "dasam": ["\u0a38\u0a4d\u0a30\u0a40 \u0a26\u0a38\u0a2e \u0a17\u0a4d\u0a30\u0a70\u0a25"],
    "sarbloh": ["\u0a38\u0a4d\u0a30\u0a40 \u0a38\u0a30\u0a2c\u0a32\u0a4b\u0a39 \u0a17\u0a4d\u0a30\u0a70\u0a25"],
    "nitnem": ["\u0a28\u0a3f\u0a24\u0a28\u0a47\u0a2e"],
    "rehras": ["\u0a30\u0a39\u0a3f\u0a30\u0a3e\u0a38 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
    "kirtan sohila": ["\u0a15\u0a40\u0a30\u0a24\u0a28 \u0a38\u0a4b\u0a39\u0a3f\u0a32\u0a3e"],
    "anand sahib": ["\u0a06\u0a28\u0a70\u0a26 \u0a38\u0a3e\u0a39\u0a3f\u0a2c"],
    "mool mantar": ["\u0a2e\u0a42\u0a32 \u0a2e\u0a70\u0a24\u0a30"],
    "mool mantra": ["\u0a2e\u0a42\u0a32 \u0a2e\u0a70\u0a24\u0a30"],
    "ik onkar": ["\u0a74", "\u0a07\u0a71\u0a15 \u0a13\u0a05\u0a70\u0a15\u0a3e\u0a30"],
    "haumai": ["\u0a39\u0a09\u0a2e\u0a48"],
    "hukam": ["\u0a39\u0a41\u0a15\u0a2e"],
    "nadar": ["\u0a28\u0a26\u0a30\u0a3f"],
    "kirpa": ["\u0a15\u0a3f\u0a30\u0a2a\u0a3e"],
    "grace": ["\u0a15\u0a3f\u0a30\u0a2a\u0a3e", "\u0a28\u0a26\u0a30\u0a3f"],
    "death": ["mortality", "passing", "\u0a2e\u0a4c\u0a24"],
    "fear": ["anxiety", "dread", "\u0a21\u0a30"],
    "mental health": ["inner peace", "wellbeing", "\u0a2e\u0a28"],
    "courage": ["bravery", "fearlessness", "\u0a39\u0a3f\u0a70\u0a2e\u0a24"],
    "love": ["devotion", "bhakti", "\u0a2a\u0a4d\u0a30\u0a47\u0a2e", "\u0a2d\u0a17\u0a24\u0a40"],
    "meditation": ["contemplation", "\u0a38\u0a3f\u0a2e\u0a30\u0a28", "\u0a27\u0a3f\u0a06\u0a28"],
    "prayer": ["supplication", "\u0a05\u0a30\u0a26\u0a3e\u0a38", "\u0a2a\u0a4d\u0a30\u0a3e\u0a30\u0a25\u0a28\u0a3e"],
    "equality": ["justice", "equity", "\u0a38\u0a2e\u0a3e\u0a28\u0a24\u0a3e"],
    "martyrdom": ["sacrifice", "shaheedi", "\u0a38\u0a3c\u0a39\u0a40\u0a26\u0a40"],
    "sovereignty": ["miri piri", "\u0a2e\u0a40\u0a30\u0a40 \u0a2a\u0a40\u0a30\u0a40"],
    "langar": ["\u0a32\u0a70\u0a17\u0a30", "community kitchen"],
    "gurdwara": ["\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e"],
    "gurdwaras": ["\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a47", "\u0a17\u0a41\u0a30\u0a26\u0a41\u0a06\u0a30\u0a3e"],
}
def _expand_query(query: str) -> str:
    """Append known synonyms / Gurmukhi spellings to *query*.

    Every trigger in ``_QUERY_EXPANSIONS`` that occurs as a substring of the
    lower-cased query contributes its terms. Terms that already appear as a
    whitespace-separated token of the query are skipped, and each term is
    appended at most once, in first-seen order.

    Returns the query unchanged when nothing new would be added.
    """
    q_lower = query.lower()
    expansions = [
        term
        for trigger, terms in _QUERY_EXPANSIONS.items()
        if trigger in q_lower
        for term in terms
    ]
    if not expansions:
        return query
    existing = set(query.split())
    # Overlapping triggers ("shabad" / "shabads", "naam simran" / "naam") yield
    # the same term repeatedly; dict.fromkeys de-duplicates and keeps order.
    new_terms = [t for t in dict.fromkeys(expansions) if t not in existing]
    return (query + " " + " ".join(new_terms)) if new_terms else query
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 7. CLIENT INITIALISATION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _init_llm_client() -> None:
    """Create the module-level Hugging Face inference client.

    Raises:
        EnvironmentError: if the ``HF_TOKEN`` environment variable is unset
            or empty.
    """
    global _llm_client
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        _llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
        print(f"\u2705 LLM client ready \u2014 {LLM_MODEL}")
        return
    raise EnvironmentError("HF_TOKEN not set.")
def _init_embedder() -> None:
    """Load the sentence-embedding model on CPU and run one warm-up encode."""
    global _embedder
    print(f"Loading embedding model ({EMBEDDING_MODEL})...")
    model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
    _embedder = model
    # One throwaway encode right after loading, before the model is reported ready.
    model.encode(["warmup"], convert_to_numpy=True)
    print("\u2705 Embedding model ready")
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 8. INDEX LOADER | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _resolve_index_dir() -> Optional[Path]:
    """Return the first directory that already holds the index, else None.

    ``BUCKET_DIR`` is checked before ``CACHE_DIR``; a directory qualifies only
    when it contains both the FAISS index file and the metadata file.
    """
    for folder in (BUCKET_DIR, CACHE_DIR):
        if not (folder / INDEX_FILE).exists():
            continue
        if not (folder / META_FILE).exists():
            continue
        print(f"\U0001f4c1 Index found at: {folder}")
        return folder
    return None
def _download_from_hub(token: str) -> Path:
    """Download the index files from the Hub dataset repo into ``CACHE_DIR``.

    Each file is fetched independently and then moved out of the
    ``STORAGE_SUBDIR`` sub-folder into the top of ``CACHE_DIR`` (unless a file
    with that name is already there). A failure for one file is printed and
    does not stop the remaining downloads.

    Returns:
        ``CACHE_DIR``, regardless of how many files were fetched.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Downloading from {STORAGE_REPO}/{STORAGE_SUBDIR}...")
    wanted = (INDEX_FILE, SQLITE_FILE, META_FILE, BM25_FILE)
    for fname in wanted:
        _index_status["progress"] = f"Downloading {fname}..."
        try:
            hf_hub_download(
                repo_id=STORAGE_REPO,
                filename=f"{STORAGE_SUBDIR}/{fname}",
                repo_type="dataset",
                local_dir=str(CACHE_DIR),
                token=token,
            )
            nested = CACHE_DIR / STORAGE_SUBDIR / fname
            flat = CACHE_DIR / fname
            # Flatten the layout so the loader finds files directly in CACHE_DIR.
            if nested.exists() and not flat.exists():
                nested.rename(flat)
            print(f" \u2705 {fname}")
        except Exception as err:
            print(f" \u26a0\ufe0f Could not download {fname}: {err}")
    return CACHE_DIR
def _build_bm25_index(idx_dir: Path) -> None:
    """Populate the module-level BM25 state for lexical search.

    Preference order:

    1. Load the pre-built pickle ``idx_dir / BM25_FILE`` (type ``"bm25s"``).
    2. Otherwise, or if the pickle cannot be loaded, tokenise every row of the
       SQLite ``chunks`` table on whitespace and build a ``BM25Okapi`` index
       (type ``"rank-bm25"``).

    If neither source is usable the BM25 globals are left untouched and a
    warning is printed; nothing is raised for those cases.
    """
    global _bm25_index, _bm25_ids, _bm25_vocab, _bm25_type
    pkl_path = idx_dir / BM25_FILE
    if pkl_path.exists():
        size_mb = pkl_path.stat().st_size / 1_048_576
        _index_status["progress"] = "Loading pre-built BM25 index..."
        print(f"\U0001f4e6 Loading pre-built BM25 index ({size_mb:.0f} MB)...")
        t0 = time.time()
        try:
            # SECURITY NOTE: pickle.load can execute arbitrary code. This file
            # comes from the storage repo / data volume and must be trusted.
            with open(pkl_path, "rb") as f:
                payload = pickle.load(f)
            # Read every field before touching the globals so a malformed
            # payload cannot leave the BM25 state half-assigned.
            retriever = payload["retriever"]
            ids = payload["ids"]
            vocab = payload.get("vocab", {})
        except Exception as e:
            print(f"\u26a0\ufe0f bm25.pkl load failed ({e}) \u2014 rebuilding from SQLite")
        else:
            _bm25_index = retriever
            _bm25_ids = ids
            _bm25_vocab = vocab
            _bm25_type = "bm25s"
            print(f"\u2705 BM25 loaded in {time.time()-t0:.1f}s \u2014 {len(_bm25_ids):,} docs")
            return

    if _sqlite_path is None:
        print("\u26a0\ufe0f BM25 skipped \u2014 no pkl and no SQLite.")
        return

    _index_status["progress"] = "Building BM25 from SQLite..."
    print("\U0001f4e6 Building BM25 from SQLite...")
    t0 = time.time()
    try:
        conn = sqlite3.connect(_sqlite_path, check_same_thread=False)
        try:
            rows = conn.execute("SELECT id, text FROM chunks ORDER BY id").fetchall()
        finally:
            # Close the connection even when the query itself fails.
            conn.close()
    except Exception as e:
        print(f"\u26a0\ufe0f BM25 build failed: {e}")
        return

    if not rows:
        # An empty corpus cannot be indexed; skip instead of failing the load.
        print("\u26a0\ufe0f BM25 skipped \u2014 chunks table is empty.")
        return

    _bm25_ids = [r[0] for r in rows]
    # Treat a NULL text column as an empty document rather than crashing.
    _bm25_index = BM25Okapi([(r[1] or "").split() for r in rows])
    _bm25_vocab = {}
    _bm25_type = "rank-bm25"
    print(f"\u2705 BM25 (rank-bm25) built in {time.time()-t0:.1f}s")
def _build_source_facets() -> None:
    """Rebuild the source facet list from loaded metadata plus known sources.

    The result is ``["All Sources"]`` followed by the sorted union of every
    non-empty ``display_name`` in ``_metadata_store`` and ``KNOWN_SOURCES``.
    The global is swapped while holding ``_source_facets_lock``.
    """
    global _source_facets
    from_metadata = {
        entry.get("display_name", "")
        for entry in _metadata_store
        if entry.get("display_name", "")
    }
    merged = from_metadata.union(KNOWN_SOURCES)
    with _source_facets_lock:
        _source_facets = ["All Sources", *sorted(merged)]
    print(f"\u2705 Source facets ready \u2014 {len(_source_facets)-1} sources")
def _load_index_background() -> None:
    """Load every index component and record the outcome in ``_index_status``.

    Steps, in order: embedding model, locating (or downloading) the index
    directory, FAISS index, metadata JSON, optional SQLite chunk store, BM25
    index, source facets. ``_index_status["progress"]`` is updated along the
    way. On success the state becomes ``"ready"``; on any exception it becomes
    ``"error"`` with the message stored and the traceback printed. In both
    cases ``_index_ready`` is set afterwards.
    """
    global _vector_index, _metadata_store, _sqlite_path
    hub_token = os.environ.get("HF_TOKEN", "")
    _index_status["state"] = "loading"
    try:
        _index_status["progress"] = "Loading embedding model..."
        _init_embedder()

        # Prefer an index already on disk; download only when none is found.
        index_dir = _resolve_index_dir()
        if index_dir is None:
            index_dir = _download_from_hub(hub_token)

        _index_status["progress"] = "Loading FAISS index..."
        print("\U0001f4e6 Loading FAISS index...")
        _vector_index = faiss.read_index(str(index_dir / INDEX_FILE))

        _index_status["progress"] = "Loading metadata..."
        with open(index_dir / META_FILE, "r", encoding="utf-8") as meta_fh:
            _metadata_store = json.load(meta_fh)

        sqlite_file = index_dir / SQLITE_FILE
        if not sqlite_file.exists():
            _sqlite_path = None
            print("\u26a0\ufe0f doc_store.sqlite not found")
        else:
            _sqlite_path = str(sqlite_file)
            print(f"\u2705 SQLite ready: {_sqlite_path}")

        _build_bm25_index(index_dir)
        _build_source_facets()

        chunk_count = len(_metadata_store)
        gc.collect()
        print(f"\u2705 All indexes ready \u2014 {chunk_count:,} chunks (FREE_TIER={FREE_TIER}, BM25={_bm25_type})")
        _index_status["state"] = "ready"
        _index_status["progress"] = f"{chunk_count:,} chunks indexed"
        _index_ready.set()
    except Exception as failure:
        _index_status["state"] = "error"
        _index_status["error"] = str(failure)
        print(f"\u274c Index load failed: {failure}")
        import traceback
        traceback.print_exc()
        # The event is set on failure too; the status dict carries the error.
        _index_ready.set()
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 9. SQLITE ACCESS | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _fetch_chunks_batch(ids: List[int]) -> Dict[int, str]:
    """Fetch chunk texts for *ids* from the SQLite document store.

    Args:
        ids: chunk ids to look up; ids with no row are simply absent from the
            result.

    Returns:
        Mapping of chunk id to text. Empty when *ids* is empty, when no SQLite
        store is configured, or when any database error occurs (the error is
        printed, not raised).
    """
    if not ids or _sqlite_path is None:
        return {}
    # SQLite limits the number of bound parameters per statement (999 by
    # default in older builds), so the IN (...) list is sent in batches.
    batch_size = 900
    found: Dict[int, str] = {}
    try:
        conn = sqlite3.connect(_sqlite_path, check_same_thread=False)
        try:
            for start in range(0, len(ids), batch_size):
                batch = list(ids[start:start + batch_size])
                ph = ",".join("?" * len(batch))
                rows = conn.execute(
                    f"SELECT id, text FROM chunks WHERE id IN ({ph})", batch
                ).fetchall()
                found.update((r[0], r[1]) for r in rows)
        finally:
            # Always release the connection, including when a query fails.
            conn.close()
    except Exception as e:
        print(f"\u26a0\ufe0f SQLite fetch: {e}")
        return {}
    return found
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 10. EMBEDDING | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _embed(query: str) -> np.ndarray:
    """Return the normalised float32 embedding of *query*, using the LRU cache.

    On a cache miss the query is encoded as a one-element batch and the
    resulting array is stored in the cache before being returned.
    """
    vector = _embed_cache.get(query)
    if vector is None:
        vector = _embedder.encode(
            [query], convert_to_numpy=True, normalize_embeddings=True
        ).astype(np.float32)
        _embed_cache.put(query, vector)
    return vector
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 11. BM25 SEARCH | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _bm25_search(
    query: str, category_filter: str, source_filter: str, k: int = RESEARCH_BM25_K,
) -> List[Dict]:
    """Lexical search over the BM25 index, filtered by category and source.

    Works with either backend recorded in ``_bm25_type``: a ``bm25s``
    retriever or a ``rank-bm25`` index. Only positively scored documents whose
    chunk id is inside ``_metadata_store`` and whose metadata passes both
    filters are returned, as ``{"idx", "score", "meta"}`` dicts in descending
    score order as produced by the backend.

    Returns an empty list when no BM25 index is loaded or on any error (the
    error is printed).
    """
    if _bm25_index is None or not _bm25_ids:
        return []

    def _hit_for(position, raw_score) -> Optional[Dict]:
        # Map a BM25 result position to a hit dict, or None if filtered out.
        chunk_id = _bm25_ids[int(position)]
        if chunk_id >= len(_metadata_store):
            return None
        meta = _metadata_store[chunk_id]
        if category_filter != "All" and meta.get("category") != category_filter:
            return None
        if source_filter != "All Sources" and meta.get("display_name") != source_filter:
            return None
        return {"idx": chunk_id, "score": float(raw_score), "meta": meta}

    try:
        found: List[Dict] = []

        if _bm25_type == "bm25s":
            query_tokens = bm25s_lib.tokenize(
                [query], stopwords=None, stemmer=None, show_progress=False,
            )
            limit = min(k, len(_bm25_ids))
            results, scores = _bm25_index.retrieve(
                query_tokens, k=limit, show_progress=False,
            )
            for position, score in zip(results[0], scores[0]):
                if score <= 0:
                    continue
                hit = _hit_for(position, score)
                if hit is not None:
                    found.append(hit)
            return found

        # rank-bm25 backend: score every document, then take the top k.
        all_scores = _bm25_index.get_scores(query.split())
        if len(all_scores) <= k:
            ranked = np.argsort(all_scores)[::-1]
        else:
            ranked = np.argpartition(all_scores, -k)[-k:]
            ranked = ranked[np.argsort(all_scores[ranked])[::-1]]
        for position in ranked:
            # Positions are in descending score order, so the first
            # non-positive score ends the useful results.
            if all_scores[position] <= 0:
                break
            hit = _hit_for(position, all_scores[position])
            if hit is not None:
                found.append(hit)
        return found
    except Exception as err:
        print(f"\u26a0\ufe0f BM25 search: {err}")
        return []
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 12. FAISS VECTOR SEARCH | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _faiss_search(
    query_vec: np.ndarray, category_filter: str, source_filter: str, k: int,
) -> List[Dict]:
    """Vector search over the FAISS index, filtered by category and source.

    At most ``min(k, index size)`` neighbours are requested. Negative ids are
    dropped. An id beyond ``_metadata_store`` gets empty metadata, so it
    survives only when both filters are at their "All" settings.

    Returns:
        ``{"idx", "score", "meta"}`` dicts in the order FAISS returned them.
    """
    limit = min(k, _vector_index.ntotal)
    scores, indices = _vector_index.search(query_vec, limit)

    any_category = category_filter == "All"
    any_source = source_filter == "All Sources"
    store_size = len(_metadata_store)

    found: List[Dict] = []
    for chunk_id, score in zip(indices[0], scores[0]):
        if chunk_id < 0:
            continue
        meta = _metadata_store[chunk_id] if chunk_id < store_size else {}
        if not any_category and meta.get("category") != category_filter:
            continue
        if not any_source and meta.get("display_name") != source_filter:
            continue
        found.append({"idx": int(chunk_id), "score": float(score), "meta": meta})
    return found
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 13. RECIPROCAL RANK FUSION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _rrf_fuse(list_a: List[Dict], list_b: List[Dict], k: int = RRF_K) -> List[Dict]:
    """Merge two ranked hit lists with reciprocal rank fusion.

    Each occurrence of a chunk at 1-based rank ``r`` adds ``1 / (k + r)`` to
    that chunk's fused score. The output holds one entry per distinct chunk
    id, sorted by fused score (highest first), with the metadata of the
    chunk's last occurrence across ``list_a`` then ``list_b``.
    """
    fused_scores: Dict[int, float] = {}
    meta_by_idx: Dict[int, Dict] = {}
    for ranking in (list_a, list_b):
        for rank, hit in enumerate(ranking, 1):
            chunk_id = hit["idx"]
            fused_scores[chunk_id] = fused_scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
            meta_by_idx[chunk_id] = hit["meta"]

    merged = [
        {"idx": chunk_id, "score": total, "meta": meta_by_idx[chunk_id]}
        for chunk_id, total in fused_scores.items()
    ]
    merged.sort(key=lambda entry: entry["score"], reverse=True)
    return merged
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 14. MULTI-QUERY DECOMPOSITION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Whole-word, case-insensitive connectives on which _multi_query_search()
# splits a query into sub-queries.
_SPLIT_RE = re.compile(
    r"\b(?:and|vs\.?|versus|compared to|in relation to|"
    r"as well as|along with|both|also|additionally)\b",
    re.IGNORECASE,
)
def _multi_query_search(
    query: str, category_filter: str, source_filter: str,
    k_per_subquery: int = 200,
) -> List[Dict]:
    """Hybrid (FAISS + BM25) search with optional sub-query decomposition.

    The query is split on the connectives in ``_SPLIT_RE``; fragments no longer
    than ``MIN_QUERY_LEN`` characters are discarded. With zero or one fragment
    left, the whole original query is searched. Otherwise up to
    ``MAX_SUBQUERIES`` fragments are each expanded, searched with both
    backends, fused with RRF, and the per-fragment rankings are folded
    together with RRF in order.
    """
    parts = [
        piece.strip()
        for piece in _SPLIT_RE.split(query)
        if len(piece.strip()) > MIN_QUERY_LEN
    ]

    if len(parts) <= 1:
        # No real decomposition: search the query exactly as given.
        subqueries = [query]
    else:
        print(f"[multi-query] {len(parts)} sub-queries")
        subqueries = parts[:MAX_SUBQUERIES]

    combined: List[Dict] = []
    for subquery in subqueries:
        expanded = _expand_query(subquery)
        vector = _embed(expanded)
        dense = _faiss_search(vector, category_filter, source_filter, k_per_subquery)
        lexical = _bm25_search(expanded, category_filter, source_filter, k_per_subquery)
        ranking = _rrf_fuse(dense, lexical)
        combined = _rrf_fuse(combined, ranking) if combined else ranking
    return combined
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 15. MAXIMAL MARGINAL RELEVANCE | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _mmr_rerank(
    query_vec: np.ndarray, hits: List[Dict],
    top_k: int = MMR_TOP_K, lambda_: float = MMR_LAMBDA,
) -> List[Dict]:
    """Re-order *hits* with maximal marginal relevance (MMR).

    The first candidate is kept as-is; each following pick maximises
    ``lambda_ * dot(query, doc) - (1 - lambda_) * max dot(doc, picked)``,
    using vectors reconstructed from the FAISS index. Hits whose id is not
    below the index size are dropped.

    Reranking is skipped (returning ``hits[:top_k]``) when ``FREE_TIER`` is
    set, when there are fewer than two hits, or when vector reconstruction
    fails.
    """
    if FREE_TIER or not hits or len(hits) <= 1:
        return hits[:top_k]

    try:
        vectors: Dict[int, np.ndarray] = {}
        for hit in hits:
            chunk_id = hit["idx"]
            if chunk_id < _vector_index.ntotal:
                buffer = np.zeros(_vector_index.d, dtype=np.float32)
                _vector_index.reconstruct(chunk_id, buffer)
                vectors[chunk_id] = buffer
    except Exception as err:
        print(f"\u26a0\ufe0f MMR skipped: {err}")
        return hits[:top_k]

    query_flat = query_vec.flatten()
    pool = [hit for hit in hits if hit["idx"] in vectors]
    picked: List[Dict] = []
    while pool and len(picked) < top_k:
        # Default to the head of the pool; it is the answer for the first pick.
        choice = pool[0]
        if picked:
            picked_matrix = np.vstack([vectors[p["idx"]] for p in picked])
            top_value = -np.inf
            for candidate in pool:
                vec = vectors[candidate["idx"]]
                relevance = float(np.dot(query_flat, vec))
                redundancy = float(np.max(picked_matrix @ vec))
                value = lambda_ * relevance - (1.0 - lambda_) * redundancy
                if value > top_value:
                    top_value = value
                    choice = candidate
        picked.append(choice)
        pool.remove(choice)
    return picked
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 16. SCORE CALIBRATION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _calibrate_scores(hits: List[Dict]) -> List[Dict]:
    """Annotate each hit in place with a relative relevance percentage and tier.

    ``relevance_pct`` is the hit's score min-max scaled over this result set
    to 1..100. ``tier`` is chosen from the score's position relative to the
    set's 75th / 50th / 25th percentiles.

    When every score is identical (including a single hit) all hits get 100%
    and the top tier.

    Args:
        hits: dicts with a numeric ``"score"`` key; mutated in place.

    Returns:
        The same list object.
    """
    if not hits:
        return hits
    # float64 keeps the statistics comparable with the Python-float scores;
    # a float32 cast can round a score *up*, which pushed tied or lone hits
    # below their own percentiles.
    scores = np.array([h["score"] for h in hits], dtype=np.float64)
    max_s = float(scores.max())
    min_s = float(scores.min())
    p25, p50, p75 = (float(p) for p in np.percentile(scores, [25, 50, 75]))
    span = max_s - min_s
    for h in hits:
        s = h["score"]
        # With no spread there is nothing to scale against: report full relevance.
        pct = round(((s - min_s) / span) * 100) if span > 0 else 100
        h["relevance_pct"] = min(100, max(1, pct))
        h["tier"] = (
            "\U0001f7e2 Highly Relevant" if s >= p75 else
            "\U0001f535 Relevant" if s >= p50 else
            "\U0001f7e1 Contextual" if s >= p25 else
            "\u26aa Peripheral"
        )
    return hits
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 17. PER-SOURCE DIVERSITY CAP | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _apply_source_cap(hits: List[Dict], source_filter: str) -> List[Dict]:
    """Limit how many hits any single source contributes.

    When a specific source is selected the list is returned untouched.
    Otherwise hits are scanned in order and only the first ``PER_SOURCE_CAP``
    per ``display_name`` are kept (hits without one share the name
    ``"unknown"``).
    """
    if source_filter != "All Sources":
        return hits

    taken: Dict[str, int] = {}
    kept: List[Dict] = []
    for hit in hits:
        source = hit["meta"].get("display_name", "unknown")
        used = taken.get(source, 0)
        if used >= PER_SOURCE_CAP:
            continue
        taken[source] = used + 1
        kept.append(hit)
    return kept
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 18. KEYWORD-DENSITY SNIPPET | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _best_snippet_window(text: str, query: str, window_words: int = SNIPPET_WORDS) -> str:
    """Return the ``window_words``-word slice of *text* densest in query words.

    Candidate windows start every ``SNIPPET_STEP`` words, and the final
    possible window (the one ending on the last word) is always considered so
    matches at the very end of the text are not missed. A word counts when its
    lower-cased form equals a whitespace-separated token of the lower-cased
    query. Ties go to the earliest window.

    Texts of at most ``window_words`` words are returned unchanged; otherwise
    "..." marks each side that was cut.
    """
    query_tokens = set(query.lower().split())
    words = text.split()
    if len(words) <= window_words:
        return text

    last_start = len(words) - window_words
    # range() excludes its stop value, so extend it by one and make sure the
    # tail window is present even when the stride would step over it.
    starts = list(range(0, last_start + 1, SNIPPET_STEP))
    if starts[-1] != last_start:
        starts.append(last_start)

    # Compute the per-word match flags once instead of once per window.
    is_match = [w.lower() in query_tokens for w in words]
    best_start = max(starts, key=lambda start: sum(is_match[start:start + window_words]))

    snippet = " ".join(words[best_start:best_start + window_words])
    if best_start > 0:
        snippet = "..." + snippet
    if best_start + window_words < len(words):
        snippet += "..."
    return snippet
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 19. ਅੰਗ DEEP-LINK | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _ang_deep_link(text: str, category: str) -> Optional[str]:
    """Build a SikhiToTheMax link for the first ang reference found in *text*.

    Only Gurbani passages (category given in English or Gurmukhi) are linked.
    Returns None for other categories or when the text has no ang number.
    """
    if category != "Gurbani" and category != "\u0a17\u0a41\u0a30\u0a2c\u0a3e\u0a23\u0a40":
        return None
    match = _ANG_IN_TEXT_RE.search(text)
    if match is None:
        return None
    return f"https://www.sikhitothemax.org/ang?ang={match.group(1)}&source=G"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 20. STRUCTURED PASSAGE EXTRACTOR | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Field extractors for chunk text that embeds dict-like records, e.g.
# 'gurmukhi': '...', "translation": "...". Keys and values may use either
# quote style. The quoted-value patterns capture the value (at least 2
# characters, containing no quote character).
_RE_GURMUKHI_F = re.compile(r"['\"]gurmukhi['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
_RE_PRONUN = re.compile(r"['\"]pronunciation['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
_RE_TRANSLATION = re.compile(r"['\"]translation['\"]:\s*['\"]([^'\"]{2,}?)['\"]")
# Explanations may contain quotes, so the value (at least 10 characters) runs
# to the first quote that is followed by "," or "}".
_RE_EXPLANATION = re.compile(r"['\"]explanation['\"]:\s*['\"](.{10,}?)['\"](?:\s*[,}])")
# The ang field is an unquoted integer.
_RE_ANG_FIELD = re.compile(r"['\"]ang['\"]:\s*(\d+)")
def _extract_structured_passage(text: str, query: str) -> Dict:
    """Pull the most query-relevant structured line out of a chunk.

    Chunks that contain ``gurmukhi`` fields are treated as a sequence of
    records; the record whose gurmukhi / pronunciation / translation /
    explanation text contains the most query tokens (substring match, earliest
    wins ties) is returned as a ``{"type": "structured", ...}`` dict, with the
    explanation capped at 500 characters and a detected language per field.

    Anything else, including chunks where extraction fails, is returned as
    ``{"type": "plain", "snippet": ...}``.
    """

    def _plain() -> Dict:
        return {"type": "plain", "snippet": _best_snippet_window(text, query)}

    def _at(values, position):
        # Field lists can be shorter than the gurmukhi list; pad with "".
        return values[position] if position < len(values) else ""

    if "'gurmukhi'" not in text and '"gurmukhi"' not in text:
        return _plain()

    try:
        gurmukhi_lines = _RE_GURMUKHI_F.findall(text)
        pronunciations = _RE_PRONUN.findall(text)
        translations = _RE_TRANSLATION.findall(text)
        explanations = _RE_EXPLANATION.findall(text)
        angs = _RE_ANG_FIELD.findall(text)
        if not gurmukhi_lines:
            return _plain()

        total = len(gurmukhi_lines)
        wanted = set(query.lower().split())

        def _overlap(position: int) -> int:
            haystack = " ".join([
                gurmukhi_lines[position],
                _at(pronunciations, position),
                _at(translations, position),
                _at(explanations, position),
            ]).lower()
            return sum(1 for token in wanted if token in haystack)

        # max() keeps the first of equally scored records.
        chosen = max(range(total), key=_overlap)

        explanation = _at(explanations, chosen)
        if len(explanation) > 500:
            explanation = explanation[:497] + "..."
        translation = _at(translations, chosen)
        pronunciation = _at(pronunciations, chosen)

        return {
            "type": "structured",
            "gurmukhi": gurmukhi_lines[chosen],
            "pronunciation": pronunciation,
            "pronunciation_lang": _detect_language(pronunciation),
            "translation": translation,
            "translation_lang": _detect_language(translation),
            "explanation": explanation,
            "explanation_lang": _detect_language(explanation),
            "ang": _at(angs, chosen),
            "total_lines": total,
        }
    except Exception:
        return _plain()
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 21. ANALYTICS | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _log_query_async(
    query: str, category: str, mode: str,
    n_results: int, top_score: float, elapsed_ms: float,
) -> None:
    """Record one query in the analytics SQLite database on a background thread.

    Only the query's length is stored, never its text. Logging is best
    effort: any failure is printed and otherwise ignored so that analytics
    can never break a user request.

    Args:
        query: The query text (only ``len(query)`` is persisted).
        category: Category filter in effect for the query.
        mode: Mode label, e.g. "Research" or "Learn".
        n_results: Number of results returned to the user.
        top_score: Score of the top hit; stored rounded to 4 decimals.
        elapsed_ms: Handling time in milliseconds; stored rounded to 1 decimal.
    """
    def _write() -> None:
        try:
            ANALYTICS_DB.parent.mkdir(parents=True, exist_ok=True)
            conn = sqlite3.connect(str(ANALYTICS_DB), check_same_thread=False)
            try:
                conn.execute("""CREATE TABLE IF NOT EXISTS queries (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    ts TEXT NOT NULL, query_len INTEGER NOT NULL,
                    category TEXT NOT NULL, mode TEXT NOT NULL,
                    n_results INTEGER NOT NULL, top_score REAL NOT NULL,
                    elapsed_ms REAL NOT NULL)""")
                conn.execute(
                    "INSERT INTO queries VALUES (NULL,?,?,?,?,?,?,?)",
                    (
                        # The trailing "Z" claims UTC, so format gmtime()
                        # explicitly; strftime() alone would use local time.
                        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                        len(query), category, mode, n_results,
                        round(top_score, 4), round(elapsed_ms, 1),
                    ),
                )
                conn.commit()
            finally:
                # Always release the handle, even when a statement fails.
                conn.close()
        except Exception as exc:
            print(f"[analytics] query log skipped: {exc}")

    threading.Thread(target=_write, daemon=True).start()
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 22. RESEARCH MODE | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _research_field_label(base: str, lang: str) -> str:
    """Return ``base`` suffixed with ``(lang)``, or ``base`` alone when ``lang`` is empty."""
    return f"{base} ({lang})" if lang else base


def _research(query: str, category_filter: str, source_filter: str) -> Tuple[str, str]:
    """Run a Research-mode search and render the hits as Markdown and plain text.

    Pipeline: multi-query search -> per-source cap -> candidate selection ->
    MMR re-rank -> score calibration -> chunk fetch -> rendering. Hits at or
    above ``RELEVANCE_THRESHOLD`` are used only when there are at least
    ``MAX_RESEARCH_PASSAGES`` of them; otherwise the first
    ``MAX_RESEARCH_PASSAGES`` fused hits are used. One analytics row is
    logged asynchronously before returning.

    Hits whose chunk text cannot be fetched are skipped; the header counts
    and the passage numbering reflect only the passages actually rendered.

    Args:
        query: The (already sanitised) user query.
        category_filter: Category restriction forwarded to the search helpers.
        source_filter: Source restriction forwarded to the search helpers.

    Returns:
        ``(markdown, plain_text)``. When the search yields nothing, both
        elements are the same "No relevant sources found" message.
    """
    t_start = time.time()
    fused = _multi_query_search(
        query, category_filter, source_filter,
        k_per_subquery=max(RESEARCH_FAISS_K, RESEARCH_BM25_K),
    )
    total_candidates = len(fused)
    if not fused:
        msg = ("No relevant sources found.\n\n"
               "_Try broader keywords, remove filters, or check Gurmukhi spelling._")
        return msg, msg

    fused = _apply_source_cap(fused, source_filter)
    above = [h for h in fused if h["score"] >= RELEVANCE_THRESHOLD]
    candidates = above if len(above) >= MAX_RESEARCH_PASSAGES else fused[:MAX_RESEARCH_PASSAGES]
    anchor_vec = _embed(_expand_query(query))
    candidates = _mmr_rerank(anchor_vec, candidates, top_k=MAX_RESEARCH_PASSAGES)
    candidates = _calibrate_scores(candidates)
    chunk_txts = _fetch_chunks_batch([h["idx"] for h in candidates])
    accessed = time.strftime("%B %d, %Y")
    tier_note = "" if FREE_TIER else " \u00b7 MMR diversity"

    # Passage bodies are rendered first so the headers can report what was
    # actually shown rather than how many candidates were considered.
    md_body: List[str] = []
    plain_body: List[str] = []
    shown_sources: set = set()
    displayed = 0
    for h in candidates:
        text = chunk_txts.get(h["idx"], "")
        if not text:
            # No stored text for this hit: neither numbered nor counted.
            continue
        displayed += 1
        meta = h["meta"]
        display = meta.get("display_name", meta.get("file", "Unknown"))
        category = meta.get("category", "General")
        cat_disp = meta.get("category_display", category)
        section = meta.get("chunk_idx", 0) + 1
        relevance = h.get("relevance_pct", 0)
        tier = h.get("tier", "")
        shown_sources.add(display)

        passage = _extract_structured_passage(text, query)
        structured = passage["type"] == "structured"
        if structured and passage.get("ang"):
            ang_link = f"https://www.sikhitothemax.org/ang?ang={passage['ang']}&source=G"
        else:
            ang_link = _ang_deep_link(text, category)

        chicago = (
            f'"{display}." In *SikhLibrary Digital Archive*, '
            f"section {section}, category: {cat_disp}. "
            f"Hugging Face Datasets. Accessed {accessed}. "
            f"https://huggingface.co/datasets/jsdosanj/SikhLibrary."
        )

        md_body += [
            f"### [{displayed}] {display} \u2014 Section {section}",
            "| | |", "|---|---|",
            f"| **Source** | {display} |",
            f"| **Category** | {cat_disp} |",
            f"| **Section** | {section} |",
            f"| **Relevance** | {relevance}% |",
            f"| **Signal** | {tier} |", "",
        ]
        plain_body += [
            f"[{displayed}] {display} \u2014 Section {section}",
            f" Source : {display}",
            f" Category : {cat_disp}",
            f" Relevance : {relevance}%",
            f" Signal : {tier}",
        ]

        if ang_link:
            # Prefer the structured ang field; fall back to the first ang
            # reference found in the raw text.
            ang_num = passage.get("ang", "")
            if not ang_num:
                m = _ANG_IN_TEXT_RE.search(text)
                ang_num = m.group(1) if m else ""
            link_label = (f"View \u0a05\u0a70\u0a17 {ang_num} on SikhiToTheMax"
                          if ang_num else "View on SikhiToTheMax")
            md_body += [f"[{link_label}]({ang_link})", ""]
            plain_body += [f" SikhiToTheMax: {ang_link}"]

        if structured:
            if passage["gurmukhi"]:
                md_body += ["**Gurmukhi (Punjabi \u2014 Gurmukhi script):**",
                            f"> {passage['gurmukhi']}", ""]
                plain_body += [f" Gurmukhi: {passage['gurmukhi']}"]
            for field in ("pronunciation", "translation", "explanation"):
                value = passage[field]
                if not value:
                    continue
                # Same label in both outputs; no empty "()" when the
                # language could not be detected.
                lbl = _research_field_label(field.capitalize(), passage[f"{field}_lang"])
                md_body += [f"**{lbl}:**", f"> {value}", ""]
                plain_body += [f" {lbl}: {value}"]
            if passage.get("total_lines", 1) > 1:
                md_body += [f"*({passage['total_lines']} lines in this section)*", ""]
        else:
            lbl = _research_field_label("Excerpt", _detect_language(passage["snippet"]))
            md_body += [f"**{lbl}:**", f"> {passage['snippet']}", ""]
            plain_body += [f" {lbl}: {passage['snippet']}"]

        md_body += ["**Chicago Manual of Style (17th ed.):**",
                    f"> {chicago}", "", "---", ""]
        plain_body += [f" {chicago.replace('*', '')}", ""]

    unique_sources = len(shown_sources)
    md: List[str] = [
        "## Research Results",
        (f"*Showing **{displayed}** passages from **{unique_sources}** sources "
         f"\u00b7 {total_candidates:,} candidates \u00b7 hybrid BM25 + vector{tier_note}*"),
        "",
        ("> **Relevance %** = how this passage ranks within these results "
         "(100% = most relevant passage in this query). "
         "**Signal** = percentile tier."),
        "", "---", "",
    ] + md_body
    plain: List[str] = [
        "RESEARCH RESULTS \u2014 SikhLibrary Digital Archive",
        "=" * 65,
        f"Passages shown : {displayed}",
        f"Unique sources : {unique_sources}",
        f"Total candidates: {total_candidates:,}",
        f"Accessed : {accessed}",
        "=" * 65, "",
    ] + plain_body + [
        "=" * 65,
        "Archive: https://huggingface.co/datasets/jsdosanj/SikhLibrary",
    ]

    elapsed_ms = (time.time() - t_start) * 1000
    _log_query_async(query, category_filter, "Research",
                     displayed, candidates[0]["score"] if candidates else 0.0, elapsed_ms)
    print(f"[research] {displayed} passages / {unique_sources} sources "
          f"in {elapsed_ms:.0f}ms BM25={_bm25_type}")
    return "\n".join(md), "\n".join(plain)
def _make_download_file(plain_text: str) -> Optional[str]:
    """Write ``plain_text`` to a persistent UTF-8 temp file for download.

    Args:
        plain_text: Plain-text research output. ``None``, empty, or
            whitespace-only input produces no file.

    Returns:
        Path of the created ``SikhLibrarian_research_*.txt`` file, or
        ``None`` when there is nothing to write or writing fails. The file
        is created with ``delete=False``, so it outlives this call.
    """
    if not plain_text or not plain_text.strip():
        return None
    path: Optional[str] = None
    try:
        # The context manager closes the handle even when write() raises.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8",
            prefix="SikhLibrarian_research_",
        ) as tmp:
            path = tmp.name
            tmp.write(plain_text)
        return path
    except (OSError, UnicodeError) as e:
        print(f"\u26a0\ufe0f Temp file: {e}")
        if path is not None:
            # Do not leave a partially written file behind.
            try:
                os.unlink(path)
            except OSError:
                pass
        return None
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 23. HALLUCINATION VERIFICATION | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _verify_ang_references(response: str, all_context: str) -> str:
    """Strike through ਅੰਗ references whose number is absent from the context.

    Every match of ``_ANG_FULL_RE`` in ``response`` is replaced by its first
    group. If ``_ANG_NUM_RE`` finds a number in that reference and the number
    does not occur in ``all_context`` as a whole number, the reference is
    wrapped in ``~~...~~`` and flagged with a warning sign.

    Args:
        response: Text produced by the LLM.
        all_context: Concatenated text of the retrieved chunks.

    Returns:
        ``response`` with unverified references struck through, plus an
        explanatory footer when anything was changed.
    """
    # Compare whole numbers, not substrings: a plain `"12" in all_context`
    # would be satisfied by an unrelated "1200" or "2012". int() also
    # normalises digit scripts and leading zeros. Very long digit runs are
    # skipped; they cannot be ang numbers and int() may reject them.
    context_numbers = {
        int(tok) for tok in re.findall(r"\d+", all_context) if len(tok) <= 9
    }

    def _is_grounded(num: str) -> bool:
        try:
            return int(num) in context_numbers
        except ValueError:
            # The capture is not a plain integer; fall back to a literal lookup.
            return num in all_context

    def _check(match: re.Match) -> str:
        ref = match.group(1)
        num_m = _ANG_NUM_RE.search(ref)
        if not num_m:
            return ref
        return ref if _is_grounded(num_m.group(1)) else f"~~{ref}~~ \u26a0\ufe0f"

    verified = _ANG_FULL_RE.sub(_check, response)
    if verified != response:
        verified += ("\n\n---\n> \u26a0\ufe0f *References shown as ~~struck through~~ could not "
                     "be verified in the retrieved context. Please cross-check.*")
    return verified
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 24. LEARN MODE | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# System prompt for Learn mode. It is a str.format() template with two
# placeholders, {context} and {citation_block}, filled in by submit_query.
# The text is sent to the model verbatim, so every character here is
# runtime behaviour.
_LEARN_SYSTEM_TEMPLATE = """\
You are the AI Sikh Librarian — a world-class scholar of Sikh theology, history, and literature.
Write at PhD academic depth in clear English that a motivated high-school student can follow.
MANDATORY LANGUAGE RULES:
- Main body: English only.
- All key concepts / theological terms: Punjabi Unicode ONLY.
Examples: ਸੰਤ-ਸਿਪਾਹੀ, ਨਾਮ ਸਿਮਰਨ, ਚੜ੍ਹਦੀ ਕਲਾ, ਵਾਹਿਗੁਰੂ, ਅਕਾਲ ਪੁਰਖ
NEVER write English transliterations like "Waheguru", "Nam Simran", "Chardi Kala".
- Cite Gurbani by ਅੰਗ number ONLY if that exact number appears verbatim in CONTEXT.
ABSOLUTE PROHIBITIONS:
- Do NOT quote any Gurbani not present verbatim in CONTEXT.
- Do NOT fabricate ਅੰਗ numbers.
- Do NOT cite sources not in CONTEXT.
RESPONSE STRUCTURE:
1. Introduction — 1-2 paragraphs
2. Deep Analysis — cite every claim [Source N]; Punjabi Unicode for all key terms
3. Cross-Source Synthesis
4. Works Cited — Chicago 17th ed. (only sources listed below)
RETRIEVED CONTEXT:
{context}
SOURCE METADATA:
{citation_block}"""
def _build_learn_context(hits: List[Dict], chunk_texts: Dict[int, str]) -> Tuple[str, str]:
    """Assemble the context and citation strings for the Learn-mode prompt.

    Each hit with non-empty chunk text contributes a
    ``[Source N: name, Section S]`` block holding at most
    ``MAX_CONTEXT_WORDS`` words. ``N`` is the hit's 1-based position in
    ``hits``, so skipped hits leave gaps. A citation line is emitted only
    the first time a display name is seen.

    Returns:
        ``(context, citations)``: context blocks joined by a ``---``
        separator, citation lines joined by newlines.
    """
    today = time.strftime("%B %d, %Y")
    context_blocks: List[str] = []
    citations: List[str] = []
    cited: set = set()

    for rank, hit in enumerate(hits, 1):
        info = hit["meta"]
        body = chunk_texts.get(hit["idx"], "").strip()
        if body:
            words = body.split()
            clipped = " ".join(words[:MAX_CONTEXT_WORDS])
            name = info.get("display_name", info.get("file", "Unknown"))
            cat_label = info.get("category_display", info.get("category", "General"))
            sect = info.get("chunk_idx", 0) + 1
            context_blocks.append(f"[Source {rank}: {name}, Section {sect}]\n{clipped}")
            if name in cited:
                continue
            cited.add(name)
            citations.append(
                f'[Source {rank}] "{name}." In *SikhLibrary Digital Archive*, '
                f"section {sect}, category: {cat_label}. Hugging Face Datasets. "
                f"Accessed {today}. https://huggingface.co/datasets/jsdosanj/SikhLibrary."
            )

    context = "\n\n---\n\n".join(context_blocks)
    return context, "\n".join(citations)
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 25. LLM STREAMING WITH RETRY | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _is_retryable(err: str) -> bool:
    """Decide from an error message whether the LLM call is worth retrying.

    A message containing "402" is never retryable. Otherwise the call is
    retryable when the message contains "429", "503" or "502", or, ignoring
    case, "rate", "overloaded", "timeout" or "connection". All checks are
    plain substring tests.
    """
    if "402" in err:
        return False
    if any(code in err for code in ("429", "503", "502")):
        return True
    lowered = err.lower()
    return any(hint in lowered for hint in ("rate", "overloaded", "timeout", "connection"))
def _stream_with_retry(messages: List[Dict], history: list) -> Generator:
    """Stream a chat completion into ``history[-1]``, retrying transient errors.

    Yields ``(history, done)`` tuples. ``history[-1]["content"]`` is
    overwritten in place with the text received so far, with a "retrying"
    notice between attempts, or with a final error message. ``done`` is
    ``True`` on the last tuple only.

    Up to ``LLM_MAX_RETRIES + 1`` attempts are made; before each retry the
    generator sleeps for the matching entry of ``LLM_RETRY_DELAYS`` (the last
    entry is reused once the list is exhausted). If an exception arrives
    after some text was already streamed, that partial text is kept and the
    stream is reported as done.
    """
    last_err = ""
    for attempt in range(LLM_MAX_RETRIES + 1):
        partial = ""
        if attempt:
            delay = LLM_RETRY_DELAYS[min(attempt - 1, len(LLM_RETRY_DELAYS) - 1)]
            history[-1]["content"] = (
                f"\u23f3 **API busy** (attempt {attempt}/{LLM_MAX_RETRIES}) \u2014 "
                f"retrying in {delay}s...\n\n"
                "*Qwen2.5-72B free tier has limited concurrency.*"
            )
            yield history, False
            time.sleep(delay)
        try:
            stream = _llm_client.chat_completion(
                messages=messages, max_tokens=LLM_MAX_TOKENS,
                stream=True, temperature=LLM_TEMPERATURE, top_p=LLM_TOP_P,
            )
            for chunk in stream:
                if not chunk.choices:
                    continue
                piece = chunk.choices[0].delta.content
                if not piece:
                    continue
                partial += piece
                history[-1]["content"] = partial
                yield history, False
            yield history, True
            return
        except Exception as exc:
            failure = str(exc)
            if partial.strip():
                # Text already reached the user: keep it rather than retry.
                print(f"Post-stream exception (response preserved): {failure}")
                yield history, True
                return
            last_err = failure
            print(f"LLM attempt {attempt+1} failed: {failure}")
            if not _is_retryable(failure) or attempt >= LLM_MAX_RETRIES:
                break

    # Every attempt failed without output: map the last error to a user message.
    if "402" in last_err:
        msg = ("\U0001f4b3 **Learn mode temporarily unavailable** \u2014 monthly credits used up.\n\n"
               "**Research mode** is fully available for instant citations.")
    elif "429" in last_err or "rate" in last_err.lower():
        msg = (f"\u23f3 **HF Inference API overloaded.** Tried {LLM_MAX_RETRIES+1} times. "
               "Please wait 1-2 min or switch to **Research mode**.")
    elif "401" in last_err or "403" in last_err:
        msg = "\U0001f511 **Auth error.** Check `HF_TOKEN` in Space Settings > Secrets."
    else:
        msg = "\u26a0\ufe0f **LLM unavailable after retries.** Try again or use **Research mode**."
        print(f"LLM final error: {last_err}")
    history[-1]["content"] = msg
    yield history, True
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 26. GRADIO EVENT HANDLERS | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _not_ready_msg() -> str:
    """Return the Markdown notice shown while the index is still loading.

    The current ``state`` and ``progress`` entries of ``_index_status`` are
    embedded, defaulting to "loading" and "..." when absent.
    """
    state = _index_status.get('state', 'loading')
    progress = _index_status.get('progress', '...')
    parts = [
        "\u23f3 **Index still loading** (happens once on first startup).\n\n",
        f"**Status:** `{state}`\n",
        f"**Progress:** {progress}\n\n",
        "Please try again in a moment.",
    ]
    return "".join(parts)
def get_source_choices() -> gr.update:
    """Return a component update listing the available source filters.

    Waits up to 300 seconds on ``_index_ready`` (the wait result is not
    checked), then copies ``_source_facets`` while holding its lock. The
    selected value is reset to "All Sources".
    """
    _index_ready.wait(timeout=300)
    with _source_facets_lock:
        # Copy under the lock so the returned list is a private snapshot.
        snapshot = [*_source_facets]
    return gr.update(value="All Sources", choices=snapshot)
def submit_query(
    message: str, history: list, mode: str, category: str, source_filter: str,
) -> Generator:
    """Handle one submitted message in Research or Learn mode.

    Yields ``(history, plain_text, download_update)`` tuples. ``history`` is
    a list of ``{"role", "content"}`` dicts; the caller's list is never
    mutated, a new list is built instead.

    Flow:
        1. Reject queries that are too short, or that arrive while the
           index is loading or failed.
        2. Research mode: run ``_research`` and offer the plain-text result
           as a download.
        3. Learn mode: retrieve context, stream the LLM answer, then strike
           through ਅੰਗ references that the context does not support.

    Args:
        message: Raw user input, echoed into the chat unchanged.
        history: Chat history so far; ``None`` is treated as empty.
        mode: "Research" selects Research mode, anything else Learn mode.
        category: Category filter forwarded to retrieval.
        source_filter: Source filter forwarded to retrieval.
    """
    t_start = time.time()
    history = list(history or [])
    clean = _sanitize(message)
    user_turn = {"role": "user", "content": message}

    def _assistant(text: str) -> Dict:
        return {"role": "assistant", "content": text}

    # Pre-flight checks: each one answers immediately and ends the request.
    rejection: Optional[str] = None
    if len(clean) < MIN_QUERY_LEN:
        rejection = f"Please enter at least {MIN_QUERY_LEN} characters."
    elif not _index_ready.is_set():
        rejection = _not_ready_msg()
    elif _index_status["state"] == "error":
        rejection = "\u274c **Index unavailable** \u2014 check Space logs."
    if rejection is not None:
        yield history + [user_turn, _assistant(rejection)], "", gr.update(visible=False)
        return

    # Show the user's message right away, before any slow work starts.
    history = history + [user_turn]
    yield history, "", gr.update(visible=False)

    if mode == "Research":
        try:
            md_out, plain_out = _research(clean, category, source_filter)
        except Exception:
            import traceback; traceback.print_exc()
            md_out, plain_out = "\u26a0\ufe0f Retrieval error \u2014 please try again.", ""
        history = history + [_assistant(md_out)]
        dl_path = _make_download_file(plain_out)
        yield history, plain_out, gr.update(visible=bool(dl_path), value=dl_path)
        return

    if _sqlite_path is None:
        history = history + [_assistant(
            "\u26a0\ufe0f **Learn mode requires doc_store.sqlite** \u2014 not found.\n\n"
            "**Research mode** is fully available.")]
        yield history, "", gr.update(visible=False)
        return

    try:
        expanded = _expand_query(clean)
        q_vec = _embed(expanded)
        hits = _faiss_search(q_vec, category, source_filter, LEARN_FAISS_K)[:LEARN_MODE_K]
        chunk_txts = _fetch_chunks_batch([h["idx"] for h in hits])
        ctx_str, cite_block = _build_learn_context(hits, chunk_txts)
        all_context = " ".join(chunk_txts.values())
    except Exception:
        import traceback; traceback.print_exc()
        history = history + [_assistant("\u26a0\ufe0f Retrieval error \u2014 please try again.")]
        yield history, "", gr.update(visible=False)
        return

    if not ctx_str.strip():
        history = history + [_assistant(
            "No relevant sources found.\n\n"
            "_Try different keywords or switch to **Research mode**._")]
        yield history, "", gr.update(visible=False)
        return

    system_msg = _LEARN_SYSTEM_TEMPLATE.format(context=ctx_str, citation_block=cite_block)
    # Earlier turns only (the current user turn is re-added in sanitised
    # form), limited to the six most recent messages.
    prior = history[:-1]
    recent = prior[-6:] if len(prior) > 6 else prior
    messages = [{"role": "system", "content": system_msg}]
    messages.extend(recent)
    messages.append({"role": "user", "content": clean})

    history = history + [_assistant("\u23f3 Thinking...")]
    final_history = history
    for final_history, is_done in _stream_with_retry(messages, history):
        yield final_history, "", gr.update(visible=False)
        if is_done:
            break

    final_text = final_history[-1]["content"]
    # Status and error messages start with one of these emoji; only genuine
    # model output is checked for unsupported ਅੰਗ references.
    if final_text and not final_text.startswith(("\u23f3", "\u26a0\ufe0f", "\U0001f511", "\U0001f4b3")):
        final_history[-1]["content"] = _verify_ang_references(final_text, all_context)
    yield final_history, "", gr.update(visible=False)

    stats = _embed_cache.stats
    print(f"[cache] {stats['hit_rate']} \u00b7 {stats['size']}/{stats['capacity']}")
    # Log the real handling time, matching what Research mode records.
    elapsed_ms = (time.time() - t_start) * 1000
    _log_query_async(clean, category, "Learn",
                     len(hits), hits[0]["score"] if hits else 0.0, elapsed_ms)
def clear_conversation() -> Tuple:
    """Return the values that reset the conversation UI.

    The tuple holds an empty list, an empty string, and an update that hides
    a component and clears its value. Presumably these map to the chat
    history, the plain-text buffer and the download file; verify against the
    event wiring.
    """
    empty_history: list = []
    hidden_download = gr.update(visible=False, value=None)
    return empty_history, "", hidden_download
def toggle_mode_ui(mode: str) -> Tuple:
    """Return the four component updates for a mode switch.

    In order: a visibility update (shown only in Research mode), an update
    that hides a component and clears its value, a placeholder update for
    the input box, and a second visibility update (shown only in Research
    mode). Which components receive them is decided by the event wiring,
    which is not visible here.
    """
    research_on = mode == "Research"
    if research_on:
        hint = ("Search every passage across 758M+ words... "
                "(e.g. 'naam simran', 'seva', 'Khalsa', 'waheguru')")
    else:
        hint = "Message Sikh Librarian\u2026"
    first_toggle = gr.update(visible=research_on)
    hidden_cleared = gr.update(visible=False, value=None)
    input_hint = gr.update(placeholder=hint)
    second_toggle = gr.update(visible=research_on)
    return first_toggle, hidden_cleared, input_hint, second_toggle
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 27. DESIGN SYSTEM — "Midnight Archive" | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
| CUSTOM_CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap'); | |
| @keyframes ma-slide-up { | |
| from { opacity: 0; transform: translateY(12px) scale(0.97); } | |
| to { opacity: 1; transform: translateY(0) scale(1); } | |
| } | |
| @keyframes ma-pulse-glow { | |
| 0%, 100% { box-shadow: 0 0 6px rgba(52,199,89,0.45); } | |
| 50% { box-shadow: 0 0 14px rgba(52,199,89,0.80); } | |
| } | |
| :root { | |
| --ma-bg: #0b0b0f; | |
| --ma-surface: #13141a; | |
| --ma-surface2: #1a1b22; | |
| --ma-surface3: #202129; | |
| --ma-border: rgba(255,255,255,0.08); | |
| --ma-border-focus: rgba(0,122,255,0.70); | |
| --ma-separator: #2c2c2e; | |
| --ma-text: #e2e2e7; | |
| --ma-text2: #98989f; | |
| --ma-text3: #5c5c61; | |
| --ma-blue: #007AFF; | |
| --ma-blue-dim: rgba(0,122,255,0.15); | |
| --ma-bubble-user: #007AFF; | |
| --ma-bubble-bot: #1c1d22; | |
| --ma-glass-bg: rgba(30,31,40,0.75); | |
| --ma-glass-bd: rgba(255,255,255,0.10); | |
| --ma-hero-bg: linear-gradient(160deg,#07080e 0%,#0e1124 55%,#060810 100%); | |
| --ma-r: 24px; | |
| --ma-r-sm: 14px; | |
| --ma-r-xs: 8px; | |
| --ma-r-pill: 999px; | |
| --ma-shadow: 0 4px 32px rgba(0,0,0,0.70); | |
| --ma-shadow-hero: 0 8px 56px rgba(0,0,0,0.85); | |
| --ma-shadow-card: 0 2px 24px rgba(0,0,0,0.60); | |
| --ma-shadow-input: 0 10px 30px -10px rgba(0,0,0,0.60); | |
| --ma-shadow-btn: 0 4px 18px rgba(0,122,255,0.45); | |
| } | |
| /* hide legacy toggles / Gradio footer */ | |
| #theme-toggle,#sl-theme-btn,.show-api,.built-with, | |
| footer,.gradio-container>.footer,.svelte-1ipelgc { display:none !important; } | |
| html,body,.gradio-container,#root { | |
| background: var(--ma-bg) !important; | |
| color: var(--ma-text) !important; | |
| font-family: 'SF Pro Display','SF Pro Text',-apple-system,BlinkMacSystemFont, | |
| 'Inter','Helvetica Neue',Arial,sans-serif !important; | |
| font-size: 16px; line-height: 1.5; | |
| -webkit-font-smoothing: antialiased; | |
| } | |
| .gradio-container { | |
| max-width: 860px !important; | |
| margin: 0 auto !important; | |
| padding: 20px 16px 56px !important; | |
| } | |
| .gradio-container * { color: var(--ma-text) !important; } | |
| /* ── Hero ─────────────────────────────────────────────────────── */ | |
| .ma-hero { | |
| background: var(--ma-hero-bg) !important; | |
| border: 1px solid var(--ma-border); border-radius: var(--ma-r); | |
| padding: 52px 40px 40px; margin-bottom: 16px; | |
| text-align: center; box-shadow: var(--ma-shadow-hero); | |
| position: relative; overflow: hidden; | |
| } | |
| .ma-hero::before { | |
| content:''; position:absolute; inset:0; | |
| background: radial-gradient(ellipse 80% 50% at 50% -10%,rgba(0,122,255,0.22) 0%,transparent 65%); | |
| pointer-events: none; | |
| } | |
| .ma-hero h1 { | |
| font-size: 2.5rem !important; font-weight: 800 !important; | |
| letter-spacing: -0.03em !important; color: #ffffff !important; | |
| margin-bottom: 10px; position: relative; | |
| } | |
| .ma-desc { | |
| font-size: 0.97rem; color: rgba(226,226,231,0.65) !important; | |
| line-height: 1.75; margin-bottom: 26px; position: relative; | |
| } | |
| .ma-desc strong { color: #ffffff !important; } | |
| /* ── Badges ───────────────────────────────────────────────────── */ | |
| .ma-badges { display:flex; gap:8px; justify-content:center; flex-wrap:wrap; position:relative; } | |
| .ma-badge { | |
| display:inline-flex; align-items:center; gap:6px; | |
| padding:6px 16px; border-radius:var(--ma-r-pill); | |
| font-size:0.68rem; font-weight:700; text-transform:uppercase; letter-spacing:0.10em; | |
| border:1px solid; backdrop-filter:blur(8px); -webkit-backdrop-filter:blur(8px); | |
| } | |
| .ma-badge-online { | |
| background:rgba(52,199,89,0.14) !important; color:#34c759 !important; | |
| border-color:rgba(52,199,89,0.30); animation:ma-pulse-glow 2.8s ease-in-out infinite; | |
| } | |
| .ma-badge-learn { | |
| background:rgba(0,122,255,0.14) !important; color:#5ac8fa !important; | |
| border-color:rgba(0,122,255,0.28); | |
| } | |
| .ma-badge-research { | |
| background:rgba(255,159,10,0.12) !important; color:#ff9f0a !important; | |
| border-color:rgba(255,159,10,0.28); | |
| } | |
| /* ── Cards ────────────────────────────────────────────────────── */ | |
| .gradio-container .gradio-group, | |
| .gradio-container .gradio-box, | |
| .gradio-container .block, | |
| .gradio-container .panel, | |
| .gradio-container form { | |
| background:var(--ma-surface) !important; | |
| border:1px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r) !important; | |
| box-shadow:var(--ma-shadow-card) !important; | |
| transition:background 0.25s; | |
| } | |
| .gradio-container .gradio-group .gradio-group, | |
| .gradio-container .block .block { | |
| background:var(--ma-surface2) !important; border-radius:var(--ma-r-sm) !important; | |
| } | |
| /* ── Chatbot ──────────────────────────────────────────────────── */ | |
| .gradio-chatbot,.gradio-chatbot>div { | |
| background:var(--ma-surface) !important; | |
| border:1px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r) !important; | |
| box-shadow:var(--ma-shadow-card) !important; | |
| } | |
| .gradio-chatbot .message-wrap,.gradio-chatbot .messages { | |
| display:flex !important; flex-direction:column !important; | |
| gap:4px !important; padding:16px !important; | |
| } | |
| .gradio-chatbot .message,[data-testid="user"],[data-testid="bot"] { | |
| animation:ma-slide-up 0.28s cubic-bezier(0.34,1.20,0.64,1) both !important; | |
| border:none !important; padding:12px 18px !important; | |
| font-size:16px !important; line-height:1.45 !important; | |
| margin-bottom:2px !important; max-width:85% !important; | |
| position:relative; word-break:break-word; | |
| } | |
| .gradio-chatbot .message.user,[data-testid="user"] { | |
| background:var(--ma-bubble-user) !important; color:#ffffff !important; | |
| border-radius:20px 20px 4px 20px !important; | |
| align-self:flex-end !important; margin-left:auto !important; | |
| box-shadow:0 2px 14px rgba(0,122,255,0.40) !important; | |
| } | |
| .gradio-chatbot .message.user *,[data-testid="user"] * { color:#ffffff !important; } | |
| .gradio-chatbot .message.bot,[data-testid="bot"] { | |
| background:var(--ma-bubble-bot) !important; color:var(--ma-text) !important; | |
| border-radius:20px 20px 20px 4px !important; | |
| align-self:flex-start !important; margin-right:auto !important; | |
| border:1px solid var(--ma-border) !important; | |
| box-shadow:0 2px 12px rgba(0,0,0,0.35) !important; | |
| } | |
| .gradio-chatbot .message.bot *,[data-testid="bot"] * { color:var(--ma-text) !important; } | |
| /* blockquotes */ | |
| .gradio-chatbot .message.bot blockquote,[data-testid="bot"] blockquote { | |
| border-left:3px solid var(--ma-blue) !important; | |
| padding:10px 16px !important; margin:10px 0 !important; | |
| background:var(--ma-blue-dim) !important; | |
| border-radius:0 var(--ma-r-xs) var(--ma-r-xs) 0 !important; | |
| } | |
| .gradio-chatbot .message.bot blockquote *,[data-testid="bot"] blockquote * { | |
| color:var(--ma-text) !important; | |
| } | |
| /* tables — no vertical borders */ | |
| .gradio-chatbot .message.bot table,[data-testid="bot"] table { | |
| border-collapse:collapse !important; width:100% !important; | |
| margin:12px 0 !important; border:none !important; | |
| } | |
| .gradio-chatbot .message.bot th,[data-testid="bot"] th { | |
| background:var(--ma-surface3) !important; color:var(--ma-text2) !important; | |
| padding:9px 14px !important; font-weight:600 !important; font-size:0.78em !important; | |
| text-transform:uppercase; letter-spacing:0.07em; | |
| border:none !important; border-bottom:1px solid var(--ma-separator) !important; | |
| } | |
| .gradio-chatbot .message.bot td,[data-testid="bot"] td { | |
| background:transparent !important; color:var(--ma-text) !important; | |
| padding:9px 14px !important; | |
| border:none !important; border-bottom:1px solid var(--ma-separator) !important; | |
| font-size:0.93em !important; | |
| } | |
| .gradio-chatbot .message.bot tr:last-child td,[data-testid="bot"] tr:last-child td { | |
| border-bottom:none !important; | |
| } | |
| .gradio-chatbot .message.bot th *,.gradio-chatbot .message.bot td *, | |
| [data-testid="bot"] th *,[data-testid="bot"] td * { color:var(--ma-text) !important; } | |
| /* code */ | |
| .gradio-chatbot .message.bot code,[data-testid="bot"] code { | |
| background:var(--ma-surface3) !important; color:var(--ma-text) !important; | |
| padding:2px 7px !important; border-radius:var(--ma-r-xs) !important; | |
| font-size:0.85em !important; border:1px solid var(--ma-border) !important; | |
| } | |
| .gradio-chatbot .message.bot pre,[data-testid="bot"] pre { | |
| background:#0b0b0f !important; border:1px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r-sm) !important; padding:16px !important; margin:10px 0 !important; | |
| } | |
| .gradio-chatbot .message.bot pre code,[data-testid="bot"] pre code { | |
| background:transparent !important; border:none !important; | |
| } | |
| .gradio-chatbot .message.bot a,[data-testid="bot"] a { color:#5ac8fa !important; } | |
| .gradio-chatbot .message.bot hr,[data-testid="bot"] hr { | |
| border:none !important; border-top:1px solid var(--ma-separator) !important; margin:14px 0 !important; | |
| } | |
| /* ── Input bar ────────────────────────────────────────────────── */ | |
| textarea,input[type="text"],input[type="search"] { | |
| background:var(--ma-surface2) !important; | |
| border:1.5px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r-pill) !important; | |
| color:var(--ma-text) !important; | |
| font-family:inherit !important; font-size:16px !important; padding:14px 24px !important; | |
| caret-color:var(--ma-blue) !important; | |
| box-shadow:var(--ma-shadow-input) !important; | |
| transition:border-color 0.25s cubic-bezier(0.4,0,0.2,1), | |
| box-shadow 0.25s cubic-bezier(0.4,0,0.2,1) !important; | |
| } | |
| textarea:focus,input[type="text"]:focus,input[type="search"]:focus { | |
| border-color:var(--ma-border-focus) !important; | |
| box-shadow:0 0 0 3px rgba(0,122,255,0.20),var(--ma-shadow-input) !important; | |
| outline:none !important; | |
| } | |
| textarea::placeholder,input::placeholder { color:var(--ma-text3) !important; } | |
| /* ── Labels ───────────────────────────────────────────────────── */ | |
| label,.label-wrap,.block-label,.form-label, | |
| .gradio-radio label,.gradio-radio span { | |
| color:var(--ma-text2) !important; | |
| font-weight:600; font-size:0.82rem; text-transform:uppercase; letter-spacing:0.07em; | |
| } | |
| /* ── Dropdowns ────────────────────────────────────────────────── */ | |
| .gradio-dropdown,[data-testid="dropdown"] { | |
| background:var(--ma-glass-bg) !important; | |
| border:1px solid var(--ma-glass-bd) !important; | |
| border-radius:var(--ma-r-pill) !important; | |
| backdrop-filter:blur(12px) !important; -webkit-backdrop-filter:blur(12px) !important; | |
| color:var(--ma-text) !important; transition:border-color 0.2s; | |
| } | |
| .gradio-dropdown:focus-within,[data-testid="dropdown"]:focus-within { | |
| border-color:var(--ma-border-focus) !important; | |
| box-shadow:0 0 0 3px rgba(0,122,255,0.15) !important; | |
| } | |
| .gradio-dropdown *,[data-testid="dropdown"] * { | |
| background:transparent !important; color:var(--ma-text) !important; | |
| } | |
| .gradio-dropdown>div,[data-testid="dropdown"]>div { | |
| background:var(--ma-glass-bg) !important; border-radius:var(--ma-r-pill) !important; | |
| } | |
| [role="listbox"],.gradio-dropdown ul,.gradio-dropdown .options { | |
| background:var(--ma-surface) !important; | |
| border:1px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r-sm) !important; | |
| box-shadow:var(--ma-shadow) !important; | |
| backdrop-filter:blur(20px) !important; -webkit-backdrop-filter:blur(20px) !important; | |
| overflow:hidden; | |
| } | |
| [role="option"],.gradio-dropdown li { | |
| background:transparent !important; color:var(--ma-text) !important; | |
| padding:11px 20px !important; font-size:0.95rem; cursor:pointer; | |
| border-bottom:1px solid var(--ma-separator) !important; transition:background 0.12s; | |
| } | |
| [role="option"]:last-child,.gradio-dropdown li:last-child { border-bottom:none !important; } | |
| [role="option"]:hover,.gradio-dropdown li:hover, | |
| [role="option"][aria-selected="true"] { background:var(--ma-surface3) !important; } | |
| /* ── Radio ────────────────────────────────────────────────────── */ | |
| .gradio-radio span,.gradio-radio label { color:var(--ma-text) !important; } | |
| /* ── Buttons ──────────────────────────────────────────────────── */ | |
| button.primary,.gr-button-primary,[data-testid="submit-btn"] { | |
| border-radius:var(--ma-r-pill) !important; padding:14px 28px !important; | |
| background:var(--ma-blue) !important; border:none !important; | |
| color:#ffffff !important; font-family:inherit !important; | |
| font-weight:600 !important; font-size:16px !important; | |
| box-shadow:var(--ma-shadow-btn) !important; | |
| transition:transform 0.2s cubic-bezier(0.4,0,0.2,1), | |
| filter 0.2s cubic-bezier(0.4,0,0.2,1) !important; | |
| } | |
| button.primary:hover,.gr-button-primary:hover { | |
| transform:scale(1.02) !important; filter:brightness(1.10) !important; | |
| } | |
| button.primary:active { transform:scale(0.97) !important; } | |
| button.primary * { color:#ffffff !important; } | |
| button.secondary,.gr-button-secondary { | |
| border-radius:var(--ma-r-pill) !important; padding:12px 24px !important; | |
| background:var(--ma-glass-bg) !important; | |
| border:1px solid var(--ma-glass-bd) !important; | |
| color:var(--ma-text) !important; font-family:inherit !important; font-weight:500 !important; | |
| backdrop-filter:blur(12px) !important; -webkit-backdrop-filter:blur(12px) !important; | |
| transition:background 0.2s cubic-bezier(0.4,0,0.2,1), | |
| transform 0.2s cubic-bezier(0.4,0,0.2,1) !important; | |
| } | |
| button.secondary:hover { | |
| background:var(--ma-surface3) !important; transform:scale(1.02) !important; | |
| } | |
| button.secondary:active { transform:scale(0.98) !important; } | |
| button.secondary * { color:var(--ma-text) !important; } | |
| /* ── Examples table ───────────────────────────────────────────── */ | |
| .gradio-dataframe,.gradio-dataset { | |
| background:var(--ma-surface) !important; | |
| border:1px solid var(--ma-border) !important; | |
| border-radius:var(--ma-r) !important; | |
| overflow:hidden !important; box-shadow:var(--ma-shadow-card) !important; | |
| } | |
| .gradio-dataframe label,.gradio-dataset label, | |
| .gradio-dataframe .label,.gradio-dataset .label { | |
| color:var(--ma-text2) !important; font-weight:700; font-size:0.78rem; | |
| text-transform:uppercase; letter-spacing:0.08em; | |
| padding:14px 20px 8px !important; display:block; | |
| } | |
| .gradio-dataframe table thead tr th, | |
| .gradio-dataset table thead tr th { | |
| background:var(--ma-surface2) !important; color:var(--ma-text2) !important; | |
| font-weight:600 !important; font-size:0.76rem !important; | |
| padding:10px 16px !important; | |
| border:none !important; border-bottom:1px solid var(--ma-separator) !important; | |
| text-transform:uppercase; letter-spacing:0.06em; white-space:nowrap; | |
| } | |
| .gradio-dataframe table tbody tr td, | |
| .gradio-dataset table tbody tr td { | |
| background:transparent !important; color:var(--ma-text) !important; | |
| padding:10px 16px !important; | |
| border:none !important; border-bottom:1px solid var(--ma-separator) !important; | |
| font-size:0.93rem !important; | |
| } | |
| .gradio-dataframe table tbody tr:last-child td, | |
| .gradio-dataset table tbody tr:last-child td { border-bottom:none !important; } | |
| .gradio-dataframe table tbody tr:hover td, | |
| .gradio-dataset table tbody tr:hover td { | |
| background:var(--ma-surface3) !important; cursor:pointer; | |
| } | |
| .gradio-dataframe table *,.gradio-dataset table * { color:var(--ma-text) !important; } | |
| .gradio-dataframe table,.gradio-dataset table { | |
| border-collapse:collapse !important; width:100% !important; | |
| } | |
| /* ── Mono export box ──────────────────────────────────────────── */ | |
| .plain-text-box textarea { | |
| font-family:'SF Mono','Fira Code','JetBrains Mono',monospace !important; | |
| font-size:12px !important; line-height:1.65 !important; | |
| background:#0b0b0f !important; border-radius:var(--ma-r-sm) !important; | |
| color:var(--ma-text) !important; border-color:var(--ma-border) !important; | |
| } | |
| /* ── Markdown ─────────────────────────────────────────────────── */ | |
| .gradio-markdown,.gradio-markdown * { color:var(--ma-text) !important; } | |
| .gradio-markdown a { color:#5ac8fa !important; } | |
| .gradio-markdown code { | |
| background:var(--ma-surface2) !important; color:var(--ma-text) !important; | |
| border:1px solid var(--ma-border) !important; padding:2px 7px; border-radius:var(--ma-r-xs); | |
| } | |
| .gradio-container .info,.gradio-container .description { | |
| color:var(--ma-text3) !important; font-size:0.83rem; | |
| } | |
| .gradio-container h3,.gradio-container h4 { | |
| font-weight:600 !important; color:var(--ma-text) !important; letter-spacing:-0.01em; | |
| } | |
| /* ── Footer ───────────────────────────────────────────────────── */ | |
| .ma-footer { | |
| text-align:center; padding:24px 0 8px; | |
| font-size:0.78rem; color:var(--ma-text3) !important; | |
| } | |
| .ma-footer a { color:var(--ma-text2) !important; text-decoration:none; } | |
| .ma-footer a:hover { color:#5ac8fa !important; } | |
| /* ── Scrollbars ───────────────────────────────────────────────── */ | |
| ::-webkit-scrollbar { width:6px; height:6px; } | |
| ::-webkit-scrollbar-track { background:transparent; } | |
| ::-webkit-scrollbar-thumb { background:#3a3a3c; border-radius:10px; } | |
| ::-webkit-scrollbar-thumb:hover { background:#636366; } | |
| """ | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 28. GRADIO THEME | |
| # FIX: use plain strings only in font= list. | |
| # Mixing gr.themes.GoogleFont objects with plain strings triggers | |
| # AttributeError: 'str' object has no attribute 'name' in Gradio 6. | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Step 1: base Soft theme with hues and font stacks.
# The font lists contain plain strings only (see the FIX note above).
_GRADIO_THEME = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    font=["SF Pro Display", "-apple-system", "BlinkMacSystemFont", "Inter", "sans-serif"],
    font_mono=["SF Mono", "Fira Code", "monospace"],
)
# Step 2: layer the near-black palette overrides on top of the base theme.
# The result of .set() is what ends up bound to _GRADIO_THEME, exactly as in
# a single chained expression.
_GRADIO_THEME = _GRADIO_THEME.set(
    # Page body
    body_text_size="16px",
    body_background_fill="#0b0b0f",
    body_text_color="#e2e2e7",
    # Primary button
    button_primary_background_fill="#007AFF",
    button_primary_background_fill_hover="#0071e3",
    button_primary_text_color="#ffffff",
    # Inputs
    input_background_fill="#1a1b22",
    input_border_color="rgba(255,255,255,0.08)",
    input_border_width="1.5px",
    # Blocks / cards
    block_background_fill="#13141a",
    block_border_color="rgba(255,255,255,0.08)",
    block_border_width="1px",
    block_radius="24px",
    block_shadow="0 4px 32px rgba(0,0,0,0.70)",
    # Section headers
    section_header_text_weight="600",
)
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 29. UI BUILDER | |
| # Gradio 6 requirements: | |
| # - theme + css passed to launch(), NOT gr.Blocks() | |
| # - bubble_full_width removed from gr.Chatbot() | |
| # - font list uses plain strings only (no GoogleFont mix) | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def _create_app() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire up its event handlers.

    Layout, top to bottom: hero banner, mode / category / source filters,
    chat transcript, query bar with Send button, clear button, the export
    group (created hidden), example queries, and the footer.

    Neither theme nor CSS is passed to ``gr.Blocks`` here; both are supplied
    to ``launch()`` by the startup code.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` application.
    """
    all_sources = "All Sources"

    hero_html = """
<div class="ma-hero">
<h1>📚 AI Sikh Librarian</h1>
<p class="ma-desc">
Scholarly research tool ·
758M+ words of Sikh scriptures, history & manuscripts<br>
<strong>Qwen2.5-72B</strong> ·
Hybrid BM25 + FAISS ·
Every passage · Chicago citations
</p>
<div class="ma-badges">
<span class="ma-badge ma-badge-online">● Online</span>
<span class="ma-badge ma-badge-learn">📖 Learn Mode</span>
<span class="ma-badge ma-badge-research">🔍 Research Mode</span>
</div>
</div>
"""

    footer_html = """
<div class="ma-footer">
<a href="https://huggingface.co/datasets/jsdosanj/SikhLibrary"
target="_blank" rel="noopener noreferrer">SikhLibrary Digital Archive</a>
· Qwen2.5-72B-Instruct ·
Hybrid BM25 + FAISS ·
Chicago Manual of Style 17th ed. · CC BY-NC-ND 4.0
</div>
"""

    # (mode, category, query) triples; every example uses the same source
    # filter, so it is filled in when the rows are built below.
    example_specs = (
        ("Learn", "All",
         "What does Guru Granth Sahib teach about mental health and inner peace?"),
        ("Learn", "All",
         "Explain the concept of sant-sipahi in Sikh thought"),
        ("Learn", "Gurbani",
         "What is the significance of amrit vela in Gurbani?"),
        ("Research", "All", "naam simran"),
        ("Research", "Gurbani", "waheguru naam simran meditation"),
        ("Research", "Granths", "Khalsa identity sovereignty"),
        ("Research", "Steeks", "japji sahib commentary"),
        ("Research", "Literature", "martyrdom sacrifice shaheedi"),
    )
    example_rows = [
        [mode, category, all_sources, prompt]
        for mode, category, prompt in example_specs
    ]

    with gr.Blocks(title="AI Sikh Librarian") as demo:
        with gr.Column(elem_id="main-content"):
            gr.HTML(hero_html)

            # NOTE(review): nesting was reconstructed from a copy that had
            # lost its indentation — confirm source_dd belongs in this Row.
            with gr.Row(equal_height=True):
                mode_radio = gr.Radio(
                    choices=["Learn", "Research"],
                    value="Learn",
                    label="Mode",
                    info="Learn: Qwen2.5-72B scholarly Q&A | Research: every passage, no LLM",
                )
                category_dd = gr.Dropdown(
                    choices=CATEGORY_OPTIONS,
                    value="All",
                    label="Filter by Category",
                )
                # Copy the facet collection while holding its lock so the
                # dropdown is built from a stable snapshot.
                with _source_facets_lock:
                    source_choices = list(_source_facets)
                source_dd = gr.Dropdown(
                    choices=source_choices,
                    value=all_sources,
                    label="Filter by Source (granth / manuscript)",
                    visible=False,
                )

            chatbot = gr.Chatbot(
                value=[],
                height=640,
                show_label=False,
                render_markdown=True,
            )

            with gr.Row():
                query_box = gr.Textbox(
                    placeholder="Message Sikh Librarian\u2026",
                    show_label=False,
                    container=False,
                    scale=9,
                    lines=1,
                    max_lines=5,
                )
                submit_btn = gr.Button(
                    "Send", variant="primary", scale=1, min_width=90,
                )

            clear_btn = gr.Button(
                "Clear conversation", variant="secondary", size="sm",
            )

            with gr.Group(visible=False) as export_group:
                gr.Markdown("### Export Results")
                plain_text_box = gr.Textbox(
                    label="Plain text \u2014 copy or download",
                    lines=12,
                    interactive=False,
                    elem_classes=["plain-text-box"],
                )
                download_btn = gr.DownloadButton(
                    label="Download .txt",
                    visible=False,
                    variant="secondary",
                    size="sm",
                )

            gr.Examples(
                examples=example_rows,
                inputs=[mode_radio, category_dd, source_dd, query_box],
                label="Example queries \u2014 click to load, then press Send",
            )

            gr.HTML(footer_html)

        # ── Event wiring ──────────────────────────────────────────────
        submit_inputs = [query_box, chatbot, mode_radio, category_dd, source_dd]
        result_outputs = [chatbot, plain_text_box, download_btn]

        # Enter in the textbox and the Send button run the same handler;
        # once it finishes, the query box is reset to an empty string.
        for trigger in (query_box.submit, submit_btn.click):
            trigger(
                fn=submit_query, inputs=submit_inputs, outputs=result_outputs,
            ).then(fn=lambda: "", outputs=query_box)

        clear_btn.click(fn=clear_conversation, outputs=result_outputs)

        mode_radio.change(
            fn=toggle_mode_ui,
            inputs=mode_radio,
            outputs=[export_group, download_btn, query_box, source_dd],
        )

        # Feed the source dropdown from get_source_choices on page load.
        demo.load(fn=get_source_choices, outputs=source_dd)

    return demo
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # 30. STARTUP | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
# Startup banner: one print call, one line per setting (output is identical
# to printing each line separately).
print(
    "\U0001f680 Starting AI Sikh Librarian v22 \u2014 Midnight Archive Edition",
    f" FREE_TIER : {FREE_TIER}",
    f" MMR reranking : {'disabled' if FREE_TIER else 'enabled (CPU Upgrade)'}",
    f" Per-source cap : {PER_SOURCE_CAP} passages",
    f" langdetect : {'available' if _LANGDETECT_AVAILABLE else 'not installed'}",
    f" EMBED_CACHE_SIZE : {EMBED_CACHE_SIZE:,}",
    sep="\n",
)

_init_llm_client()

# Build the UI and configure its request queue; the free tier gets a
# shorter queue.
demo = _create_app()
demo.queue(
    max_size=20 if FREE_TIER else 100,
    default_concurrency_limit=1,
)

# Run the index loader on a daemon thread so it does not hold up the code
# below.
_index_thread = threading.Thread(target=_load_index_background, daemon=True)
_index_thread.start()

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        show_error=False,
        max_threads=8 if FREE_TIER else 40,
        theme=_GRADIO_THEME,  # Gradio 6: theme in launch(), not gr.Blocks()
        css=CUSTOM_CSS,  # Gradio 6: css in launch(), not gr.Blocks()
    )