TutuAwad committed (verified)
Commit 4f22c4e · Parent(s): 0bde887

Update app.py

Files changed (1): app.py (+89 -20)
app.py CHANGED
@@ -18,6 +18,9 @@ from sentence_transformers import SentenceTransformer
 from huggingface_hub import InferenceClient
 import spotipy
 from spotipy.oauth2 import SpotifyClientCredentials
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
+
 
 # ---------- Paths to precomputed data ----------
 
@@ -49,16 +52,60 @@ print("Spotify secret present?", bool(SPOTIFY_CLIENT_SECRET))
 # Query encoder (same as notebook)
 query_embedder = SentenceTransformer("all-mpnet-base-v2")
 
-# LLaMA-2 for query expansion (remote HF Inference)
+# LLaMA-2 for query expansion
 LLAMA_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
 
-hf_client = None
+llama_pipe = None  # local quantized pipeline (preferred)
+hf_client = None   # hosted fallback
+
 if HF_TOKEN:
-    try:
-        hf_client = InferenceClient(token=HF_TOKEN)
-    except Exception as e:
-        print("⚠️ Could not initialize HF Inference client:", repr(e))
-        hf_client = None
+    # Try to load a 4-bit quantized LLaMA locally (for HF Space with GPU)
+    if torch.cuda.is_available():
+        try:
+            print("Loading LLaMA-2-7B in 4-bit NF4 with bitsandbytes...")
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+
+            llama_tokenizer = AutoTokenizer.from_pretrained(
+                LLAMA_MODEL_ID,
+                use_auth_token=HF_TOKEN,
+            )
+
+            llama_model = AutoModelForCausalLM.from_pretrained(
+                LLAMA_MODEL_ID,
+                quantization_config=bnb_config,  # 🔑 this actually activates 4-bit
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                use_auth_token=HF_TOKEN,
+            )
+
+            llama_pipe = pipeline(
+                "text-generation",
+                model=llama_model,
+                tokenizer=llama_tokenizer,
+                max_new_tokens=96,
+                temperature=0.2,
+                top_p=0.9,
+                repetition_penalty=1.05,
+            )
+            print("Using local 4-bit quantized LLaMA backend.")
+        except Exception as e:
+            print("⚠️ Quantized LLaMA load failed, will try HF Inference fallback:", repr(e))
+
+    # If quantized local load failed (or no CUDA), fall back to HF hosted inference
+    if llama_pipe is None:
+        try:
+            hf_client = InferenceClient(model=LLAMA_MODEL_ID, token=HF_TOKEN)
+            print("✅ Using HF InferenceClient backend (hosted LLaMA).")
+        except Exception as e:
+            print("⚠️ Could not initialize any LLaMA backend:", repr(e))
+else:
+    print("⚠️ No HF_TOKEN found; LLaMA expansion will be disabled.")
+
 
 # Spotify client
 sp = None
@@ -82,12 +129,14 @@ def encode_query(text: str) -> np.ndarray:
 
 def expand_with_llama(query: str) -> str:
     """
-    Enrich the query using LLaMA via HF Inference.
+    Enrich the query using LLaMA.
 
-    If anything fails (no client, provider issues, rate limits, etc.),
-    we log and fall back to the raw query so the app keeps working.
+    Priority:
+      1) Use local 4-bit quantized LLaMA pipeline if available (HF Space with GPU).
+      2) Otherwise, fall back to HF InferenceClient (hosted model).
+      3) On any failure, return the raw query so the app keeps working.
     """
-    if hf_client is None or not HF_TOKEN:
+    if not HF_TOKEN:
         return query
 
     prompt = f"""You are helping someone search a lyrics catalog.
@@ -104,22 +153,42 @@ Input:
 Output (no explanation, just titles or keywords):"""
 
     try:
-        response = hf_client.text_generation(
-            prompt,
-            model=LLAMA_MODEL_ID,
-            max_new_tokens=96,
-            temperature=0.2,
-            repetition_penalty=1.05,
-        )
+        if llama_pipe is not None:
+            # Local 4-bit quantized model on HF Space
+            outputs = llama_pipe(
+                prompt,
+                do_sample=True,
+                num_return_sequences=1,
+            )
+            full_text = outputs[0]["generated_text"]
+            # Strip the prompt off the front if it's included
+            if full_text.startswith(prompt):
+                keywords = full_text[len(prompt):].strip()
+            else:
+                keywords = full_text.strip()
+        elif hf_client is not None:
+            # Hosted HF Inference fallback
+            response = hf_client.text_generation(
+                prompt,
+                max_new_tokens=96,
+                temperature=0.2,
+                repetition_penalty=1.05,
+            )
+            keywords = str(response).strip()
+        else:
+            # No backend at all
+            return query
+
     except Exception as e:
-        print("⚠️ LLaMA expansion failed on HF, using raw query:", repr(e))
+        print("⚠️ LLaMA expansion failed, using raw query:", repr(e))
         return query
 
-    keywords = str(response).strip().replace("\n", " ")
+    keywords = keywords.replace("\n", " ")
    expanded = query + " " + keywords
     return expanded
 
 
+
 def distances_to_similarity_pct(dists: np.ndarray) -> np.ndarray:
     if len(dists) == 0:
         return np.array([])
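Reviewer's note on the new 4-bit path: NF4 with double quantization stores the 7B weights at roughly 4 bits each, so the model fits in about 4 GB of GPU memory instead of the ~13.5 GB an fp16 load needs. A minimal sketch for sanity-checking the quantized load outside the Space follows; the model ID and BitsAndBytesConfig come from the diff, `token=` is the non-deprecated spelling of the `use_auth_token=` used above, and the memory numbers in the comments are approximate.

# Hypothetical smoke test for the 4-bit NF4 load (not part of this commit).
# Assumes a CUDA GPU, bitsandbytes installed, and an HF token with access
# to the gated meta-llama/Llama-2-7b-chat-hf repo.
import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NormalFloat4 weight format
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # matmuls run in bf16
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    quantization_config=bnb_config,
    device_map="auto",
    token=os.environ.get("HF_TOKEN"),
)

# Roughly 4 GB in NF4 versus ~13.5 GB in fp16 for a 7B model.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.1f} GB")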
 
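The control flow in expand_with_llama now guarantees graceful degradation: local pipeline first, hosted client second, raw query on any failure. That contract can be exercised without loading a model; the sketch below uses hypothetical stand-ins (expand, FakePipe, and BrokenPipe are illustrative, not code from this commit).

# Illustrative sketch of the fallback contract in expand_with_llama:
# any backend failure must yield the original query unchanged.
def expand(query: str, pipe=None, client=None) -> str:
    try:
        if pipe is not None:
            full_text = pipe(query)[0]["generated_text"]
            # Strip the prompt off the front if it's included
            if full_text.startswith(query):
                keywords = full_text[len(query):].strip()
            else:
                keywords = full_text.strip()
        elif client is not None:
            keywords = str(client(query)).strip()
        else:
            return query
    except Exception:
        return query
    return query + " " + keywords.replace("\n", " ")

class FakePipe:
    def __call__(self, prompt, **kwargs):
        return [{"generated_text": prompt + "\nrainy day, melancholy ballads"}]

class BrokenPipe:
    def __call__(self, prompt, **kwargs):
        raise RuntimeError("CUDA out of memory")

assert expand("sad songs", pipe=FakePipe()) == "sad songs rainy day, melancholy ballads"
assert expand("sad songs", pipe=BrokenPipe()) == "sad songs"  # failure falls back to raw query
assert expand("sad songs") == "sad songs"                     # no backend at all

One simplification the committed code could adopt later: the transformers text-generation pipeline accepts return_full_text=False, which drops the prompt from generated_text and would make the manual startswith stripping unnecessary.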