Tim Luka Horstmann committed on
Commit ae2bc6b · 1 Parent(s): e8ba1ec

Enable gemini

Files changed (3)
  1. app.py +134 -36
  2. requirements.txt +2 -1
  3. test_gemini_integration.py +111 -0
app.py CHANGED

@@ -13,6 +13,8 @@ import os
 import faiss
 import asyncio
 import psutil  # Added for RAM tracking
+from google import genai
+from google.genai import types
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -31,10 +33,24 @@ if not hf_token:
 login(token=hf_token)
 
 # Models Configuration
+USE_GEMINI = os.getenv("USE_GEMINI", "false").lower() == "true"
 sentence_transformer_model = "all-MiniLM-L6-v2"
 repo_id = "unsloth/Qwen3-1.7B-GGUF"  # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
 filename = "Qwen3-1.7B-Q4_K_M.gguf"  # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
 
+# Gemini Configuration
+if USE_GEMINI:
+    gemini_api_key = os.getenv("GEMINI_API_KEY")
+    if not gemini_api_key:
+        logger.error("GEMINI_API_KEY environment variable not set but USE_GEMINI is true.")
+        raise ValueError("GEMINI_API_KEY not set")
+    gemini_client = genai.Client(api_key=gemini_api_key)
+    gemini_model = "gemini-2.5-flash-preview-05-20"
+    logger.info("Gemini API client initialized")
+else:
+    gemini_client = None
+    logger.info("Using local model (Gemini disabled)")
+
 # Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
@@ -68,27 +84,31 @@ try:
     faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(faq_embeddings)
 
-    # Load the 8B Cogito model with optimized parameters
-    logger.info(f"Loading {filename} model")
-    model_path = hf_hub_download(
-        repo_id=repo_id,
-        filename=filename,
-        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
-        token=hf_token,
-    )
-    generator = Llama(
-        model_path=model_path,
-        n_ctx=3072,
-        n_threads=2,
-        n_batch=64,
-        n_gpu_layers=0,
-        use_mlock=True,
-        f16_kv=True,
-        verbose=True,
-        batch_prefill=True,
-        prefill_logits=False,
-    )
-    logger.info(f"{filename} model loaded")
+    # Load the local model only if not using Gemini
+    if not USE_GEMINI:
+        logger.info(f"Loading {filename} model")
+        model_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir="/app/cache" if os.getenv("HF_HOME") else None,
+            token=hf_token,
+        )
+        generator = Llama(
+            model_path=model_path,
+            n_ctx=3072,
+            n_threads=2,
+            n_batch=64,
+            n_gpu_layers=0,
+            use_mlock=True,
+            f16_kv=True,
+            verbose=True,
+            batch_prefill=True,
+            prefill_logits=False,
+        )
+        logger.info(f"{filename} model loaded")
+    else:
+        generator = None
+        logger.info("Skipping local model loading (using Gemini API)")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -117,7 +137,70 @@ except Exception as e:
     raise
 
 async def stream_response(query, history):
-    logger.info(f"Processing query: {query}")
+    """Main streaming response function that routes to either Gemini or local model"""
+    if USE_GEMINI:
+        async for chunk in stream_response_gemini(query, history):
+            yield chunk
+    else:
+        async for chunk in stream_response_local(query, history):
+            yield chunk
+
+async def stream_response_gemini(query, history):
+    """Stream response using Gemini API"""
+    logger.info(f"Processing query with Gemini: {query}")
+    start_time = time.time()
+    first_token_logged = False
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    system_prompt = (
+        "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+        "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+        "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+        "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+        f"Today's date is {current_date}. "
+        f"CV: {full_cv_text}"
+    )
+
+    # Build messages for Gemini
+    messages = [types.Content(role="system", parts=[types.Part(text=system_prompt)])]
+
+    # Add conversation history
+    for msg in history:
+        role = "user" if msg["role"] == "user" else "model"
+        messages.append(types.Content(role=role, parts=[types.Part(text=msg["content"])]))
+
+    # Add current query
+    messages.append(types.Content(role="user", parts=[types.Part(text=query)]))
+
+    try:
+        response = gemini_client.models.generate_content_stream(
+            model=gemini_model,
+            contents=messages,
+            config=types.GenerateContentConfig(
+                temperature=0.3,
+                top_p=0.7,
+                max_output_tokens=512,
+            )
+        )
+
+        for chunk in response:
+            if chunk.text:
+                if not first_token_logged:
+                    logger.info(f"First token time (Gemini): {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {chunk.text}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    except Exception as e:
+        logger.error(f"Gemini API error: {str(e)}")
+        yield f"data: Sorry, I encountered an error with Gemini API: {str(e)}\n\n"
+        yield "data: [DONE]\n\n"
+
+async def stream_response_local(query, history):
+    """Stream response using local model"""
+    logger.info(f"Processing query with local model: {query}")
     start_time = time.time()
     first_token_logged = False
 
@@ -128,7 +211,7 @@ async def stream_response(query, history):
         "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
         "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
         "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
-        f"Todays date is {current_date}. "
+        f"Today's date is {current_date}. "
         f"CV: {full_cv_text}"
     )
 
@@ -171,7 +254,7 @@ async def stream_response(query, history):
        token = chunk['choices'][0]['delta'].get('content', '')
        if token:
            if not first_token_logged:
-                logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                logger.info(f"First token time (local): {time.time() - start_time:.2f}s")
                first_token_logged = True
            yield f"data: {token}\n\n"
    yield "data: [DONE]\n\n"
@@ -210,14 +293,26 @@ async def health_check():
 
 @app.get("/model_info")
 async def model_info():
-    return {
-        "model_name": "deepcogito_cogito-v1-preview-llama-8B-GGUF",
-        "model_size": "8B",
-        "quantization": "Q4_K_M",
-        "embedding_model": sentence_transformer_model,
-        "faiss_index_size": len(cv_chunks),
-        "faiss_index_dim": cv_embeddings.shape[1],
-    }
+    if USE_GEMINI:
+        return {
+            "model_type": "gemini",
+            "model_name": gemini_model,
+            "provider": "Google Gemini API",
+            "embedding_model": sentence_transformer_model,
+            "faiss_index_size": len(cv_chunks),
+            "faiss_index_dim": cv_embeddings.shape[1],
+        }
+    else:
+        return {
+            "model_type": "local",
+            "model_name": filename,
+            "repo_id": repo_id,
+            "model_size": "1.7B",
+            "quantization": "Q4_K_M",
+            "embedding_model": sentence_transformer_model,
+            "faiss_index_size": len(cv_chunks),
+            "faiss_index_dim": cv_embeddings.shape[1],
+        }
 
 @app.get("/ram_usage")
 async def ram_usage():
@@ -244,14 +339,17 @@ async def ram_usage():
 # Add a background task to keep the model warm
 @app.on_event("startup")
 async def setup_periodic_tasks():
-    asyncio.create_task(keep_model_warm())
-    logger.info("Periodic model warm-up task scheduled")
+    if not USE_GEMINI:  # Only warm up local models
+        asyncio.create_task(keep_model_warm())
+        logger.info("Periodic model warm-up task scheduled for local model")
+    else:
+        logger.info("Gemini API in use - no warm-up needed")
 
 async def keep_model_warm():
-    """Background task that keeps the model warm by sending periodic requests"""
+    """Background task that keeps the local model warm by sending periodic requests"""
     while True:
         try:
-            logger.info("Performing periodic model warm-up")
+            logger.info("Performing periodic local model warm-up")
             dummy_query = "Say only the word 'ok.'"
             dummy_history = []
             # Process a dummy query through the generator to keep it warm
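
One caveat worth flagging on the new Gemini path: the commit sends the system prompt as a types.Content with role="system", while the google-genai SDK documents only "user" and "model" content roles and provides GenerateContentConfig.system_instruction for system prompts. Below is a minimal sketch of that variant, reusing the model name and sampling parameters from the diff; the API key value and the shortened prompt string are placeholders.

# Sketch only: same streaming call as in the commit, but with the system prompt
# passed via GenerateContentConfig.system_instruction instead of a Content with
# role="system". Key and prompt below are placeholders, not values from the repo.
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # placeholder key
response = client.models.generate_content_stream(
    model="gemini-2.5-flash-preview-05-20",  # model name from the diff
    contents=[types.Content(role="user", parts=[types.Part(text="What is your education?")])],
    config=types.GenerateContentConfig(
        system_instruction="You are Tim Luka Horstmann...",  # full prompt lives in app.py
        temperature=0.3,  # sampling parameters from the diff
        top_p=0.7,
        max_output_tokens=512,
    ),
)
for chunk in response:
    if chunk.text:
        print(chunk.text, end="")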
requirements.txt CHANGED

@@ -7,4 +7,5 @@ llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
 faiss-cpu==1.8.0
 asyncio
-psutil
+psutil
+google-genai
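
With google-genai installed, switching backends is driven entirely by the two environment variables the commit introduces in app.py. A minimal sketch follows (values are placeholders); note that USE_GEMINI is read at module import time, so it must be set before app.py is imported.

import os

# Sketch: configure the backend before importing the app module.
os.environ["USE_GEMINI"] = "true"               # "false" keeps the local GGUF model
os.environ["GEMINI_API_KEY"] = "your-key-here"  # required whenever USE_GEMINI is true

import app  # app.py reads USE_GEMINI at import and builds the matching backend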
test_gemini_integration.py ADDED

@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Test script for Gemini API integration
+"""
+
+import os
+import asyncio
+from datetime import datetime
+
+# Mock the dependencies for testing
+class MockClient:
+    def __init__(self, api_key):
+        self.api_key = api_key
+
+    class models:
+        @staticmethod
+        def generate_content_stream(model, contents, config):
+            # Mock streaming response
+            class MockChunk:
+                text = "Hello! I'm Tim Luka Horstmann, a Computer Scientist currently pursuing my MSc in Data and AI at Institut Polytechnique de Paris."
+
+            yield MockChunk()
+
+class MockTypes:
+    class Content:
+        def __init__(self, role, parts):
+            self.role = role
+            self.parts = parts
+
+    class Part:
+        def __init__(self, text):
+            self.text = text
+
+    class GenerateContentConfig:
+        def __init__(self, temperature, top_p, max_output_tokens):
+            self.temperature = temperature
+            self.top_p = top_p
+            self.max_output_tokens = max_output_tokens
+
+# Test function similar to our Gemini implementation
+async def test_gemini_integration():
+    """Test the Gemini integration logic"""
+
+    # Mock environment variables
+    USE_GEMINI = True
+    gemini_api_key = "test_api_key"
+    gemini_model = "gemini-2.5-flash-preview-05-20"
+
+    # Mock full CV text
+    full_cv_text = "Tim Luka Horstmann is a Computer Scientist pursuing MSc in Data and AI at Institut Polytechnique de Paris."
+
+    # Initialize mock client
+    gemini_client = MockClient(api_key=gemini_api_key)
+    types = MockTypes()
+
+    # Test query and history
+    query = "What is your education?"
+    history = []
+
+    print(f"Testing Gemini integration...")
+    print(f"USE_GEMINI: {USE_GEMINI}")
+    print(f"Query: {query}")
+
+    # Simulate the Gemini function logic
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    system_prompt = (
+        "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+        "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+        "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+        "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+        f"Today's date is {current_date}. "
+        f"CV: {full_cv_text}"
+    )
+
+    # Build messages for Gemini
+    messages = [types.Content(role="system", parts=[types.Part(text=system_prompt)])]
+
+    # Add conversation history
+    for msg in history:
+        role = "user" if msg["role"] == "user" else "model"
+        messages.append(types.Content(role=role, parts=[types.Part(text=msg["content"])]))
+
+    # Add current query
+    messages.append(types.Content(role="user", parts=[types.Part(text=query)]))
+
+    print(f"System prompt length: {len(system_prompt)}")
+    print(f"Number of messages: {len(messages)}")
+
+    # Mock the streaming response
+    response = gemini_client.models.generate_content_stream(
+        model=gemini_model,
+        contents=messages,
+        config=types.GenerateContentConfig(
+            temperature=0.3,
+            top_p=0.7,
+            max_output_tokens=512,
+        )
+    )
+
+    print("Streaming response:")
+    for chunk in response:
+        if chunk.text:
+            print(f"Chunk: {chunk.text}")
+
+    print("✅ Gemini integration test completed successfully!")
+
+    return True
+
+if __name__ == "__main__":
+    asyncio.run(test_gemini_integration())
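
Because the test file stubs out both the client and the types module, it can be run directly with python test_gemini_integration.py: it exercises prompt assembly and the streaming loop without any network access. It does not hit the real google-genai API, so the role="system" message passes here even though the live SDK only documents "user" and "model" content roles.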