Tim Luka Horstmann committed on
Commit ae2bc6b · 1 Parent(s): e8ba1ec

Enable gemini

Files changed (3)
  1. app.py +134 -36
  2. requirements.txt +2 -1
  3. test_gemini_integration.py +111 -0
app.py CHANGED

@@ -13,6 +13,8 @@ import os
 import faiss
 import asyncio
 import psutil  # Added for RAM tracking
+from google import genai
+from google.genai import types
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -31,10 +33,24 @@ if not hf_token:
 login(token=hf_token)
 
 # Models Configuration
+USE_GEMINI = os.getenv("USE_GEMINI", "false").lower() == "true"
 sentence_transformer_model = "all-MiniLM-L6-v2"
 repo_id = "unsloth/Qwen3-1.7B-GGUF"  # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
 filename = "Qwen3-1.7B-Q4_K_M.gguf"  # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
 
+# Gemini Configuration
+if USE_GEMINI:
+    gemini_api_key = os.getenv("GEMINI_API_KEY")
+    if not gemini_api_key:
+        logger.error("GEMINI_API_KEY environment variable not set but USE_GEMINI is true.")
+        raise ValueError("GEMINI_API_KEY not set")
+    gemini_client = genai.Client(api_key=gemini_api_key)
+    gemini_model = "gemini-2.5-flash-preview-05-20"
+    logger.info("Gemini API client initialized")
+else:
+    gemini_client = None
+    logger.info("Using local model (Gemini disabled)")
+
 # Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
@@ -68,27 +84,31 @@ try:
     faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(faq_embeddings)
 
-    # Load the 8B Cogito model with optimized parameters
-    logger.info(f"Loading {filename} model")
-    model_path = hf_hub_download(
-        repo_id=repo_id,
-        filename=filename,
-        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
-        token=hf_token,
-    )
-    generator = Llama(
-        model_path=model_path,
-        n_ctx=3072,
-        n_threads=2,
-        n_batch=64,
-        n_gpu_layers=0,
-        use_mlock=True,
-        f16_kv=True,
-        verbose=True,
-        batch_prefill=True,
-        prefill_logits=False,
-    )
-    logger.info(f"{filename} model loaded")
+    # Load the local model only if not using Gemini
+    if not USE_GEMINI:
+        logger.info(f"Loading {filename} model")
+        model_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir="/app/cache" if os.getenv("HF_HOME") else None,
+            token=hf_token,
+        )
+        generator = Llama(
+            model_path=model_path,
+            n_ctx=3072,
+            n_threads=2,
+            n_batch=64,
+            n_gpu_layers=0,
+            use_mlock=True,
+            f16_kv=True,
+            verbose=True,
+            batch_prefill=True,
+            prefill_logits=False,
+        )
+        logger.info(f"{filename} model loaded")
+    else:
+        generator = None
+        logger.info("Skipping local model loading (using Gemini API)")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -117,7 +137,70 @@ except Exception as e:
     raise
 
 async def stream_response(query, history):
-    logger.info(f"Processing query: {query}")
+    """Main streaming response function that routes to either Gemini or local model"""
+    if USE_GEMINI:
+        async for chunk in stream_response_gemini(query, history):
+            yield chunk
+    else:
+        async for chunk in stream_response_local(query, history):
+            yield chunk
+
+async def stream_response_gemini(query, history):
+    """Stream response using Gemini API"""
+    logger.info(f"Processing query with Gemini: {query}")
+    start_time = time.time()
+    first_token_logged = False
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    system_prompt = (
+        "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+        "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+        "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+        "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+        f"Today's date is {current_date}. "
+        f"CV: {full_cv_text}"
+    )
+
+    # Build messages for Gemini
+    messages = [types.Content(role="system", parts=[types.Part(text=system_prompt)])]
+
+    # Add conversation history
+    for msg in history:
+        role = "user" if msg["role"] == "user" else "model"
+        messages.append(types.Content(role=role, parts=[types.Part(text=msg["content"])]))
+
+    # Add current query
+    messages.append(types.Content(role="user", parts=[types.Part(text=query)]))
+
+    try:
+        response = gemini_client.models.generate_content_stream(
+            model=gemini_model,
+            contents=messages,
+            config=types.GenerateContentConfig(
+                temperature=0.3,
+                top_p=0.7,
+                max_output_tokens=512,
+            )
+        )
+
+        for chunk in response:
+            if chunk.text:
+                if not first_token_logged:
+                    logger.info(f"First token time (Gemini): {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {chunk.text}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    except Exception as e:
+        logger.error(f"Gemini API error: {str(e)}")
+        yield f"data: Sorry, I encountered an error with Gemini API: {str(e)}\n\n"
+        yield "data: [DONE]\n\n"
+
+async def stream_response_local(query, history):
+    """Stream response using local model"""
+    logger.info(f"Processing query with local model: {query}")
     start_time = time.time()
     first_token_logged = False
 
@@ -128,7 +211,7 @@ async def stream_response(query, history):
         "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
         "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
         "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
-        f"Todays date is {current_date}. "
+        f"Today's date is {current_date}. "
         f"CV: {full_cv_text}"
     )
 
@@ -171,7 +254,7 @@ async def stream_response(query, history):
        token = chunk['choices'][0]['delta'].get('content', '')
        if token:
            if not first_token_logged:
-                logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                logger.info(f"First token time (local): {time.time() - start_time:.2f}s")
                first_token_logged = True
            yield f"data: {token}\n\n"
    yield "data: [DONE]\n\n"
@@ -210,14 +293,26 @@ async def health_check():
 
 @app.get("/model_info")
 async def model_info():
-    return {
-        "model_name": "deepcogito_cogito-v1-preview-llama-8B-GGUF",
-        "model_size": "8B",
-        "quantization": "Q4_K_M",
-        "embedding_model": sentence_transformer_model,
-        "faiss_index_size": len(cv_chunks),
-        "faiss_index_dim": cv_embeddings.shape[1],
-    }
+    if USE_GEMINI:
+        return {
+            "model_type": "gemini",
+            "model_name": gemini_model,
+            "provider": "Google Gemini API",
+            "embedding_model": sentence_transformer_model,
+            "faiss_index_size": len(cv_chunks),
+            "faiss_index_dim": cv_embeddings.shape[1],
+        }
+    else:
+        return {
+            "model_type": "local",
+            "model_name": filename,
+            "repo_id": repo_id,
+            "model_size": "1.7B",
+            "quantization": "Q4_K_M",
+            "embedding_model": sentence_transformer_model,
+            "faiss_index_size": len(cv_chunks),
+            "faiss_index_dim": cv_embeddings.shape[1],
+        }
 
 @app.get("/ram_usage")
 async def ram_usage():
@@ -244,14 +339,17 @@ async def ram_usage():
 # Add a background task to keep the model warm
 @app.on_event("startup")
 async def setup_periodic_tasks():
-    asyncio.create_task(keep_model_warm())
-    logger.info("Periodic model warm-up task scheduled")
+    if not USE_GEMINI:  # Only warm up local models
+        asyncio.create_task(keep_model_warm())
+        logger.info("Periodic model warm-up task scheduled for local model")
+    else:
+        logger.info("Gemini API in use - no warm-up needed")
 
 async def keep_model_warm():
-    """Background task that keeps the model warm by sending periodic requests"""
+    """Background task that keeps the local model warm by sending periodic requests"""
     while True:
         try:
-            logger.info("Performing periodic model warm-up")
+            logger.info("Performing periodic local model warm-up")
             dummy_query = "Say only the word 'ok.'"
             dummy_history = []
             # Process a dummy query through the generator to keep it warm
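
One caveat worth flagging on the new Gemini path: the commit sends the system prompt as a types.Content with role="system", while the google-genai SDK documents only "user" and "model" content roles and provides GenerateContentConfig.system_instruction for system prompts. Below is a minimal sketch of that variant, reusing the model name and sampling parameters from the diff; the API key value and the shortened prompt string are placeholders.

# Sketch only: same streaming call as in the commit, but with the system prompt
# passed via GenerateContentConfig.system_instruction instead of a Content with
# role="system". Key and prompt below are placeholders, not values from the repo.
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # placeholder key
response = client.models.generate_content_stream(
    model="gemini-2.5-flash-preview-05-20",  # model name from the diff
    contents=[types.Content(role="user", parts=[types.Part(text="What is your education?")])],
    config=types.GenerateContentConfig(
        system_instruction="You are Tim Luka Horstmann...",  # full prompt lives in app.py
        temperature=0.3,  # sampling parameters from the diff
        top_p=0.7,
        max_output_tokens=512,
    ),
)
for chunk in response:
    if chunk.text:
        print(chunk.text, end="")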
requirements.txt CHANGED

@@ -7,4 +7,5 @@ llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
 faiss-cpu==1.8.0
 asyncio
-psutil
+psutil
+google-genai
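
With google-genai installed, switching backends is driven entirely by the two environment variables the commit introduces in app.py. A minimal sketch follows (values are placeholders); note that USE_GEMINI is read at module import time, so it must be set before app.py is imported.

import os

# Sketch: configure the backend before importing the app module.
os.environ["USE_GEMINI"] = "true"               # "false" keeps the local GGUF model
os.environ["GEMINI_API_KEY"] = "your-key-here"  # required whenever USE_GEMINI is true

import app  # app.py reads USE_GEMINI at import and builds the matching backend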
test_gemini_integration.py ADDED

@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Test script for Gemini API integration
+"""
+
+import os
+import asyncio
+from datetime import datetime
+
+# Mock the dependencies for testing
+class MockClient:
+    def __init__(self, api_key):
+        self.api_key = api_key
+
+    class models:
+        @staticmethod
+        def generate_content_stream(model, contents, config):
+            # Mock streaming response
+            class MockChunk:
+                text = "Hello! I'm Tim Luka Horstmann, a Computer Scientist currently pursuing my MSc in Data and AI at Institut Polytechnique de Paris."
+
+            yield MockChunk()
+
+class MockTypes:
+    class Content:
+        def __init__(self, role, parts):
+            self.role = role
+            self.parts = parts
+
+    class Part:
+        def __init__(self, text):
+            self.text = text
+
+    class GenerateContentConfig:
+        def __init__(self, temperature, top_p, max_output_tokens):
+            self.temperature = temperature
+            self.top_p = top_p
+            self.max_output_tokens = max_output_tokens
+
+# Test function similar to our Gemini implementation
+async def test_gemini_integration():
+    """Test the Gemini integration logic"""
+
+    # Mock environment variables
+    USE_GEMINI = True
+    gemini_api_key = "test_api_key"
+    gemini_model = "gemini-2.5-flash-preview-05-20"
+
+    # Mock full CV text
+    full_cv_text = "Tim Luka Horstmann is a Computer Scientist pursuing MSc in Data and AI at Institut Polytechnique de Paris."
+
+    # Initialize mock client
+    gemini_client = MockClient(api_key=gemini_api_key)
+    types = MockTypes()
+
+    # Test query and history
+    query = "What is your education?"
+    history = []
+
+    print(f"Testing Gemini integration...")
+    print(f"USE_GEMINI: {USE_GEMINI}")
+    print(f"Query: {query}")
+
+    # Simulate the Gemini function logic
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    system_prompt = (
+        "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+        "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+        "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+        "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+        f"Today's date is {current_date}. "
+        f"CV: {full_cv_text}"
+    )
+
+    # Build messages for Gemini
+    messages = [types.Content(role="system", parts=[types.Part(text=system_prompt)])]
+
+    # Add conversation history
+    for msg in history:
+        role = "user" if msg["role"] == "user" else "model"
+        messages.append(types.Content(role=role, parts=[types.Part(text=msg["content"])]))
+
+    # Add current query
+    messages.append(types.Content(role="user", parts=[types.Part(text=query)]))
+
+    print(f"System prompt length: {len(system_prompt)}")
+    print(f"Number of messages: {len(messages)}")
+
+    # Mock the streaming response
+    response = gemini_client.models.generate_content_stream(
+        model=gemini_model,
+        contents=messages,
+        config=types.GenerateContentConfig(
+            temperature=0.3,
+            top_p=0.7,
+            max_output_tokens=512,
+        )
+    )
+
+    print("Streaming response:")
+    for chunk in response:
+        if chunk.text:
+            print(f"Chunk: {chunk.text}")
+
+    print("✅ Gemini integration test completed successfully!")
+
+    return True
+
+if __name__ == "__main__":
+    asyncio.run(test_gemini_integration())
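
Because the test file stubs out both the client and the types module, it can be run directly with python test_gemini_integration.py: it exercises prompt assembly and the streaming loop without any network access. It does not hit the real google-genai API, so the role="system" message passes here even though the live SDK only documents "user" and "model" content roles.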