Spaces:
Sleeping
Sleeping
| import logging | |
| import numpy as np | |
| from utils.extensions import ext | |
def _clean_text_path(text_path):
    """Map a stored extracted-text path to the relative path of its PDF.

    Strips the Google Drive extraction root from the path and swaps a
    trailing ``.txt`` extension for ``.pdf``.
    """
    relative = text_path.replace('/content/drive/MyDrive/extracted_texts/', '')
    # Swap only the trailing extension: a blanket replace would also rewrite
    # ".txt" appearing inside a directory name or file stem.
    if relative.endswith('.txt'):
        relative = relative[:-len('.txt')] + '.pdf'
    return relative


def _split_hit(hit):
    """Return ``(score, payload)`` for one search hit, or ``None`` if unusable."""
    if isinstance(hit, tuple):
        # Tuple hits are either (score, payload) or (id, score, payload).
        if len(hit) == 2:
            score, payload = hit
            return score, payload
        if len(hit) == 3:
            _, score, payload = hit
            return score, payload
        logging.warning(f"Unexpected tuple length: {len(hit)}")
        return None
    if hasattr(hit, 'score') and hasattr(hit, 'payload'):
        return hit.score, hit.payload
    if hasattr(hit, 'vector') and hasattr(hit, 'payload'):
        # The hit carries a vector but no score. The similarity is not
        # recomputed here; the maximum score is assumed (legacy behaviour),
        # so make that visible in the logs.
        logging.warning("Hit has no score; assuming maximum similarity 1.0")
        return 1.0, hit.payload
    logging.warning(f"Unexpected point structure: {hit}")
    return None


def _build_result(score, payload):
    """Convert a score/payload pair into a result dict, or ``None`` to skip it."""
    # A hit may come back without a payload at all; treat that the same as a
    # payload that lacks 'text_path' instead of failing on the membership test.
    if payload is None or 'text_path' not in payload:
        logging.warning(f"No text_path in payload: {payload}")
        return None
    # Clip to [-1, 1] and rescale linearly to a 0-100 percentage. float()
    # unwraps the NumPy scalar so the result dict holds plain Python numbers.
    clipped = float(np.clip(score, -1.0, 1.0))
    return {
        'file': payload.get('file', 'Unknown'),
        'folder': payload.get('folder', 'Unknown'),
        'text_path': _clean_text_path(payload['text_path']),
        'content': payload.get('content', 'No content available'),
        'percentage_similarity': ((clipped + 1) / 2) * 100,
        'cosine_score': score,
    }


def retrieve(query, collection_name="law_documents", k=5, *, model=None, client=None):
    """Search a Qdrant collection for the documents closest to ``query``.

    The query is embedded with ``model.encode(..., normalize_embeddings=True)``
    and sent to ``client.query_points``. Each usable hit is converted into a
    dict with the keys ``file``, ``folder``, ``text_path``, ``content``,
    ``percentage_similarity`` and ``cosine_score``.

    Args:
        query: Text to search for.
        collection_name: Name of the Qdrant collection to query.
        k: Maximum number of hits requested from Qdrant.
        model: Optional encoder exposing ``encode(text, normalize_embeddings=...)``
            whose result has ``.tolist()``. Defaults to ``ext.model``.
        client: Optional client exposing ``query_points(...)``. Defaults to
            ``ext.qdrant_client``.

    Returns:
        A list of result dicts in the order the hits were returned. Hits that
        are malformed, have no payload, or lack ``text_path`` are skipped.
        An empty list is returned if the search itself fails or the response
        has an unrecognised structure; this function does not raise.
    """
    try:
        if model is None:
            model = ext.model
        if client is None:
            client = ext.qdrant_client

        # Embed the query as a normalized vector.
        query_embedding = model.encode(query, normalize_embeddings=True).tolist()

        # Search in Qdrant; payloads are needed, stored vectors are not.
        search_result = client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            limit=k,
            with_payload=True,
            with_vectors=False,
        )
        # The dump contains every payload, so keep it at DEBUG level.
        logging.debug("Search result type: %s", type(search_result))
        logging.debug("Search result structure: %s", search_result)

        # The response is either an object exposing `.points` or directly an
        # iterable of hits.
        if hasattr(search_result, 'points'):
            hits = search_result.points
        elif hasattr(search_result, '__iter__'):
            hits = search_result
        else:
            logging.error(f"Unexpected search result structure: {search_result}")
            return []

        results = []
        for hit in hits:
            # Best effort: one malformed hit must not discard the others.
            try:
                parsed = _split_hit(hit)
                if parsed is None:
                    continue
                entry = _build_result(*parsed)
                if entry is not None:
                    results.append(entry)
            except Exception as e:
                logging.error(f"Error processing point: {e}")

        logging.info(f"Retrieved {len(results)} documents")
        return results
    except Exception as e:
        logging.error(f"Error during retrieval: {e}", exc_info=True)
        return []