# LegalMind12/services/retriever.py
# Nguyendat92929's picture
# Upload 91 files
# 08b74f7 verified
import logging
import numpy as np
from utils.extensions import ext
# Directory prefix stripped from stored ``text_path`` payload values.
_EXTRACTED_TEXT_ROOT = '/content/drive/MyDrive/extracted_texts/'
_TEXT_SUFFIX = '.txt'
_PDF_SUFFIX = '.pdf'


def _clean_text_path(text_path):
    """Turn a stored extracted-text path into the relative path of its PDF.

    The extraction-root prefix is removed and a trailing ``.txt`` extension is
    swapped for ``.pdf``. Only the *final* extension is rewritten, so a
    directory or file stem that merely contains ``.txt`` is left intact.
    """
    relative = text_path.replace(_EXTRACTED_TEXT_ROOT, '')
    if relative.endswith(_TEXT_SUFFIX):
        relative = relative[:-len(_TEXT_SUFFIX)] + _PDF_SUFFIX
    return relative


def _build_result(score, payload):
    """Build one result dict from a hit's score and payload.

    Returns ``None`` (after logging a warning) when the payload is missing or
    has no ``text_path`` key.
    """
    if payload is None or 'text_path' not in payload:
        logging.warning("No text_path in payload: %s", payload)
        return None

    cleaned_path = _clean_text_path(payload['text_path'])

    # Clip to the cosine range, then map [-1, 1] linearly onto [0, 100].
    # float() keeps numpy scalar types out of the returned dict.
    score_clipped = float(np.clip(score, -1.0, 1.0))
    percentage_similarity = ((score_clipped + 1) / 2) * 100

    return {
        'file': payload.get('file', 'Unknown'),
        'folder': payload.get('folder', 'Unknown'),
        'text_path': cleaned_path,
        'content': payload.get('content', 'No content available'),
        'percentage_similarity': percentage_similarity,
        'cosine_score': score,
    }


def _unpack_point(point):
    """Return ``(score, payload)`` for an element of ``.points``, else ``None``."""
    if hasattr(point, 'score'):
        return point.score, point.payload
    if hasattr(point, 'vector') and hasattr(point, 'payload'):
        # The point carries a vector instead of a score; no similarity is
        # computed here, so a high similarity of 1.0 is assumed.
        # NOTE(review): this is a placeholder value, not a measured score.
        return 1.0, point.payload
    logging.warning("Unexpected point structure: %s", point)
    return None


def _unpack_item(item):
    """Return ``(score, payload)`` for an element of an iterable result, else ``None``."""
    if isinstance(item, tuple):
        # Accepted tuple shapes: (score, payload) or (id, score, payload).
        if len(item) == 2:
            return item[0], item[1]
        if len(item) == 3:
            return item[1], item[2]
        logging.warning("Unexpected tuple length: %s", len(item))
        return None
    if hasattr(item, 'score') and hasattr(item, 'payload'):
        return item.score, item.payload
    logging.warning("Unexpected item type: %s", type(item))
    return None


def _collect_results(hits, unpack, label):
    """Convert every usable hit in ``hits`` into a result dict.

    ``unpack`` extracts ``(score, payload)`` from one hit (or returns ``None``
    to skip it); ``label`` only names the hit kind in error log messages.
    """
    results = []
    for hit in hits:
        try:
            unpacked = unpack(hit)
            if unpacked is None:
                continue
            result = _build_result(*unpacked)
            if result is not None:
                results.append(result)
        except Exception as e:
            # Best effort: one malformed hit must not discard the others.
            logging.error("Error processing %s: %s", label, e)
    return results


def retrieve(query, collection_name="law_documents", k=5):
    """Search a Qdrant collection for the documents most similar to ``query``.

    The query is encoded with ``ext.model.encode(..., normalize_embeddings=True)``
    and sent to ``ext.qdrant_client.query_points``. The response may either
    expose a ``points`` attribute or be directly iterable; both shapes are
    handled.

    Args:
        query: Text passed to the embedding model.
        collection_name: Name of the Qdrant collection to search.
        k: Maximum number of hits requested from Qdrant.

    Returns:
        A list of dicts with the keys ``file``, ``folder``, ``text_path``
        (extraction-root prefix removed, ``.txt`` extension replaced by
        ``.pdf``), ``content``, ``percentage_similarity`` (score clipped to
        [-1, 1] and mapped linearly to 0-100) and ``cosine_score`` (the raw,
        unclipped score). Hits without a usable payload are skipped. An empty
        list is returned when the response has an unexpected shape or when
        any error occurs; errors are logged, never raised.
    """
    try:
        # Normalized query embedding, converted to a plain list for the client.
        query_embedding = ext.model.encode(query, normalize_embeddings=True).tolist()

        # Search in Qdrant.
        search_result = ext.qdrant_client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            limit=k,
            with_payload=True,
            with_vectors=False,
        )
        logging.info("Search result type: %s", type(search_result))
        logging.info("Search result structure: %s", search_result)

        # The client returns either an object with a 'points' attribute or
        # something directly iterable; dispatch on the shape actually received.
        if hasattr(search_result, 'points'):
            points = search_result.points
            logging.info("Number of points: %s", len(points))
            results = _collect_results(points, _unpack_point, 'point')
        elif hasattr(search_result, '__iter__'):
            results = _collect_results(search_result, _unpack_item, 'item')
        else:
            logging.error("Unexpected search result structure: %s", search_result)
            return []

        logging.info("Retrieved %s documents", len(results))
        return results
    except Exception as e:
        logging.error("Error during retrieval: %s", e, exc_info=True)
        return []