import logging
import numpy as np
from utils.extensions import ext
def retrieve(query, collection_name="law_documents", k=5):
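    """Embed the query and retrieve the top-k matching documents from Qdrant.

    Returns a list of dicts with file metadata, the cleaned text path, the
    document content, the raw cosine score, and a percentage similarity.
    """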
    try:
        # Normalize query embedding
        query_embedding = ext.model.encode(query, normalize_embeddings=True).tolist()

        # Search in Qdrant
        search_result = ext.qdrant_client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            limit=k,
            with_payload=True,
            with_vectors=False,
        )

        logging.info(f"Search result type: {type(search_result)}")
        logging.info(f"Search result structure: {search_result}")

        # Prepare results
        results = []

        # Qdrant returns either an object with a 'points' attribute or a list
        # directly; handle both structures.
        if hasattr(search_result, 'points'):
            # Result exposes a 'points' attribute
            points = search_result.points
            logging.info(f"Number of points: {len(points)}")
            for point in points:
                try:
                    # Try to extract the score and payload from the point
                    if hasattr(point, 'score'):
                        score = point.score
                        payload = point.payload
                    elif hasattr(point, 'vector') and hasattr(point, 'payload'):
                        # Sometimes Qdrant returns a point with a vector instead of
                        # a score; in that case the similarity would have to be
                        # computed separately.
                        score = 1.0  # Assume a high similarity score
                        payload = point.payload
                    else:
                        logging.warning(f"Unexpected point structure: {point}")
                        continue

                    # Handle text_path
                    if 'text_path' not in payload:
                        logging.warning(f"No text_path in payload: {payload}")
                        continue
                    cleaned_path = payload['text_path'].replace(
                        '/content/drive/MyDrive/extracted_texts/', ''
                    ).replace('.txt', '.pdf')

                    # Clip the cosine score and convert it to a percentage
                    score_clipped = np.clip(score, -1.0, 1.0)
                    percentage_similarity = ((score_clipped + 1) / 2) * 100

                    results.append({
                        'file': payload.get('file', 'Unknown'),
                        'folder': payload.get('folder', 'Unknown'),
                        'text_path': cleaned_path,
                        'content': payload.get('content', 'No content available'),
                        'percentage_similarity': percentage_similarity,
                        'cosine_score': score
                    })
                except Exception as e:
                    logging.error(f"Error processing point: {e}")
                    continue
        elif isinstance(search_result, list) or hasattr(search_result, '__iter__'):
            # Result is a list or another iterable
            for item in search_result:
                try:
                    # Handle the item according to its structure
                    if isinstance(item, tuple):
                        # Tuple: (score, payload) or (id, score, payload)
                        if len(item) == 2:
                            score, payload = item
                        elif len(item) == 3:
                            _, score, payload = item
                        else:
                            logging.warning(f"Unexpected tuple length: {len(item)}")
                            continue
                    elif hasattr(item, 'score') and hasattr(item, 'payload'):
                        # Object with score and payload attributes
                        score = item.score
                        payload = item.payload
                    else:
                        logging.warning(f"Unexpected item type: {type(item)}")
                        continue

                    # Handle text_path
                    if 'text_path' not in payload:
                        logging.warning(f"No text_path in payload: {payload}")
                        continue
                    cleaned_path = payload['text_path'].replace(
                        '/content/drive/MyDrive/extracted_texts/', ''
                    ).replace('.txt', '.pdf')

                    # Clip the cosine score and convert it to a percentage
                    score_clipped = np.clip(score, -1.0, 1.0)
                    percentage_similarity = ((score_clipped + 1) / 2) * 100

                    results.append({
                        'file': payload.get('file', 'Unknown'),
                        'folder': payload.get('folder', 'Unknown'),
                        'text_path': cleaned_path,
                        'content': payload.get('content', 'No content available'),
                        'percentage_similarity': percentage_similarity,
                        'cosine_score': score
                    })
                except Exception as e:
                    logging.error(f"Error processing item: {e}")
                    continue
        else:
            logging.error(f"Unexpected search result structure: {search_result}")
            return []

        logging.info(f"Retrieved {len(results)} documents")
        return results
    except Exception as e:
        logging.error(f"Error during retrieval: {e}", exc_info=True)
        return []
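

# Example usage (illustrative sketch; assumes ext.model and ext.qdrant_client are
# already initialized by utils.extensions and the "law_documents" collection exists):
#
#   docs = retrieve("personal income tax filing deadline", k=3)
#   for doc in docs:
#       print(doc['file'], f"{doc['percentage_similarity']:.1f}%")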