File size: 6,003 Bytes
08b74f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import logging
import numpy as np

from utils.extensions import ext

# Prefix stripped from payload ``text_path`` values before they are returned.
_EXTRACTED_TEXT_PREFIX = '/content/drive/MyDrive/extracted_texts/'


def _build_result(score, payload):
    """Convert one ``(score, payload)`` pair into a result dict.

    Returns ``None`` (after logging a warning) when the payload is missing or
    has no ``text_path`` key, so the caller can skip the hit.
    """
    # A hit without payload is skipped explicitly instead of relying on the
    # TypeError that ``'text_path' not in None`` would raise.
    if payload is None or 'text_path' not in payload:
        logging.warning("No text_path in payload: %s", payload)
        return None

    cleaned_path = (
        payload['text_path']
        .replace(_EXTRACTED_TEXT_PREFIX, '')
        .replace('.txt', '.pdf')
    )

    # Clamp the score to [-1, 1] and map that range linearly onto [0, 100].
    score_clipped = np.clip(score, -1.0, 1.0)
    percentage_similarity = ((score_clipped + 1) / 2) * 100

    return {
        'file': payload.get('file', 'Unknown'),
        'folder': payload.get('folder', 'Unknown'),
        'text_path': cleaned_path,
        'content': payload.get('content', 'No content available'),
        'percentage_similarity': percentage_similarity,
        'cosine_score': score,  # raw, unclipped score
    }


def _unpack_point(point):
    """Return ``(score, payload)`` for an element of ``search_result.points``.

    Returns ``None`` (after logging a warning) for an unrecognised shape.
    """
    if hasattr(point, 'score'):
        return point.score, point.payload
    if hasattr(point, 'vector') and hasattr(point, 'payload'):
        # The point carries no score; a similarity of 1.0 is assumed.
        # NOTE(review): this fabricates a perfect match rather than computing
        # a real similarity -- confirm this fallback is still wanted.
        return 1.0, point.payload
    logging.warning("Unexpected point structure: %s", point)
    return None


def _unpack_item(item):
    """Return ``(score, payload)`` for an element of an iterable search result.

    Accepts ``(score, payload)`` and ``(id, score, payload)`` tuples as well as
    objects exposing ``score`` and ``payload`` attributes. Returns ``None``
    (after logging a warning) for anything else.
    """
    if isinstance(item, tuple):
        if len(item) == 2:
            return item[0], item[1]
        if len(item) == 3:
            return item[1], item[2]
        logging.warning("Unexpected tuple length: %s", len(item))
        return None
    if hasattr(item, 'score') and hasattr(item, 'payload'):
        return item.score, item.payload
    logging.warning("Unexpected item type: %s", type(item))
    return None


def _collect_results(hits, unpack, label):
    """Build result dicts for every usable hit in ``hits``.

    ``unpack`` maps a hit to ``(score, payload)`` or ``None``; ``label`` is
    only used in the error message logged when a single hit fails.
    """
    results = []
    for hit in hits:
        # Best effort: one malformed hit must not discard the other results.
        try:
            unpacked = unpack(hit)
            if unpacked is None:
                continue
            entry = _build_result(*unpacked)
            if entry is not None:
                results.append(entry)
        except Exception as e:
            logging.error("Error processing %s: %s", label, e)
    return results


def retrieve(query, collection_name="law_documents", k=5):
    """Search a Qdrant collection for the entries most similar to ``query``.

    The query is encoded with ``ext.model.encode(..., normalize_embeddings=True)``
    and passed to ``ext.qdrant_client.query_points``.

    Args:
        query: Input handed to ``ext.model.encode``.
        collection_name: Name of the collection to search.
        k: Maximum number of hits requested (passed as ``limit``).

    Returns:
        A list of dicts with the keys ``file``, ``folder``, ``text_path``,
        ``content``, ``percentage_similarity`` (score clipped to [-1, 1] and
        mapped to 0-100) and ``cosine_score`` (the raw score). Hits without a
        usable payload are skipped. An empty list is returned when the search
        result has an unexpected structure or any error occurs; errors are
        logged, never raised.
    """
    try:
        # Normalize query embedding
        query_embedding = ext.model.encode(query, normalize_embeddings=True).tolist()

        # Search in Qdrant
        search_result = ext.qdrant_client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            limit=k,
            with_payload=True,
            with_vectors=False,
        )

        logging.info("Search result type: %s", type(search_result))
        logging.info("Search result structure: %s", search_result)

        # The client may return an object exposing a ``points`` attribute or
        # something directly iterable; handle whichever shape we received.
        if hasattr(search_result, 'points'):
            points = search_result.points
            logging.info("Number of points: %s", len(points))
            results = _collect_results(points, _unpack_point, "point")
        elif hasattr(search_result, '__iter__'):
            # Covers plain lists as well as any other iterable.
            results = _collect_results(search_result, _unpack_item, "item")
        else:
            logging.error("Unexpected search result structure: %s", search_result)
            return []

        logging.info("Retrieved %s documents", len(results))
        return results

    except Exception as e:
        logging.error(f"Error during retrieval: {e}", exc_info=True)
        return []