import os
import argparse
import pickle
from typing import List

from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings as ChromaSettings
import faiss

# Whitespace-split "tokens" only approximate real model tokens; counts from the
# model's own tokenizer would differ, but the approximation is close enough for
# chunk sizing.
DEFAULT_CHUNK_TOKENS = 200
DEFAULT_OVERLAP_TOKENS = 50

def approx_tokenize(text: str) -> List[str]:
    return text.split()

def detokenize(tokens: List[str]) -> str:
    return " ".join(tokens)

def chunk_text(text: str, chunk_tokens: int, overlap_tokens: int) -> List[str]:
    """Split text into overlapping chunks of roughly chunk_tokens words."""
    if overlap_tokens >= chunk_tokens:
        # An overlap at least as large as the chunk size would prevent the
        # window from ever advancing, looping forever.
        raise ValueError("overlap_tokens must be smaller than chunk_tokens")
    tokens = approx_tokenize(text)
    chunks = []
    i = 0
    n = len(tokens)
    while i < n:
        j = min(i + chunk_tokens, n)
        chunk = detokenize(tokens[i:j])
        if chunk.strip():
            chunks.append(chunk)
        if j == n:
            break
        # Step back by the overlap so consecutive chunks share context.
        i = j - overlap_tokens
    return chunks
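
# Worked example with the defaults above: a 450-word note yields chunks covering
# words [0:200], [150:350], and [300:450]; each adjacent pair shares 50 words.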

def index_note(
    text: str,
    note_id: str = "temp_note",
    persist_dir: str = "./data/vector_store",
    db_type: str = "chroma",
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    collection: str = "notes"
) -> str:
    """Embed a note's chunks and persist them to a Chroma or FAISS vector store."""
    os.makedirs(persist_dir, exist_ok=True)
    model = SentenceTransformer(model_name)
    chunks = chunk_text(text, DEFAULT_CHUNK_TOKENS, DEFAULT_OVERLAP_TOKENS)
    if not chunks:
        # Nothing to embed for empty or whitespace-only text; the FAISS branch
        # below would otherwise fail on an empty embedding array.
        return note_id
    chunk_ids = [f"{note_id}::chunk_{i}" for i in range(len(chunks))]
    metadatas = [{"note_id": note_id, "chunk_index": i} for i in range(len(chunks))]
    # Unit-length embeddings: with them, L2 and cosine rankings coincide, and
    # faiss.IndexFlatIP computes cosine similarity directly.
    vectors = model.encode(chunks, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)

    if db_type == "chroma":
        # Persistent on-disk client; reset stays disabled so the store cannot
        # be wiped accidentally, and telemetry is off.
        client = chromadb.PersistentClient(
            path=persist_dir,
            settings=ChromaSettings(
                allow_reset=False,
                anonymized_telemetry=False
            )
        )
        coll = client.get_or_create_collection(collection)
        coll.upsert(
            ids=chunk_ids,
            embeddings=vectors.tolist(),
            documents=chunks,
            metadatas=metadatas,
        )
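        # Retrieval side (illustrative sketch, not part of this function): the
        # stored chunks can later be searched against the same collection, e.g.
        #   coll.query(query_embeddings=model.encode([q], normalize_embeddings=True).tolist(), n_results=5)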
    elif db_type == "faiss":
        # Inner-product index over unit vectors, i.e. cosine similarity.
        # Note: this writes a fresh index for the current chunks; it does not
        # merge with an index already on disk.
        d = vectors.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(vectors)
        # FAISS stores only vectors, so ids, text, and metadata go in a
        # pickled side file keyed by position.
        vectors_meta = [
            {"id": chunk_ids[k], "text": chunks[k], "meta": metadatas[k]}
            for k in range(len(chunks))
        ]
        faiss_path = os.path.join(persist_dir, "index.faiss")
        meta_path = os.path.join(persist_dir, "meta.pkl")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "wb") as f:
            pickle.dump(vectors_meta, f)
    else:
        raise ValueError(f"unknown db_type: {db_type!r} (expected 'chroma' or 'faiss')")

    return note_id
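

if __name__ == "__main__":
    # Minimal CLI sketch: index a UTF-8 text file from the command line.
    # The flag names here are illustrative, not fixed by the module above.
    parser = argparse.ArgumentParser(description="Index a note into a vector store.")
    parser.add_argument("path", help="path to a UTF-8 text file to index")
    parser.add_argument("--note-id", default="temp_note")
    parser.add_argument("--persist-dir", default="./data/vector_store")
    parser.add_argument("--db-type", choices=["chroma", "faiss"], default="chroma")
    args = parser.parse_args()
    with open(args.path, "r", encoding="utf-8") as fh:
        note_text = fh.read()
    index_note(note_text, note_id=args.note_id, persist_dir=args.persist_dir, db_type=args.db_type)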