doggdad commited on
Commit
843111c
·
verified ·
1 Parent(s): f4dd0da

Upload 51 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. src/__pycache__/utils.cpython-313.pyc +0 -0
  3. src/app.py +288 -0
  4. src/crud/__pycache__/vector_store.cpython-313.pyc +0 -0
  5. src/crud/vector_store.py +140 -0
  6. src/data/images/car_1.jpg +3 -0
  7. src/data/images/car_2.jpg +3 -0
  8. src/data/images/cat_1.jpg +0 -0
  9. src/data/images/cat_2.jpg +0 -0
  10. src/data/images/cat_3.jpg +0 -0
  11. src/data/images/motorcycle_1.jpg +0 -0
  12. src/data/images/motorcycle_2.jpg +3 -0
  13. src/data/images/motorcycle_3.jpg +3 -0
  14. src/embedding_creation.ipynb +3 -0
  15. src/mm_rag.ipynb +0 -0
  16. src/preprocess/__pycache__/embedding.cpython-313.pyc +0 -0
  17. src/preprocess/__pycache__/preprocessing.cpython-313.pyc +0 -0
  18. src/preprocess/embedding.py +69 -0
  19. src/preprocess/preprocessing.py +65 -0
  20. src/preprocessing_video.ipynb +0 -0
  21. src/shared_data/videos/video1/7Hcg-rLYwdM.en.vtt +85 -0
  22. src/shared_data/videos/video1/Welcome back to Planet Earth.mp4 +3 -0
  23. src/shared_data/videos/video1/audio.mp3 +3 -0
  24. src/shared_data/videos/video1/extracted_frame/frame_0.jpg +0 -0
  25. src/shared_data/videos/video1/extracted_frame/frame_1.jpg +0 -0
  26. src/shared_data/videos/video1/extracted_frame/frame_10.jpg +0 -0
  27. src/shared_data/videos/video1/extracted_frame/frame_11.jpg +0 -0
  28. src/shared_data/videos/video1/extracted_frame/frame_12.jpg +0 -0
  29. src/shared_data/videos/video1/extracted_frame/frame_13.jpg +0 -0
  30. src/shared_data/videos/video1/extracted_frame/frame_14.jpg +0 -0
  31. src/shared_data/videos/video1/extracted_frame/frame_15.jpg +0 -0
  32. src/shared_data/videos/video1/extracted_frame/frame_16.jpg +0 -0
  33. src/shared_data/videos/video1/extracted_frame/frame_17.jpg +0 -0
  34. src/shared_data/videos/video1/extracted_frame/frame_18.jpg +0 -0
  35. src/shared_data/videos/video1/extracted_frame/frame_19.jpg +0 -0
  36. src/shared_data/videos/video1/extracted_frame/frame_2.jpg +0 -0
  37. src/shared_data/videos/video1/extracted_frame/frame_20.jpg +0 -0
  38. src/shared_data/videos/video1/extracted_frame/frame_21.jpg +0 -0
  39. src/shared_data/videos/video1/extracted_frame/frame_22.jpg +0 -0
  40. src/shared_data/videos/video1/extracted_frame/frame_23.jpg +0 -0
  41. src/shared_data/videos/video1/extracted_frame/frame_24.jpg +0 -0
  42. src/shared_data/videos/video1/extracted_frame/frame_25.jpg +0 -0
  43. src/shared_data/videos/video1/extracted_frame/frame_3.jpg +0 -0
  44. src/shared_data/videos/video1/extracted_frame/frame_4.jpg +0 -0
  45. src/shared_data/videos/video1/extracted_frame/frame_5.jpg +0 -0
  46. src/shared_data/videos/video1/extracted_frame/frame_6.jpg +0 -0
  47. src/shared_data/videos/video1/extracted_frame/frame_7.jpg +0 -0
  48. src/shared_data/videos/video1/extracted_frame/frame_8.jpg +0 -0
  49. src/shared_data/videos/video1/extracted_frame/frame_9.jpg +0 -0
  50. src/shared_data/videos/video1/generated_captions.vtt +71 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/data/images/car_1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ src/data/images/car_2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ src/data/images/motorcycle_2.jpg filter=lfs diff=lfs merge=lfs -text
39
+ src/data/images/motorcycle_3.jpg filter=lfs diff=lfs merge=lfs -text
40
+ src/embedding_creation.ipynb filter=lfs diff=lfs merge=lfs -text
41
+ src/shared_data/videos/video1/audio.mp3 filter=lfs diff=lfs merge=lfs -text
42
+ src/shared_data/videos/video1/Welcome[[:space:]]back[[:space:]]to[[:space:]]Planet[[:space:]]Earth.mp4 filter=lfs diff=lfs merge=lfs -text
src/__pycache__/utils.cpython-313.pyc ADDED
Binary file (13.8 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+ from os import path as osp
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+ from crud.vector_store import MultimodalLanceDB
7
+ from preprocess.embedding import BridgeTowerEmbeddings
8
+ from preprocess.preprocessing import extract_and_save_frames_and_metadata
9
+ #from utils import encode_image
10
+ from utils import (
11
+ download_video,
12
+ get_transcript_vtt,
13
+ download_youtube_subtitle,
14
+ get_video_id_from_url,
15
+ str2time,
16
+ maintain_aspect_ratio_resize,
17
+ getSubs,
18
+ encode_image,
19
+ )
20
+ from mistralai import Mistral
21
+ from langchain_core.runnables import (
22
+ RunnableParallel,
23
+ RunnablePassthrough,
24
+ RunnableLambda
25
+ )
26
+ from PIL import Image
27
+
28
+ import lancedb
29
+
30
# -------------------------------
# 1. Setup
# -------------------------------
load_dotenv()

# On Hugging Face Spaces only /tmp is writable, so the LanceDB files live
# there; when running locally they sit next to the app sources.
if os.getenv("SPACE_ID"):
    LANCEDB_HOST_FILE = "/tmp/.lancedb"
    os.makedirs("/tmp", exist_ok=True)
else:
    LANCEDB_HOST_FILE = "./shared_data/.lancedb"
TBL_NAME = "vectorstore"

db = lancedb.connect(LANCEDB_HOST_FILE)
embedder = BridgeTowerEmbeddings()
43
+
44
# -------------------------------
# 2. Preprocessing + Storage
# -------------------------------
def preprocess_and_store(youtube_url: str):
    """Download a YouTube video, extract frames + transcript metadata,
    embed each (transcript, frame) pair and store them in LanceDB.

    Parameters
    ----------
    youtube_url : str
        URL of the YouTube video to ingest.

    Returns
    -------
    str
        Human-readable status message for the UI.
    """
    # On HF Spaces only /tmp is writable; use a local folder otherwise.
    if os.getenv("SPACE_ID"):
        video_dir = "/tmp/videos/video1"
    else:
        video_dir = "./shared_data/videos/video1"

    extracted_frames_path = osp.join(video_dir, 'extracted_frame')

    # Create the output folders up-front so the download and frame-extraction
    # steps below always have a place to write to (previously the folders
    # were only created *after* the downloads).
    Path(video_dir).mkdir(parents=True, exist_ok=True)
    Path(extracted_frames_path).mkdir(parents=True, exist_ok=True)

    # Download the video and its subtitle track side by side.
    video_filepath = download_video(youtube_url, video_dir)
    video_transcript_filepath = download_youtube_subtitle(youtube_url, video_dir)

    # Extract one frame per transcript segment plus its metadata.
    metadatas = extract_and_save_frames_and_metadata(
        video_filepath,
        video_transcript_filepath,
        extracted_frames_path,
        video_dir,
    )

    video_trans = [vid['transcript'] for vid in metadatas]
    video_img_path = [vid['extracted_frame_path'] for vid in metadatas]

    # Augment each segment's transcript with its neighbours so a single frame
    # carries more textual context. `max(0, ...)` clamps the left edge, which
    # is exactly what the original two-branch conditional expressed.
    n = 7
    half = n // 2
    updated_video_trans = [
        ' '.join(video_trans[max(0, i - half):i + half])
        for i in range(len(video_trans))
    ]

    # Keep the stored metadata in sync with the augmented transcripts.
    for i, transcript in enumerate(updated_video_trans):
        metadatas[i]['transcript'] = transcript

    _ = MultimodalLanceDB.from_text_image_pairs(
        texts=updated_video_trans,
        image_paths=video_img_path,
        embedding=embedder,
        metadatas=metadatas,
        connection=db,
        table_name=TBL_NAME,
        mode="overwrite",
    )
    return f"✅ Video processed and stored: {youtube_url}"
98
+
99
# -------------------------------
# 3. Retrieval + Prompt Functions
# -------------------------------
# Vector store over the previously ingested (transcript, frame) pairs.
vectorstore = MultimodalLanceDB(
    uri=LANCEDB_HOST_FILE,
    embedding=embedder,
    table_name=TBL_NAME,
)

# Retrieve the 3 entries most similar to the user's query.
retriever_module = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
112
+
113
def prompt_processing(payload):
    """Build the LVLM prompt from the best retrieved document.

    Parameters
    ----------
    payload : dict
        ``{"retrieved_results": <documents, best match first>,
        "user_query": <the user's question>}``.

    Returns
    -------
    dict
        ``{"prompt": <text prompt for the vision-language model>,
        "frame_path": <path of the video frame to attach>}``.
    """
    # Renamed from `input` — that shadowed the builtin.  Callers invoke this
    # positionally through RunnableLambda, so the rename is safe.
    retrieved_results = payload["retrieved_results"]
    user_query = payload["user_query"]

    # Only the top-ranked document is used to answer the query.
    best_doc = retrieved_results[0]
    retrieved_metadata = best_doc.metadata
    transcript = retrieved_metadata["transcript"]
    frame_path = retrieved_metadata["extracted_frame_path"]

    prompt_template = (
        "The transcript associated with the image is '{transcript}'. "
        "{user_query}"
    )
    return {
        "prompt": prompt_template.format(transcript=transcript, user_query=user_query),
        "frame_path": frame_path,
    }
132
+
133
+
134
def lvlm_inference(payload):
    """Send the prompt plus the retrieved frame to Pixtral.

    Parameters
    ----------
    payload : dict
        ``{"prompt": <text prompt>, "frame_path": <path to the frame image>}``
        as produced by ``prompt_processing``.

    Returns
    -------
    tuple[str, str]
        The model's answer text and the frame path (so the UI can display it).

    Raises
    ------
    RuntimeError
        If the ``MISTRAL_API_KEY`` environment variable is not set.
    """
    lvlm_prompt = payload['prompt']
    frame_path = payload['frame_path']

    # Fail fast with a clear message instead of an opaque auth error from
    # the API client further down.
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set")

    # Initialize the Mistral client per call; the key may change between
    # requests (e.g. via the Space's secrets UI).
    client = Mistral(api_key=api_key)

    # Frame is sent inline as a base64 data URL.
    base64_image = encode_image(frame_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": lvlm_prompt
                },
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}"
                }
            ]
        }
    ]

    chat_response = client.chat.complete(
        model="pixtral-12b-2409",
        messages=messages
    )

    return chat_response.choices[0].message.content, frame_path
174
+
175
# LangChain Runnable chain: retrieve, build the prompt, then call the LVLM.
prompt_processing_module = RunnableLambda(prompt_processing)
lvlm_inference_module = RunnableLambda(lvlm_inference)

mm_rag_chain = (
    RunnableParallel(
        {
            "retrieved_results": retriever_module,
            "user_query": RunnablePassthrough(),
        }
    )
    | prompt_processing_module
    | lvlm_inference_module
)
184
+
185
# -------------------------------
# 4. Chat API for Gradio
# -------------------------------
# Tracks whether a video has been ingested; chat is a no-op until then.
video_loaded = False

def load_video(youtube_url):
    """Ingest the given YouTube URL and flip the loaded flag for the chat tab."""
    global video_loaded
    status = preprocess_and_store(youtube_url)
    video_loaded = True
    return status
195
+
196
def chat_interface(message, history):
    """Answer a question about the loaded video.

    Returns a 3-tuple for Gradio: cleared textbox value, updated chat
    history, and the retrieved frame image (or None).
    """
    # No video ingested yet — leave the UI untouched.
    if not video_loaded:
        return "", history, None

    answer, frame_path = mm_rag_chain.invoke(message)
    history.append((message, answer))

    # Best-effort load of the retrieved frame for display.
    try:
        frame_image = Image.open(frame_path)
    except Exception as exc:
        print(f"Error loading image: {exc}")
        frame_image = None

    return "", history, frame_image
211
+
212
# -------------------------------
# 5. Enhanced Gradio Interface
# -------------------------------
with gr.Blocks(title="Multimodal RAG Video Chat") as demo:
    gr.Markdown("# 🎬 Multimodal RAG Video Chat\nChat with YouTube clips using BridgeTower + LanceDB + Pixtral!")

    with gr.Tab("1. Load Video"):
        url_box = gr.Textbox(
            lines=1,
            label="YouTube URL",
            placeholder="Paste a YouTube link here...",
        )
        process_btn = gr.Button("Process Video", variant="primary")
        status_box = gr.Textbox(label="Status", interactive=False)
        process_btn.click(load_video, inputs=url_box, outputs=status_box)

    with gr.Tab("2. Chat with Video"):
        with gr.Row():
            with gr.Column(scale=2):
                chat_history = gr.Chatbot(
                    show_label=True,
                    height=500,
                    label="Chat about the video",
                )
            with gr.Column(scale=1):
                frame_display = gr.Image(
                    interactive=False,
                    show_label=True,
                    height=400,
                    label="Retrieved Frame",
                )

        with gr.Row():
            question_box = gr.Textbox(
                scale=4,
                lines=2,
                label="Your question",
                placeholder="Ask something about the video...",
            )
            submit_btn = gr.Button("Send", variant="primary", scale=1)

        # Both Enter in the textbox and the Send button call the same
        # handler; the textbox is cleared after sending.
        question_box.submit(
            chat_interface,
            inputs=[question_box, chat_history],
            outputs=[question_box, chat_history, frame_display],
        )
        submit_btn.click(
            chat_interface,
            inputs=[question_box, chat_history],
            outputs=[question_box, chat_history, frame_display],
        )

    # Usage instructions tab.
    with gr.Tab("📖 Instructions"):
        gr.Markdown("""
        ## How to use this Multimodal RAG system:

        1. **Load Video**:
           - Go to the "Load Video" tab
           - Paste a YouTube URL
           - Click "Process Video" and wait for processing to complete

        2. **Chat with Video**:
           - Go to the "Chat with Video" tab
           - Ask questions about the video content
           - The system will retrieve the most relevant frame and provide answers
           - The retrieved frame will be displayed on the right side

        ## Features:
        - 🎥 Processes YouTube videos automatically
        - 🧠 Uses BridgeTower for multimodal embeddings
        - 💾 Stores data in LanceDB vector database
        - 🤖 Powered by Pixtral vision-language model
        - 🖼️ Shows relevant video frames alongside responses
        """)
281
+
282
if __name__ == "__main__":
    print('App starting...')
    # HF Spaces supplies its own host/port; bind explicitly when run locally.
    running_on_spaces = os.getenv("SPACE_ID")
    if running_on_spaces:
        demo.launch()
    else:
        demo.launch(server_name="0.0.0.0", server_port=7860)
src/crud/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (6.15 kB). View file
 
src/crud/vector_store.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Iterable, List, Optional
2
+ from langchain_core.embeddings import Embeddings
3
+ import uuid
4
+ from langchain_community.vectorstores.lancedb import LanceDB
5
+
6
class MultimodalLanceDB(LanceDB):
    """`LanceDB` vector store to process multimodal data.

    Stores one row per (text, image-path) pair, where the vector is a joint
    embedding produced by ``embedding.embed_image_text_pairs``.

    Parameters:
    -----------
    connection: Any
        LanceDB connection to use. If not provided, a new connection will be created.
    embedding: Embeddings
        Embedding to use for the vectorstore.
    uri: str
        Location of the LanceDB database. Defaults to ``/tmp/lancedb``.
    vector_key: str
        Key to use for the vector in the database. Defaults to ``vector``.
    id_key: str
        Key to use for the id in the database. Defaults to ``id``.
    text_key: str
        Key to use for the text in the database. Defaults to ``text``.
    image_path_key: str
        Key to use for the path to image in the database. Defaults to ``image_path``.
    table_name: str
        Name of the table to use. Defaults to ``vectorstore``.
    api_key: str
        API key to use for LanceDB cloud database.
    region: str
        Region to use for LanceDB cloud database.
    mode: str
        Mode to use for adding data to the table: ``append`` adds to an
        existing table, ``overwrite`` recreates it. Defaults to ``append``.
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "append",
    ):
        # Pass everything by keyword: the original forwarded ten positional
        # arguments, which silently scrambles the configuration if the
        # parent's parameter order ever changes.
        super().__init__(
            connection=connection,
            embedding=embedding,
            uri=uri,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
            api_key=api_key,
            region=region,
            mode=mode,
        )
        self._image_path_key = image_path_key

    def add_text_image_pairs(
        self,
        texts: Iterable[str],
        image_paths: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn text-image pairs into embeddings and add them to the database.

        Parameters:
        ----------
        texts: Iterable[str]
            Iterable of strings to combine with corresponding images to add to the vectorstore.
        image_paths: Iterable[str]
            Iterable of path-to-images as strings to combine with corresponding texts to add to the vectorstore.
        metadatas: List[dict]
            Optional list of metadatas associated with the texts.
        ids: List[str]
            Optional list of ids to associate with the texts.

        Returns:
        --------
        List of ids of the added text-image pairs.
        """
        # Materialize first: the parameters are Iterables, and len() below
        # would fail on a generator.
        texts = list(texts)
        image_paths = list(image_paths)

        # the length of texts must be equal to the length of images
        assert len(texts) == len(image_paths), \
            "the len of transcripts should be equal to the len of images"

        print(f'The length of texts is {len(texts)}')

        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore

        docs = []
        for idx, text in enumerate(texts):
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embeddings[idx],
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._image_path_key: image_paths[idx],
                    "metadata": metadata,
                }
            )
        print(f'Adding {len(docs)} text-image pairs to the vectorstore...')

        # Per-call mode overrides the instance default. The original computed
        # this value and then ignored it, so mode="overwrite" appended anyway.
        mode = kwargs.get('mode', self.mode)
        if self._table_name in self._connection.table_names() and mode != "overwrite":
            # Append to the existing table. (The original branched on
            # self.api_key here but both branches did the same thing.)
            self._connection.open_table(self._table_name).add(docs)
        else:
            # Create the table, replacing any existing one when overwriting.
            self._connection.create_table(
                self._table_name,
                data=docs,
                mode="overwrite" if mode == "overwrite" else "create",
            )
        return ids

    @classmethod
    def from_text_image_pairs(
        cls,
        texts: List[str],
        image_paths: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        **kwargs: Any,
    ):
        """Alternate constructor: build a store and ingest the pairs in one step.

        Extra ``kwargs`` (e.g. ``mode="overwrite"``) are forwarded to
        :meth:`add_text_image_pairs`.
        """
        # Use cls rather than the hard-coded class name so subclasses work.
        instance = cls(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            image_path_key=image_path_key,
            table_name=table_name,
        )
        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
        return instance
src/data/images/car_1.jpg ADDED

Git LFS Details

  • SHA256: 7093f1c09568aece012f93050a14f2c272f4f35f16d88030ccf2ac6a88d19f28
  • Pointer size: 132 Bytes
  • Size of remote file: 1.52 MB
src/data/images/car_2.jpg ADDED

Git LFS Details

  • SHA256: b1baf7d58f14bcbb6c0ac143c93f1b3b8972a6f544a81790e54594142888f6cc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
src/data/images/cat_1.jpg ADDED
src/data/images/cat_2.jpg ADDED
src/data/images/cat_3.jpg ADDED
src/data/images/motorcycle_1.jpg ADDED
src/data/images/motorcycle_2.jpg ADDED

Git LFS Details

  • SHA256: cebf0ac3e43fa6246fc07de3d147aac3678083efa66dcb2304488d7a8754ce2e
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
src/data/images/motorcycle_3.jpg ADDED

Git LFS Details

  • SHA256: 6cedc5acbaa790d4a26fe35c4bde211ec94f6e30d67f77b6324e3b5f30338048
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
src/embedding_creation.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8d2b46e0c0b041904c02be7a0878a8b6b59e0ee98fff649bd8a7b38134c2dc6
3
+ size 47954568
src/mm_rag.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/preprocess/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (2.9 kB). View file
 
src/preprocess/__pycache__/preprocessing.cpython-313.pyc ADDED
Binary file (2.19 kB). View file
 
src/preprocess/embedding.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import encode_image
2
+ from utils import bt_embeddings
3
+ from tqdm import tqdm
4
+ from typing import List
5
+ from langchain_core.embeddings import Embeddings
6
+ from langchain_core.pydantic_v1 import BaseModel
7
+
8
class BridgeTowerEmbeddings(BaseModel, Embeddings):
    """BridgeTower embedding model."""

    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
        """Embed a list of image-text pairs using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.
        images: List[str]
            The list of path-to-images to embed.
        batch_size: int
            The batch size to process, default to 2.

        Returns:
        --------
        List of embeddings, one for each image-text pair.
        """
        # the length of texts must be equal to the length of images
        assert len(texts) == len(images), "the len of captions should be equal to the len of images"

        print(f"Embedding {len(texts)} image-text pairs...")

        pairs = tqdm(zip(images, texts), total=len(images), desc="Processing pairs")
        return [bt_embeddings(caption, encode_image(img_path)) for img_path, caption in pairs]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.

        Returns:
        --------
        List of embeddings, one for each text.
        """
        # Text-only embedding: pair each text with an empty image payload.
        return [bt_embeddings(text, "") for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using BridgeTower.

        Parameters:
        -----------
        text: str
            The text to embed.

        Returns:
        --------
        Embedding for the text.
        """
        return self.embed_documents([text])[0]
src/preprocess/preprocessing.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path as osp
2
+ import json
3
+
4
+ import cv2
5
+ import webvtt
6
+
7
+ from utils import maintain_aspect_ratio_resize, str2time
8
+
9
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    """Extract one video frame per transcript segment and save frames + metadata.

    For every caption in the WebVTT transcript, grabs the frame at the
    midpoint of the caption's time range, resizes it to height 350 (aspect
    ratio preserved), writes it as ``frame_<idx>.jpg`` under
    *path_to_save_extracted_frames*, and records a metadata entry. All
    metadata is also dumped to ``metadatas.json`` under
    *path_to_save_metadatas*.

    Parameters:
    -----------
    path_to_video: str
        Path to the video file readable by OpenCV.
    path_to_transcript: str
        Path to the WebVTT transcript of the video.
    path_to_save_extracted_frames: str
        Existing directory where the extracted frames are written.
    path_to_save_metadatas: str
        Existing directory where ``metadatas.json`` is written.

    Returns:
    --------
    List of metadata dicts, one per successfully extracted frame.
    """
    # metadatas will store the metadata of all extracted frames
    metadatas = []

    video = cv2.VideoCapture(path_to_video)
    try:
        # load transcript using webvtt
        trans = webvtt.read(path_to_transcript)

        # for each video segment specified in the transcript file
        for idx, transcript in enumerate(trans):
            # segment boundaries in milliseconds
            start_time_ms = str2time(transcript.start)
            end_time_ms = str2time(transcript.end)
            # sample the frame exactly in the middle of the segment
            mid_time_ms = (end_time_ms + start_time_ms) / 2
            # get the transcript, remove the next-line symbol
            text = transcript.text.replace("\n", ' ')

            video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
            success, frame = video.read()
            if not success:
                print(f"ERROR! Cannot extract frame: idx = {idx}")
                continue

            # resize while keeping aspect ratio, then save as JPEG
            image = maintain_aspect_ratio_resize(frame, height=350)
            img_fpath = osp.join(path_to_save_extracted_frames, f'frame_{idx}.jpg')
            cv2.imwrite(img_fpath, image)

            metadatas.append({
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'mid_time_ms': mid_time_ms,
            })
    finally:
        # Release the capture handle even if transcript parsing or a frame
        # write fails — the original leaked it.
        video.release()

    # save metadata of all extracted frames
    fn = osp.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas
src/preprocessing_video.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/shared_data/videos/video1/7Hcg-rLYwdM.en.vtt ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WEBVTT
2
+ Kind: captions
3
+ Language: en
4
+
5
+ 00:00:03.620 --> 00:00:06.879
6
+ As I look back on the the mission that we've had here
7
+
8
+ 00:00:06.879 --> 00:00:10.559
9
+ on the International Space Station,
10
+ I'm proud to have been a part of much of
11
+
12
+ 00:00:10.559 --> 00:00:13.679
13
+ the science activities that happened over the last
14
+
15
+ 00:00:13.680 --> 00:00:14.420
16
+ two months.
17
+
18
+ 00:00:14.420 --> 00:00:15.780
19
+ The view is always amazing
20
+
21
+ 00:00:15.780 --> 00:00:17.520
22
+ I didn't think I would do another
23
+
24
+ 00:00:17.520 --> 00:00:20.720
25
+ spacewalk and to now have the chance to have done
26
+
27
+ 00:00:20.720 --> 00:00:23.840
28
+ four more was just icing on the cake for a
29
+
30
+ 00:00:23.840 --> 00:00:24.900
31
+ a wonderful mission.
32
+
33
+ 00:00:24.900 --> 00:00:26.900
34
+ Does the 10th one feel like the first one?
35
+
36
+ 00:00:26.960 --> 00:00:30.160
37
+ No, a little more comfortable on the tenth one.
38
+
39
+ 00:00:30.160 --> 00:00:33.300
40
+ It's hard to put into words
41
+
42
+ 00:00:33.420 --> 00:00:38.480
43
+ just what it was like to be a part of
44
+ this expedition, expedition 63. It'll be
45
+
46
+ 00:00:38.480 --> 00:00:40.399
47
+ kind of a memory that will last a
48
+
49
+ 00:00:40.400 --> 00:00:43.260
50
+ lifetime for me. It's been a true honor.
51
+
52
+ 00:00:43.260 --> 00:00:44.800
53
+ Dragon SpaceX
54
+
55
+ 00:00:44.800 --> 00:00:48.160
56
+ undock sequence commanded. Thrusters
57
+ looking good.
58
+
59
+ 00:00:48.160 --> 00:00:50.440
60
+ The hardest part was getting us launched,
61
+
62
+ 00:00:50.440 --> 00:00:53.080
63
+ but the most important part is bringing us home.
64
+
65
+ 00:00:56.040 --> 00:00:59.180
66
+ Rise and shine Daddy. We love you.
67
+
68
+ 00:00:59.540 --> 00:01:03.160
69
+ Hurry home so we can go get my dog.
70
+
71
+ 00:01:06.040 --> 00:01:07.920
72
+ Splashdown!
73
+
74
+ 00:01:07.920 --> 00:01:11.200
75
+ Welcome back to planet Earth and thanks for flying SpaceX.
76
+
77
+ 00:01:11.200 --> 00:01:12.940
78
+ It's truly our honor and privilege.
79
+
80
+ 00:01:12.940 --> 00:01:14.800
81
+ Space Dads are back on Earth
82
+
83
+ 00:01:14.800 --> 00:01:19.140
84
+ after a 19-hour return journey from space.
85
+
src/shared_data/videos/video1/Welcome back to Planet Earth.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90d6a55ba3a7c2c15ed78977df2721f67c0f907957d50688d8b695cf662c500
3
+ size 4578531
src/shared_data/videos/video1/audio.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d757fa88232111a0f1ed24ae0e23a4143479391ff7a71e7255bdc52283496d6
3
+ size 1434687
src/shared_data/videos/video1/extracted_frame/frame_0.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_1.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_10.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_11.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_12.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_13.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_14.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_15.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_16.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_17.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_18.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_19.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_2.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_20.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_21.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_22.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_23.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_24.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_25.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_3.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_4.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_5.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_6.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_7.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_8.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_9.jpg ADDED
src/shared_data/videos/video1/generated_captions.vtt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:08.780
4
+ As I look back on the mission that we've had here on the International Space Station,
5
+
6
+ 00:08.780 --> 00:13.300
7
+ I'm proud to have been a part of much of the science activities that happened over the
8
+
9
+ 00:13.300 --> 00:14.300
10
+ last two months.
11
+
12
+ 00:14.300 --> 00:16.180
13
+ The view is always amazing though.
14
+
15
+ 00:16.180 --> 00:21.260
16
+ I didn't think I would do another spacewalk and to now have the chance to have done four
17
+
18
+ 00:21.260 --> 00:24.980
19
+ more was just icing on the cake for a wonderful mission.
20
+
21
+ 00:25.480 --> 00:26.980
22
+ The tenth one, do you like the first one?
23
+
24
+ 00:26.980 --> 00:27.980
25
+ No.
26
+
27
+ 00:27.980 --> 00:30.280
28
+ A little more comfortable on your tenth one.
29
+
30
+ 00:30.280 --> 00:36.980
31
+ It's hard to put into words just what it was like to be a part of this expedition, the
32
+
33
+ 00:36.980 --> 00:37.980
34
+ Expedition 63.
35
+
36
+ 00:37.980 --> 00:42.280
37
+ It'll be kind of a memory that will last a lifetime for me.
38
+
39
+ 00:42.280 --> 00:43.780
40
+ It's been a true honor.
41
+
42
+ 00:43.780 --> 00:46.780
43
+ Dragon SpaceX, Undock sequence commanded.
44
+
45
+ 00:46.780 --> 00:48.340
46
+ The roster's looking good.
47
+
48
+ 00:48.340 --> 00:52.900
49
+ The hardest part was getting us launched, but the most important part is bringing us home.
50
+
51
+ 00:55.980 --> 00:58.980
52
+ I've been telling Daddy we love you.
53
+
54
+ 00:58.980 --> 01:02.980
55
+ Hurry home so we can go get my dog.
56
+
57
+ 01:05.980 --> 01:07.980
58
+ Flashdown.
59
+
60
+ 01:07.980 --> 01:10.980
61
+ Welcome back to Planet Earth and thanks for flying SpaceX.
62
+
63
+ 01:10.980 --> 01:12.980
64
+ It was truly our honor and privilege.
65
+
66
+ 01:12.980 --> 01:17.980
67
+ Space dads are back on Earth after a 19 hour return journey from space.
68
+
69
+ 01:24.980 --> 01:27.980
70
+ You
71
+