doggdad commited on
Commit
843111c
·
verified ·
1 Parent(s): f4dd0da

Upload 51 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. src/__pycache__/utils.cpython-313.pyc +0 -0
  3. src/app.py +288 -0
  4. src/crud/__pycache__/vector_store.cpython-313.pyc +0 -0
  5. src/crud/vector_store.py +140 -0
  6. src/data/images/car_1.jpg +3 -0
  7. src/data/images/car_2.jpg +3 -0
  8. src/data/images/cat_1.jpg +0 -0
  9. src/data/images/cat_2.jpg +0 -0
  10. src/data/images/cat_3.jpg +0 -0
  11. src/data/images/motorcycle_1.jpg +0 -0
  12. src/data/images/motorcycle_2.jpg +3 -0
  13. src/data/images/motorcycle_3.jpg +3 -0
  14. src/embedding_creation.ipynb +3 -0
  15. src/mm_rag.ipynb +0 -0
  16. src/preprocess/__pycache__/embedding.cpython-313.pyc +0 -0
  17. src/preprocess/__pycache__/preprocessing.cpython-313.pyc +0 -0
  18. src/preprocess/embedding.py +69 -0
  19. src/preprocess/preprocessing.py +65 -0
  20. src/preprocessing_video.ipynb +0 -0
  21. src/shared_data/videos/video1/7Hcg-rLYwdM.en.vtt +85 -0
  22. src/shared_data/videos/video1/Welcome back to Planet Earth.mp4 +3 -0
  23. src/shared_data/videos/video1/audio.mp3 +3 -0
  24. src/shared_data/videos/video1/extracted_frame/frame_0.jpg +0 -0
  25. src/shared_data/videos/video1/extracted_frame/frame_1.jpg +0 -0
  26. src/shared_data/videos/video1/extracted_frame/frame_10.jpg +0 -0
  27. src/shared_data/videos/video1/extracted_frame/frame_11.jpg +0 -0
  28. src/shared_data/videos/video1/extracted_frame/frame_12.jpg +0 -0
  29. src/shared_data/videos/video1/extracted_frame/frame_13.jpg +0 -0
  30. src/shared_data/videos/video1/extracted_frame/frame_14.jpg +0 -0
  31. src/shared_data/videos/video1/extracted_frame/frame_15.jpg +0 -0
  32. src/shared_data/videos/video1/extracted_frame/frame_16.jpg +0 -0
  33. src/shared_data/videos/video1/extracted_frame/frame_17.jpg +0 -0
  34. src/shared_data/videos/video1/extracted_frame/frame_18.jpg +0 -0
  35. src/shared_data/videos/video1/extracted_frame/frame_19.jpg +0 -0
  36. src/shared_data/videos/video1/extracted_frame/frame_2.jpg +0 -0
  37. src/shared_data/videos/video1/extracted_frame/frame_20.jpg +0 -0
  38. src/shared_data/videos/video1/extracted_frame/frame_21.jpg +0 -0
  39. src/shared_data/videos/video1/extracted_frame/frame_22.jpg +0 -0
  40. src/shared_data/videos/video1/extracted_frame/frame_23.jpg +0 -0
  41. src/shared_data/videos/video1/extracted_frame/frame_24.jpg +0 -0
  42. src/shared_data/videos/video1/extracted_frame/frame_25.jpg +0 -0
  43. src/shared_data/videos/video1/extracted_frame/frame_3.jpg +0 -0
  44. src/shared_data/videos/video1/extracted_frame/frame_4.jpg +0 -0
  45. src/shared_data/videos/video1/extracted_frame/frame_5.jpg +0 -0
  46. src/shared_data/videos/video1/extracted_frame/frame_6.jpg +0 -0
  47. src/shared_data/videos/video1/extracted_frame/frame_7.jpg +0 -0
  48. src/shared_data/videos/video1/extracted_frame/frame_8.jpg +0 -0
  49. src/shared_data/videos/video1/extracted_frame/frame_9.jpg +0 -0
  50. src/shared_data/videos/video1/generated_captions.vtt +71 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/data/images/car_1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ src/data/images/car_2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ src/data/images/motorcycle_2.jpg filter=lfs diff=lfs merge=lfs -text
39
+ src/data/images/motorcycle_3.jpg filter=lfs diff=lfs merge=lfs -text
40
+ src/embedding_creation.ipynb filter=lfs diff=lfs merge=lfs -text
41
+ src/shared_data/videos/video1/audio.mp3 filter=lfs diff=lfs merge=lfs -text
42
+ src/shared_data/videos/video1/Welcome[[:space:]]back[[:space:]]to[[:space:]]Planet[[:space:]]Earth.mp4 filter=lfs diff=lfs merge=lfs -text
src/__pycache__/utils.cpython-313.pyc ADDED
Binary file (13.8 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+ from os import path as osp
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+ from crud.vector_store import MultimodalLanceDB
7
+ from preprocess.embedding import BridgeTowerEmbeddings
8
+ from preprocess.preprocessing import extract_and_save_frames_and_metadata
9
+ #from utils import encode_image
10
+ from utils import (
11
+ download_video,
12
+ get_transcript_vtt,
13
+ download_youtube_subtitle,
14
+ get_video_id_from_url,
15
+ str2time,
16
+ maintain_aspect_ratio_resize,
17
+ getSubs,
18
+ encode_image,
19
+ )
20
+ from mistralai import Mistral
21
+ from langchain_core.runnables import (
22
+ RunnableParallel,
23
+ RunnablePassthrough,
24
+ RunnableLambda
25
+ )
26
+ from PIL import Image
27
+
28
+ import lancedb
29
+
30
# -------------------------------
# 1. Setup
# -------------------------------
load_dotenv()

# On Hugging Face Spaces only /tmp is writable, so the LanceDB files live
# there; when running locally they sit next to the app sources.
if os.getenv("SPACE_ID"):
    LANCEDB_HOST_FILE = "/tmp/.lancedb"
    os.makedirs("/tmp", exist_ok=True)
else:
    LANCEDB_HOST_FILE = "./shared_data/.lancedb"
TBL_NAME = "vectorstore"

db = lancedb.connect(LANCEDB_HOST_FILE)
embedder = BridgeTowerEmbeddings()
43
+
44
# -------------------------------
# 2. Preprocessing + Storage
# -------------------------------
def preprocess_and_store(youtube_url: str):
    """Download a YouTube video, extract frames + transcript metadata,
    embed each (transcript, frame) pair and store them in LanceDB.

    Parameters
    ----------
    youtube_url : str
        URL of the YouTube video to ingest.

    Returns
    -------
    str
        Human-readable status message for the UI.
    """
    # On HF Spaces only /tmp is writable; use a local folder otherwise.
    if os.getenv("SPACE_ID"):
        video_dir = "/tmp/videos/video1"
    else:
        video_dir = "./shared_data/videos/video1"

    extracted_frames_path = osp.join(video_dir, 'extracted_frame')

    # Create the output folders up-front so the download and frame-extraction
    # steps below always have a place to write to (previously the folders
    # were only created *after* the downloads).
    Path(video_dir).mkdir(parents=True, exist_ok=True)
    Path(extracted_frames_path).mkdir(parents=True, exist_ok=True)

    # Download the video and its subtitle track side by side.
    video_filepath = download_video(youtube_url, video_dir)
    video_transcript_filepath = download_youtube_subtitle(youtube_url, video_dir)

    # Extract one frame per transcript segment plus its metadata.
    metadatas = extract_and_save_frames_and_metadata(
        video_filepath,
        video_transcript_filepath,
        extracted_frames_path,
        video_dir,
    )

    video_trans = [vid['transcript'] for vid in metadatas]
    video_img_path = [vid['extracted_frame_path'] for vid in metadatas]

    # Augment each segment's transcript with its neighbours so a single frame
    # carries more textual context. `max(0, ...)` clamps the left edge, which
    # is exactly what the original two-branch conditional expressed.
    n = 7
    half = n // 2
    updated_video_trans = [
        ' '.join(video_trans[max(0, i - half):i + half])
        for i in range(len(video_trans))
    ]

    # Keep the stored metadata in sync with the augmented transcripts.
    for i, transcript in enumerate(updated_video_trans):
        metadatas[i]['transcript'] = transcript

    _ = MultimodalLanceDB.from_text_image_pairs(
        texts=updated_video_trans,
        image_paths=video_img_path,
        embedding=embedder,
        metadatas=metadatas,
        connection=db,
        table_name=TBL_NAME,
        mode="overwrite",
    )
    return f"✅ Video processed and stored: {youtube_url}"
98
+
99
# -------------------------------
# 3. Retrieval + Prompt Functions
# -------------------------------
# Vector store over the previously ingested (transcript, frame) pairs.
vectorstore = MultimodalLanceDB(
    uri=LANCEDB_HOST_FILE,
    embedding=embedder,
    table_name=TBL_NAME,
)

# Retrieve the 3 entries most similar to the user's query.
retriever_module = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
112
+
113
def prompt_processing(payload):
    """Build the LVLM prompt from the best retrieved document.

    Parameters
    ----------
    payload : dict
        ``{"retrieved_results": <documents, best match first>,
        "user_query": <the user's question>}``.

    Returns
    -------
    dict
        ``{"prompt": <text prompt for the vision-language model>,
        "frame_path": <path of the video frame to attach>}``.
    """
    # Renamed from `input` — that shadowed the builtin.  Callers invoke this
    # positionally through RunnableLambda, so the rename is safe.
    retrieved_results = payload["retrieved_results"]
    user_query = payload["user_query"]

    # Only the top-ranked document is used to answer the query.
    best_doc = retrieved_results[0]
    retrieved_metadata = best_doc.metadata
    transcript = retrieved_metadata["transcript"]
    frame_path = retrieved_metadata["extracted_frame_path"]

    prompt_template = (
        "The transcript associated with the image is '{transcript}'. "
        "{user_query}"
    )
    return {
        "prompt": prompt_template.format(transcript=transcript, user_query=user_query),
        "frame_path": frame_path,
    }
132
+
133
+
134
def lvlm_inference(payload):
    """Send the prompt plus the retrieved frame to Pixtral.

    Parameters
    ----------
    payload : dict
        ``{"prompt": <text prompt>, "frame_path": <path to the frame image>}``
        as produced by ``prompt_processing``.

    Returns
    -------
    tuple[str, str]
        The model's answer text and the frame path (so the UI can display it).

    Raises
    ------
    RuntimeError
        If the ``MISTRAL_API_KEY`` environment variable is not set.
    """
    lvlm_prompt = payload['prompt']
    frame_path = payload['frame_path']

    # Fail fast with a clear message instead of an opaque auth error from
    # the API client further down.
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set")

    # Initialize the Mistral client per call; the key may change between
    # requests (e.g. via the Space's secrets UI).
    client = Mistral(api_key=api_key)

    # Frame is sent inline as a base64 data URL.
    base64_image = encode_image(frame_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": lvlm_prompt
                },
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}"
                }
            ]
        }
    ]

    chat_response = client.chat.complete(
        model="pixtral-12b-2409",
        messages=messages
    )

    return chat_response.choices[0].message.content, frame_path
174
+
175
# LangChain Runnable chain: retrieve, build the prompt, then call the LVLM.
prompt_processing_module = RunnableLambda(prompt_processing)
lvlm_inference_module = RunnableLambda(lvlm_inference)

mm_rag_chain = (
    RunnableParallel(
        {
            "retrieved_results": retriever_module,
            "user_query": RunnablePassthrough(),
        }
    )
    | prompt_processing_module
    | lvlm_inference_module
)
184
+
185
# -------------------------------
# 4. Chat API for Gradio
# -------------------------------
# Tracks whether a video has been ingested; chat is a no-op until then.
video_loaded = False

def load_video(youtube_url):
    """Ingest the given YouTube URL and flip the loaded flag for the chat tab."""
    global video_loaded
    status = preprocess_and_store(youtube_url)
    video_loaded = True
    return status
195
+
196
def chat_interface(message, history):
    """Answer a question about the loaded video.

    Returns a 3-tuple for Gradio: cleared textbox value, updated chat
    history, and the retrieved frame image (or None).
    """
    # No video ingested yet — leave the UI untouched.
    if not video_loaded:
        return "", history, None

    answer, frame_path = mm_rag_chain.invoke(message)
    history.append((message, answer))

    # Best-effort load of the retrieved frame for display.
    try:
        frame_image = Image.open(frame_path)
    except Exception as exc:
        print(f"Error loading image: {exc}")
        frame_image = None

    return "", history, frame_image
211
+
212
# -------------------------------
# 5. Enhanced Gradio Interface
# -------------------------------
with gr.Blocks(title="Multimodal RAG Video Chat") as demo:
    gr.Markdown("# 🎬 Multimodal RAG Video Chat\nChat with YouTube clips using BridgeTower + LanceDB + Pixtral!")

    with gr.Tab("1. Load Video"):
        url_box = gr.Textbox(
            lines=1,
            label="YouTube URL",
            placeholder="Paste a YouTube link here...",
        )
        process_btn = gr.Button("Process Video", variant="primary")
        status_box = gr.Textbox(label="Status", interactive=False)
        process_btn.click(load_video, inputs=url_box, outputs=status_box)

    with gr.Tab("2. Chat with Video"):
        with gr.Row():
            with gr.Column(scale=2):
                chat_history = gr.Chatbot(
                    show_label=True,
                    height=500,
                    label="Chat about the video",
                )
            with gr.Column(scale=1):
                frame_display = gr.Image(
                    interactive=False,
                    show_label=True,
                    height=400,
                    label="Retrieved Frame",
                )

        with gr.Row():
            question_box = gr.Textbox(
                scale=4,
                lines=2,
                label="Your question",
                placeholder="Ask something about the video...",
            )
            submit_btn = gr.Button("Send", variant="primary", scale=1)

        # Both Enter in the textbox and the Send button call the same
        # handler; the textbox is cleared after sending.
        question_box.submit(
            chat_interface,
            inputs=[question_box, chat_history],
            outputs=[question_box, chat_history, frame_display],
        )
        submit_btn.click(
            chat_interface,
            inputs=[question_box, chat_history],
            outputs=[question_box, chat_history, frame_display],
        )

    # Usage instructions tab.
    with gr.Tab("📖 Instructions"):
        gr.Markdown("""
        ## How to use this Multimodal RAG system:

        1. **Load Video**:
           - Go to the "Load Video" tab
           - Paste a YouTube URL
           - Click "Process Video" and wait for processing to complete

        2. **Chat with Video**:
           - Go to the "Chat with Video" tab
           - Ask questions about the video content
           - The system will retrieve the most relevant frame and provide answers
           - The retrieved frame will be displayed on the right side

        ## Features:
        - 🎥 Processes YouTube videos automatically
        - 🧠 Uses BridgeTower for multimodal embeddings
        - 💾 Stores data in LanceDB vector database
        - 🤖 Powered by Pixtral vision-language model
        - 🖼️ Shows relevant video frames alongside responses
        """)
281
+
282
if __name__ == "__main__":
    print('App starting...')
    # HF Spaces supplies its own host/port; bind explicitly when run locally.
    running_on_spaces = os.getenv("SPACE_ID")
    if running_on_spaces:
        demo.launch()
    else:
        demo.launch(server_name="0.0.0.0", server_port=7860)
src/crud/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (6.15 kB). View file
 
src/crud/vector_store.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Iterable, List, Optional
2
+ from langchain_core.embeddings import Embeddings
3
+ import uuid
4
+ from langchain_community.vectorstores.lancedb import LanceDB
5
+
6
class MultimodalLanceDB(LanceDB):
    """`LanceDB` vector store to process multimodal data.

    Stores one row per (text, image-path) pair, where the vector is a joint
    embedding produced by ``embedding.embed_image_text_pairs``.

    Parameters:
    -----------
    connection: Any
        LanceDB connection to use. If not provided, a new connection will be created.
    embedding: Embeddings
        Embedding to use for the vectorstore.
    uri: str
        Location of the LanceDB database. Defaults to ``/tmp/lancedb``.
    vector_key: str
        Key to use for the vector in the database. Defaults to ``vector``.
    id_key: str
        Key to use for the id in the database. Defaults to ``id``.
    text_key: str
        Key to use for the text in the database. Defaults to ``text``.
    image_path_key: str
        Key to use for the path to image in the database. Defaults to ``image_path``.
    table_name: str
        Name of the table to use. Defaults to ``vectorstore``.
    api_key: str
        API key to use for LanceDB cloud database.
    region: str
        Region to use for LanceDB cloud database.
    mode: str
        Mode to use for adding data to the table: ``append`` adds to an
        existing table, ``overwrite`` recreates it. Defaults to ``append``.
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "append",
    ):
        # Pass everything by keyword: the original forwarded ten positional
        # arguments, which silently scrambles the configuration if the
        # parent's parameter order ever changes.
        super().__init__(
            connection=connection,
            embedding=embedding,
            uri=uri,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
            api_key=api_key,
            region=region,
            mode=mode,
        )
        self._image_path_key = image_path_key

    def add_text_image_pairs(
        self,
        texts: Iterable[str],
        image_paths: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn text-image pairs into embeddings and add them to the database.

        Parameters:
        ----------
        texts: Iterable[str]
            Iterable of strings to combine with corresponding images to add to the vectorstore.
        image_paths: Iterable[str]
            Iterable of path-to-images as strings to combine with corresponding texts to add to the vectorstore.
        metadatas: List[dict]
            Optional list of metadatas associated with the texts.
        ids: List[str]
            Optional list of ids to associate with the texts.

        Returns:
        --------
        List of ids of the added text-image pairs.
        """
        # Materialize first: the parameters are Iterables, and len() below
        # would fail on a generator.
        texts = list(texts)
        image_paths = list(image_paths)

        # the length of texts must be equal to the length of images
        assert len(texts) == len(image_paths), \
            "the len of transcripts should be equal to the len of images"

        print(f'The length of texts is {len(texts)}')

        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore

        docs = []
        for idx, text in enumerate(texts):
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embeddings[idx],
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._image_path_key: image_paths[idx],
                    "metadata": metadata,
                }
            )
        print(f'Adding {len(docs)} text-image pairs to the vectorstore...')

        # Per-call mode overrides the instance default. The original computed
        # this value and then ignored it, so mode="overwrite" appended anyway.
        mode = kwargs.get('mode', self.mode)
        if self._table_name in self._connection.table_names() and mode != "overwrite":
            # Append to the existing table. (The original branched on
            # self.api_key here but both branches did the same thing.)
            self._connection.open_table(self._table_name).add(docs)
        else:
            # Create the table, replacing any existing one when overwriting.
            self._connection.create_table(
                self._table_name,
                data=docs,
                mode="overwrite" if mode == "overwrite" else "create",
            )
        return ids

    @classmethod
    def from_text_image_pairs(
        cls,
        texts: List[str],
        image_paths: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        **kwargs: Any,
    ):
        """Alternate constructor: build a store and ingest the pairs in one step.

        Extra ``kwargs`` (e.g. ``mode="overwrite"``) are forwarded to
        :meth:`add_text_image_pairs`.
        """
        # Use cls rather than the hard-coded class name so subclasses work.
        instance = cls(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            image_path_key=image_path_key,
            table_name=table_name,
        )
        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
        return instance
src/data/images/car_1.jpg ADDED

Git LFS Details

  • SHA256: 7093f1c09568aece012f93050a14f2c272f4f35f16d88030ccf2ac6a88d19f28
  • Pointer size: 132 Bytes
  • Size of remote file: 1.52 MB
src/data/images/car_2.jpg ADDED

Git LFS Details

  • SHA256: b1baf7d58f14bcbb6c0ac143c93f1b3b8972a6f544a81790e54594142888f6cc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
src/data/images/cat_1.jpg ADDED
src/data/images/cat_2.jpg ADDED
src/data/images/cat_3.jpg ADDED
src/data/images/motorcycle_1.jpg ADDED
src/data/images/motorcycle_2.jpg ADDED

Git LFS Details

  • SHA256: cebf0ac3e43fa6246fc07de3d147aac3678083efa66dcb2304488d7a8754ce2e
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
src/data/images/motorcycle_3.jpg ADDED

Git LFS Details

  • SHA256: 6cedc5acbaa790d4a26fe35c4bde211ec94f6e30d67f77b6324e3b5f30338048
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
src/embedding_creation.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8d2b46e0c0b041904c02be7a0878a8b6b59e0ee98fff649bd8a7b38134c2dc6
3
+ size 47954568
src/mm_rag.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/preprocess/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (2.9 kB). View file
 
src/preprocess/__pycache__/preprocessing.cpython-313.pyc ADDED
Binary file (2.19 kB). View file
 
src/preprocess/embedding.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import encode_image
2
+ from utils import bt_embeddings
3
+ from tqdm import tqdm
4
+ from typing import List
5
+ from langchain_core.embeddings import Embeddings
6
+ from langchain_core.pydantic_v1 import BaseModel
7
+
8
class BridgeTowerEmbeddings(BaseModel, Embeddings):
    """BridgeTower embedding model."""

    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
        """Embed a list of image-text pairs using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.
        images: List[str]
            The list of path-to-images to embed.
        batch_size: int
            The batch size to process, default to 2.

        Returns:
        --------
        List of embeddings, one for each image-text pair.
        """
        # the length of texts must be equal to the length of images
        assert len(texts) == len(images), "the len of captions should be equal to the len of images"

        print(f"Embedding {len(texts)} image-text pairs...")

        pairs = tqdm(zip(images, texts), total=len(images), desc="Processing pairs")
        return [bt_embeddings(caption, encode_image(img_path)) for img_path, caption in pairs]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.

        Returns:
        --------
        List of embeddings, one for each text.
        """
        # Text-only embedding: pair each text with an empty image payload.
        return [bt_embeddings(text, "") for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using BridgeTower.

        Parameters:
        -----------
        text: str
            The text to embed.

        Returns:
        --------
        Embedding for the text.
        """
        return self.embed_documents([text])[0]
src/preprocess/preprocessing.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path as osp
2
+ import json
3
+
4
+ import cv2
5
+ import webvtt
6
+
7
+ from utils import maintain_aspect_ratio_resize, str2time
8
+
9
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    """Extract one video frame per transcript segment and save frames + metadata.

    For every caption in the WebVTT transcript, grabs the frame at the
    midpoint of the caption's time range, resizes it to height 350 (aspect
    ratio preserved), writes it as ``frame_<idx>.jpg`` under
    *path_to_save_extracted_frames*, and records a metadata entry. All
    metadata is also dumped to ``metadatas.json`` under
    *path_to_save_metadatas*.

    Parameters:
    -----------
    path_to_video: str
        Path to the video file readable by OpenCV.
    path_to_transcript: str
        Path to the WebVTT transcript of the video.
    path_to_save_extracted_frames: str
        Existing directory where the extracted frames are written.
    path_to_save_metadatas: str
        Existing directory where ``metadatas.json`` is written.

    Returns:
    --------
    List of metadata dicts, one per successfully extracted frame.
    """
    # metadatas will store the metadata of all extracted frames
    metadatas = []

    video = cv2.VideoCapture(path_to_video)
    try:
        # load transcript using webvtt
        trans = webvtt.read(path_to_transcript)

        # for each video segment specified in the transcript file
        for idx, transcript in enumerate(trans):
            # segment boundaries in milliseconds
            start_time_ms = str2time(transcript.start)
            end_time_ms = str2time(transcript.end)
            # sample the frame exactly in the middle of the segment
            mid_time_ms = (end_time_ms + start_time_ms) / 2
            # get the transcript, remove the next-line symbol
            text = transcript.text.replace("\n", ' ')

            video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
            success, frame = video.read()
            if not success:
                print(f"ERROR! Cannot extract frame: idx = {idx}")
                continue

            # resize while keeping aspect ratio, then save as JPEG
            image = maintain_aspect_ratio_resize(frame, height=350)
            img_fpath = osp.join(path_to_save_extracted_frames, f'frame_{idx}.jpg')
            cv2.imwrite(img_fpath, image)

            metadatas.append({
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'mid_time_ms': mid_time_ms,
            })
    finally:
        # Release the capture handle even if transcript parsing or a frame
        # write fails — the original leaked it.
        video.release()

    # save metadata of all extracted frames
    fn = osp.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas
src/preprocessing_video.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/shared_data/videos/video1/7Hcg-rLYwdM.en.vtt ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WEBVTT
2
+ Kind: captions
3
+ Language: en
4
+
5
+ 00:00:03.620 --> 00:00:06.879
6
+ As I look back on the the mission that we've had here
7
+
8
+ 00:00:06.879 --> 00:00:10.559
9
+ on the International Space Station,
10
+ I'm proud to have been a part of much of
11
+
12
+ 00:00:10.559 --> 00:00:13.679
13
+ the science activities that happened over the last
14
+
15
+ 00:00:13.680 --> 00:00:14.420
16
+ two months.
17
+
18
+ 00:00:14.420 --> 00:00:15.780
19
+ The view is always amazing
20
+
21
+ 00:00:15.780 --> 00:00:17.520
22
+ I didn't think I would do another
23
+
24
+ 00:00:17.520 --> 00:00:20.720
25
+ spacewalk and to now have the chance to have done
26
+
27
+ 00:00:20.720 --> 00:00:23.840
28
+ four more was just icing on the cake for a
29
+
30
+ 00:00:23.840 --> 00:00:24.900
31
+ a wonderful mission.
32
+
33
+ 00:00:24.900 --> 00:00:26.900
34
+ Does the 10th one feel like the first one?
35
+
36
+ 00:00:26.960 --> 00:00:30.160
37
+ No, a little more comfortable on the tenth one.
38
+
39
+ 00:00:30.160 --> 00:00:33.300
40
+ It's hard to put into words
41
+
42
+ 00:00:33.420 --> 00:00:38.480
43
+ just what it was like to be a part of
44
+ this expedition, expedition 63. It'll be
45
+
46
+ 00:00:38.480 --> 00:00:40.399
47
+ kind of a memory that will last a
48
+
49
+ 00:00:40.400 --> 00:00:43.260
50
+ lifetime for me. It's been a true honor.
51
+
52
+ 00:00:43.260 --> 00:00:44.800
53
+ Dragon SpaceX
54
+
55
+ 00:00:44.800 --> 00:00:48.160
56
+ undock sequence commanded. Thrusters
57
+ looking good.
58
+
59
+ 00:00:48.160 --> 00:00:50.440
60
+ The hardest part was getting us launched,
61
+
62
+ 00:00:50.440 --> 00:00:53.080
63
+ but the most important part is bringing us home.
64
+
65
+ 00:00:56.040 --> 00:00:59.180
66
+ Rise and shine Daddy. We love you.
67
+
68
+ 00:00:59.540 --> 00:01:03.160
69
+ Hurry home so we can go get my dog.
70
+
71
+ 00:01:06.040 --> 00:01:07.920
72
+ Splashdown!
73
+
74
+ 00:01:07.920 --> 00:01:11.200
75
+ Welcome back to planet Earth and thanks for flying SpaceX.
76
+
77
+ 00:01:11.200 --> 00:01:12.940
78
+ It's truly our honor and privilege.
79
+
80
+ 00:01:12.940 --> 00:01:14.800
81
+ Space Dads are back on Earth
82
+
83
+ 00:01:14.800 --> 00:01:19.140
84
+ after a 19-hour return journey from space.
85
+
src/shared_data/videos/video1/Welcome back to Planet Earth.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90d6a55ba3a7c2c15ed78977df2721f67c0f907957d50688d8b695cf662c500
3
+ size 4578531
src/shared_data/videos/video1/audio.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d757fa88232111a0f1ed24ae0e23a4143479391ff7a71e7255bdc52283496d6
3
+ size 1434687
src/shared_data/videos/video1/extracted_frame/frame_0.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_1.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_10.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_11.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_12.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_13.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_14.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_15.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_16.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_17.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_18.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_19.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_2.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_20.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_21.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_22.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_23.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_24.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_25.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_3.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_4.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_5.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_6.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_7.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_8.jpg ADDED
src/shared_data/videos/video1/extracted_frame/frame_9.jpg ADDED
src/shared_data/videos/video1/generated_captions.vtt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:08.780
4
+ As I look back on the mission that we've had here on the International Space Station,
5
+
6
+ 00:08.780 --> 00:13.300
7
+ I'm proud to have been a part of much of the science activities that happened over the
8
+
9
+ 00:13.300 --> 00:14.300
10
+ last two months.
11
+
12
+ 00:14.300 --> 00:16.180
13
+ The view is always amazing though.
14
+
15
+ 00:16.180 --> 00:21.260
16
+ I didn't think I would do another spacewalk and to now have the chance to have done four
17
+
18
+ 00:21.260 --> 00:24.980
19
+ more was just icing on the cake for a wonderful mission.
20
+
21
+ 00:25.480 --> 00:26.980
22
+ The tenth one, do you like the first one?
23
+
24
+ 00:26.980 --> 00:27.980
25
+ No.
26
+
27
+ 00:27.980 --> 00:30.280
28
+ A little more comfortable on your tenth one.
29
+
30
+ 00:30.280 --> 00:36.980
31
+ It's hard to put into words just what it was like to be a part of this expedition, the
32
+
33
+ 00:36.980 --> 00:37.980
34
+ Expedition 63.
35
+
36
+ 00:37.980 --> 00:42.280
37
+ It'll be kind of a memory that will last a lifetime for me.
38
+
39
+ 00:42.280 --> 00:43.780
40
+ It's been a true honor.
41
+
42
+ 00:43.780 --> 00:46.780
43
+ Dragon SpaceX, Undock sequence commanded.
44
+
45
+ 00:46.780 --> 00:48.340
46
+ The roster's looking good.
47
+
48
+ 00:48.340 --> 00:52.900
49
+ The hardest part was getting us launched, but the most important part is bringing us home.
50
+
51
+ 00:55.980 --> 00:58.980
52
+ I've been telling Daddy we love you.
53
+
54
+ 00:58.980 --> 01:02.980
55
+ Hurry home so we can go get my dog.
56
+
57
+ 01:05.980 --> 01:07.980
58
+ Flashdown.
59
+
60
+ 01:07.980 --> 01:10.980
61
+ Welcome back to Planet Earth and thanks for flying SpaceX.
62
+
63
+ 01:10.980 --> 01:12.980
64
+ It was truly our honor and privilege.
65
+
66
+ 01:12.980 --> 01:17.980
67
+ Space dads are back on Earth after a 19 hour return journey from space.
68
+
69
+ 01:24.980 --> 01:27.980
70
+ You
71
+