Upload 2 files
Browse files- media.py +1 -4
- modeling_vila.py +106 -11
media.py
CHANGED
|
@@ -11,7 +11,7 @@ import requests
|
|
| 11 |
from transformers import PretrainedConfig
|
| 12 |
|
| 13 |
# from llava.constants import MEDIA_TOKENS
|
| 14 |
-
|
| 15 |
# from llava.utils import make_list
|
| 16 |
# from llava.utils.logging import logger
|
| 17 |
|
|
@@ -31,9 +31,6 @@ class Image(File):
|
|
| 31 |
pass
|
| 32 |
|
| 33 |
|
| 34 |
-
class Video(File):
    # Marker subclass distinguishing video media from images; adds no behavior
    # beyond what File provides (File is defined earlier in media.py).
    pass
|
| 36 |
-
|
| 37 |
def make_list(obj: Any) -> List:
|
| 38 |
return obj if isinstance(obj, list) else [obj]
|
| 39 |
|
|
|
|
| 11 |
from transformers import PretrainedConfig
|
| 12 |
|
| 13 |
# from llava.constants import MEDIA_TOKENS
|
| 14 |
+
from llava.media import Image, Video
|
| 15 |
# from llava.utils import make_list
|
| 16 |
# from llava.utils.logging import logger
|
| 17 |
|
|
|
|
| 31 |
pass
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
def make_list(obj: Any) -> List:
    """Wrap *obj* in a single-element list unless it already is a list."""
    if isinstance(obj, list):
        return obj
    return [obj]
|
| 36 |
|
modeling_vila.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import copy
|
| 2 |
import json
|
| 3 |
import logging
|
|
@@ -142,14 +143,97 @@ class VILAPretrainedModel(PreTrainedModel):
|
|
| 142 |
self.llm is not None or self.vision_tower is not None or self.mm_projector is not None
|
| 143 |
), "At least one of the components must be instantiated."
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
@classmethod
|
| 148 |
-
def
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
@classmethod
|
| 155 |
def from_pretrained(
|
|
@@ -202,6 +286,16 @@ class VILAPretrainedModel(PreTrainedModel):
|
|
| 202 |
if getattr(self.config, "mm_projector_cfg", None) is None:
|
| 203 |
self.config.mm_projector_cfg = self.mm_projector.config
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
def get_vision_tower(self):
|
| 206 |
vision_tower = getattr(self, "vision_tower", None)
|
| 207 |
if type(vision_tower) is list:
|
|
@@ -408,7 +502,7 @@ class VILAForCasualLM(VILAPretrainedModel):
|
|
| 408 |
if self.training:
|
| 409 |
# Gather metainfo of media objects from all ranks
|
| 410 |
info = [{"shape": tensor.shape, "dtype": tensor.dtype} for tensor in media.get(name, [])]
|
| 411 |
-
infos = list(chain(
|
| 412 |
|
| 413 |
# The entire batch does not contain any media objects of this type.
|
| 414 |
if not infos:
|
|
@@ -750,7 +844,7 @@ class VILAForCasualLM(VILAPretrainedModel):
|
|
| 750 |
if images is not None:
|
| 751 |
if media is not None:
|
| 752 |
raise ValueError("Both 'media' and 'images' are provided. Please provide only one.")
|
| 753 |
-
|
| 754 |
media = {"image": images}
|
| 755 |
|
| 756 |
if media_config is None:
|
|
@@ -845,7 +939,7 @@ class VILAForCasualLM(VILAPretrainedModel):
|
|
| 845 |
images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
|
| 846 |
media[name] = [image for image in images]
|
| 847 |
elif name == "video":
|
| 848 |
-
if self.config.image_aspect_ratio == "dynamic" and self.config.video_max_tiles > 1:
|
| 849 |
media[name] = [
|
| 850 |
process_images(
|
| 851 |
images,
|
|
@@ -856,7 +950,7 @@ class VILAForCasualLM(VILAPretrainedModel):
|
|
| 856 |
).half()
|
| 857 |
for images in media[name]
|
| 858 |
]
|
| 859 |
-
elif self.config.image_aspect_ratio == "dynamic_s2" and self.config.video_max_tiles > 1:
|
| 860 |
self.config.image_processor = self.vision_tower.image_processor
|
| 861 |
if type(self.config.s2_scales) is str:
|
| 862 |
self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
|
|
@@ -930,3 +1024,4 @@ class VILAForCasualLM(VILAPretrainedModel):
|
|
| 930 |
if generation_config.eos_token_id is None:
|
| 931 |
generation_config.eos_token_id = self.tokenizer.eos_token_id
|
| 932 |
return generation_config
|
|
|
|
|
|
| 1 |
+
import shutil
|
| 2 |
import copy
|
| 3 |
import json
|
| 4 |
import logging
|
|
|
|
| 143 |
self.llm is not None or self.vision_tower is not None or self.mm_projector is not None
|
| 144 |
), "At least one of the components must be instantiated."
|
| 145 |
|
|
|
|
|
|
|
| 146 |
@classmethod
def convert_vila_dev_ckpt_to_remote(cls, model_path: str, output_dir: str = None, *model_args, **kwargs):
    """Convert a dev-format VILA checkpoint into an HF remote-code layout.

    If *model_path* is not a local directory it is treated as a Hub repo id
    and downloaded (into *output_dir* when given). The checkpoint's
    config.json is then rewritten in place with the "vila" model type and
    the auto_map entries needed for trust_remote_code loading, and the .py
    files of this package are copied next to it.

    Args:
        model_path: Local checkpoint directory or HF Hub repo id.
        output_dir: Optional local directory for the Hub download.
    """
    # assert type(self) == VILAForCasualLM, "This method is only available for VILAForCasualLM."
    if not os.path.isdir(model_path):
        # Only touch the Hub when the path is not already a local checkout.
        # (The original unconditionally queried HfApi even for local dirs.)
        from huggingface_hub import HfApi, snapshot_download

        api = HfApi()
        if api.repo_exists(model_path):
            model_path = snapshot_download(model_path, local_dir=output_dir)
            print("downloading HF model to", model_path)

    cfg_path = os.path.join(model_path, "config.json")
    # Use context managers so the config file handles are closed promptly.
    with open(cfg_path) as f:
        config = json.load(f)
    config["version"] = "2.0"  # nvila tag
    config["architectures"] = ["VILAForCasualLM"]
    config["auto_map"] = {
        "AutoConfig": "modeling_vila.VILAConfig",
        "AutoModel": "modeling_vila.VILAForCasualLM",
        "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
    }
    config["model_type"] = "vila"
    with open(cfg_path, "w") as f:
        json.dump(config, f, indent=2)
    cls.copy_remote_py_files(model_path)
|
| 170 |
+
|
| 171 |
+
@classmethod
def copy_remote_py_files(cls, output_dir):
    """Copy every .py file that sits next to this module into *output_dir*.

    This makes the saved checkpoint self-contained for loading with
    trust_remote_code (README handling is not done here).
    """
    source_dir = os.path.dirname(os.path.abspath(__file__))
    for entry in os.listdir(source_dir):
        if not entry.endswith(".py"):
            continue
        src_path = os.path.join(source_dir, entry)
        if os.path.isfile(src_path):
            shutil.copy(src_path, output_dir)
            print("[HF remote code] copying", src_path, "to", output_dir)
|
| 182 |
+
|
| 183 |
+
def save_pretrained(self, output_dir, state_dict=None, safe_serialization=None):
    """Save the model's sub-modules (llm, vision_tower, mm_projector) under
    *output_dir*, each in its own subdirectory, then copy this package's .py
    files there so the checkpoint can be reloaded with remote code.

    NOTE(review): safe_serialization is accepted for API compatibility but is
    never used in this body — confirm whether it should be forwarded to the
    sub-module save_pretrained calls.
    """
    if state_dict is None:
        # otherwise fetch from deepspeed
        # state_dict = accelerator.get_state_dict(is_deepspeed_enabled)
        state_dict = self.state_dict()

    if getattr(self, "tokenizer", None):
        self.tokenizer.save_pretrained(osp.join(output_dir, "llm"))

    if self.get_llm():
        print(f"saving llm to {osp.join(output_dir, 'llm')}")
        self.llm.config._name_or_path = osp.join(output_dir, "llm")
        # NOTE(review): substring filter `"llm" in k` also matches keys that
        # merely contain "llm" elsewhere (same pattern for the filters below);
        # presumably safe for this model's key layout — verify.
        llm_state_dict = OrderedDict({k.split("llm.")[-1]: v for k, v in state_dict.items() if "llm" in k})
        self.llm.save_pretrained(os.path.join(output_dir, "llm"), state_dict=llm_state_dict)
        self.config.llm_cfg = self.llm.config

    if self.get_vision_tower():
        print(f"saving vision_tower to {osp.join(output_dir, 'vision_tower')}")
        self.vision_tower.config._name_or_path = osp.join(output_dir, "vision_tower")
        vision_tower_state_dict = OrderedDict(
            {k.split("vision_tower.vision_tower.")[-1]: v for k, v in state_dict.items() if "vision_tower" in k}
        )
        self.vision_tower.vision_tower.save_pretrained(
            os.path.join(output_dir, "vision_tower"),
            state_dict=vision_tower_state_dict,
        )
        self.vision_tower.image_processor.save_pretrained(os.path.join(output_dir, "vision_tower"))
        self.config.vision_tower_cfg = self.vision_tower.config
        # Drop auto_map from the saved vision tower config (except for radio
        # towers) so reloads do not pull in the tower's own remote code.
        if hasattr(self.config.vision_tower_cfg, "auto_map"):
            if "radio" not in self.get_vision_tower().__class__.__name__.lower():
                delattr(self.config.vision_tower_cfg, "auto_map")

    if self.get_mm_projector():
        print(f"saving mm_projector to {osp.join(output_dir, 'mm_projector')}")
        self.mm_projector.config._name_or_path = osp.join(output_dir, "mm_projector")
        mm_projector_state_dict = OrderedDict(
            {k.split("mm_projector.")[-1]: v for k, v in state_dict.items() if "mm_projector" in k}
        )
        self.mm_projector.save_pretrained(
            os.path.join(output_dir, "mm_projector"),
            state_dict=mm_projector_state_dict,
        )
        self.config.mm_projector_cfg = self.mm_projector.config

    ## update and save top-level config
    self.config._name_or_path = output_dir
    self.config.architectures = [self.__class__.__name__]
    #print(self.config)
    #self.config.save_pretrained(output_dir)

    ## copy .py and README for next loading remote code
    self.copy_remote_py_files(output_dir)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
|
| 238 |
@classmethod
|
| 239 |
def from_pretrained(
|
|
|
|
| 286 |
if getattr(self.config, "mm_projector_cfg", None) is None:
|
| 287 |
self.config.mm_projector_cfg = self.mm_projector.config
|
| 288 |
|
| 289 |
+
def get_llm(self):
    """Return the wrapped language model, unwrapping a singleton list."""
    candidate = getattr(self, "llm", None)
    if type(candidate) is list:
        return candidate[0]
    return candidate
|
| 294 |
+
|
| 295 |
+
def get_lm_head(self):
    """Return the lm_head of the underlying LLM, or None when absent."""
    return getattr(self.get_llm(), "lm_head", None)
|
| 298 |
+
|
| 299 |
def get_vision_tower(self):
|
| 300 |
vision_tower = getattr(self, "vision_tower", None)
|
| 301 |
if type(vision_tower) is list:
|
|
|
|
| 502 |
if self.training:
|
| 503 |
# Gather metainfo of media objects from all ranks
|
| 504 |
info = [{"shape": tensor.shape, "dtype": tensor.dtype} for tensor in media.get(name, [])]
|
| 505 |
+
infos = list(chain(all_gather(info)))
|
| 506 |
|
| 507 |
# The entire batch does not contain any media objects of this type.
|
| 508 |
if not infos:
|
|
|
|
| 844 |
if images is not None:
|
| 845 |
if media is not None:
|
| 846 |
raise ValueError("Both 'media' and 'images' are provided. Please provide only one.")
|
| 847 |
+
print("The 'images' argument is deprecated. Please use 'media' instead.")
|
| 848 |
media = {"image": images}
|
| 849 |
|
| 850 |
if media_config is None:
|
|
|
|
| 939 |
images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
|
| 940 |
media[name] = [image for image in images]
|
| 941 |
elif name == "video":
|
| 942 |
+
if False: #self.config.image_aspect_ratio == "dynamic" and self.config.video_max_tiles > 1:
|
| 943 |
media[name] = [
|
| 944 |
process_images(
|
| 945 |
images,
|
|
|
|
| 950 |
).half()
|
| 951 |
for images in media[name]
|
| 952 |
]
|
| 953 |
+
elif False: #self.config.image_aspect_ratio == "dynamic_s2" and self.config.video_max_tiles > 1:
|
| 954 |
self.config.image_processor = self.vision_tower.image_processor
|
| 955 |
if type(self.config.s2_scales) is str:
|
| 956 |
self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
|
|
|
|
| 1024 |
if generation_config.eos_token_id is None:
|
| 1025 |
generation_config.eos_token_id = self.tokenizer.eos_token_id
|
| 1026 |
return generation_config
|
| 1027 |
+
|