---
language:
- en
- de
- fr
- it
- pt
- hi
- es
- th
library_name: transformers
pipeline_tag: image-text-to-text
tags:
- meta
- pytorch
- llama
- llama-3
- vision
base_model:
- meta-llama/Llama-3.2-11B-Vision-Instruct
- rombodawg/Llama-3-8B-Instruct-Coder
---

# Lumimimaid v0.2 8B + Llama3.2Vision Adapter

This model was created using the script below, which merges a text model's decoder weights into a Llama vision model. It is compatible with, respectively:

* Llama 3.1 8B & 70B (text model)
* Llama 3.2 Vision 11B & 90B (vision model)

That is, the 8B text model pairs with the 11B vision model, and the 70B with the 90B.

## Merge Script

```python
import os

import torch
from transformers import MllamaForConditionalGeneration, MllamaProcessor, AutoModelForCausalLM

# NOTE: You need sufficient DRAM to load both models at once
# (otherwise you would need to process the weights layer by layer, which is not shown here).

multimodal_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Original Llama vision model (11B or 90B)
text_model_path = "rombodawg/Llama-3-8B-Instruct-Coder"  # Model to be merged in (8B or 70B)
save_path = "models/merged_model"

multimodal_model = MllamaForConditionalGeneration.from_pretrained(multimodal_model_path, device_map="cpu", torch_dtype=torch.bfloat16)
multimodal_processor = MllamaProcessor.from_pretrained(multimodal_model_path)
text_model = AutoModelForCausalLM.from_pretrained(text_model_path, device_map="cpu", torch_dtype=torch.bfloat16)

state_dict_multimodal = multimodal_model.state_dict()
state_dict_text = text_model.state_dict()

num_decoder_layers_text = text_model.config.num_hidden_layers
num_decoder_layers_vision = multimodal_model.config.text_config.num_hidden_layers

# Find the set of cross-attention layers inserted into the multimodal Llama
inserted_layers = set()
for key_multimodal in state_dict_multimodal.keys():
    if "language_model" in key_multimodal and "cross_attn" in key_multimodal and ".layers." in key_multimodal:
        layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0])
        inserted_layers.add(layer_num_multimodal)
# Hard-coded lists of the inserted layers:
# inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98}  # For 90B
inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38}  # For 11B

assert len(inserted_layers) == num_decoder_layers_vision - num_decoder_layers_text, "# of added layers does not match"

# Build a decoder layer map from multimodal layer number to text layer number, skipping layers listed in inserted_layers
layer_map = dict()
layer_num_multimodal = 0
for layer_num_text in range(num_decoder_layers_text):
    while layer_num_multimodal in inserted_layers:
        layer_num_multimodal += 1  # Skip the inserted cross-attention layers
    layer_map[layer_num_multimodal] = layer_num_text
    layer_num_multimodal += 1

for key_multimodal in state_dict_multimodal.keys():
    if "language_model" not in key_multimodal: continue  # Vision-tower param, keep as-is
    if "cross_attn" in key_multimodal: continue  # Multimodal-only param, keep as-is
    key_text = key_multimodal.replace("language_model.", "")
    if "embed_tokens.weight" in key_multimodal:  # Handle embed tokens separately (the vision model appends extra tokens)
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        extra_tokens = state_dict_multimodal[key_multimodal].shape[0] - state_dict_text[key_text].shape[0]
        state_dict_multimodal[key_multimodal][:state_dict_text[key_text].shape[0], :].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text} (preserving last {extra_tokens} tokens)")
        continue
    if "lm_head" in key_multimodal or "model.norm.weight" in key_multimodal:  # Handle other non-decoder weights separately
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text}")
        continue
    layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0]) if ".layers." in key_multimodal else None
    assert layer_num_multimodal is not None, f"Unknown non-decoder key encountered: {key_multimodal}"
    if layer_num_multimodal in inserted_layers: continue  # Skip the inserted cross-attention layers
    assert layer_num_multimodal in layer_map, f"Layer not found in layer_map: {layer_num_multimodal}"
    layer_num_text = layer_map[layer_num_multimodal]
    key_text = key_text.replace(f".layers.{layer_num_multimodal}.", f".layers.{layer_num_text}.")
    assert key_text in state_dict_text, f"Key not found: {key_text}"
    state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
    print(f"Replaced {key_multimodal} with {key_text}")

print("Merged model successfully. Saving...")
# Apply the changes
multimodal_model.load_state_dict(state_dict_multimodal)

# Create save_path if it does not exist
os.makedirs(save_path, exist_ok=True)
multimodal_model.save_pretrained(save_path, safe_serialization=True, max_shard_size="8192MB")
multimodal_processor.save_pretrained(save_path)
print(f"Model saved to {save_path}")
```
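
After saving, the merged checkpoint loads like any other Mllama model. A quick sanity check, assuming the script above has been run and `models/merged_model` exists (this snippet is illustrative and not part of the original script):

```python
import torch
from transformers import MllamaForConditionalGeneration

# Reload the merged checkpoint to confirm it deserializes cleanly
merged = MllamaForConditionalGeneration.from_pretrained(
    "models/merged_model", device_map="cpu", torch_dtype=torch.bfloat16
)
print(merged.config.text_config.num_hidden_layers)  # 40 decoder layers for the 11B variant
```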

## Model Inference

```python
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "models/merged_model"  # Path where the merge script above saved the merged model

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
```
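
The loaded model can then be prompted like the stock Llama 3.2 Vision checkpoint. A minimal generation sketch following the standard Mllama usage in `transformers` (the image URL and prompt below are placeholders):

```python
url = "https://example.com/sample.jpg"  # Placeholder image URL
image = Image.open(requests.get(url, stream=True).raw)

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image in one sentence."},
    ]},
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output[0]))
```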

## License

This project is licensed under the MIT License.