Update modeling_deepseekocr.py
Browse files
- modeling_deepseekocr.py +3 -3
modeling_deepseekocr.py
CHANGED
|
@@ -1067,9 +1067,9 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
|
|
| 1067 |
if isinstance(layer.mlp, DeepseekV2MoE):
|
| 1068 |
moe_layer = layer.mlp
|
| 1069 |
# combine experts
|
| 1070 |
-
moe_layer.w1 = nn.Parameter(torch.stack([moe_layer.experts[i].gate_proj.weight.T for i in range(moe_layer.config.n_routed_experts)])
|
| 1071 |
-
moe_layer.w2 = nn.Parameter(torch.stack([moe_layer.experts[i].down_proj.weight.T for i in range(moe_layer.config.n_routed_experts)])
|
| 1072 |
-
moe_layer.w3 = nn.Parameter(torch.stack([moe_layer.experts[i].up_proj.weight.T for i in range(moe_layer.config.n_routed_experts)])
|
| 1073 |
del moe_layer.experts
|
| 1074 |
gc.collect()
|
| 1075 |
moe_layer.experts = None
|
|
|
|
| 1067 |
if isinstance(layer.mlp, DeepseekV2MoE):
|
| 1068 |
moe_layer = layer.mlp
|
| 1069 |
# combine experts
|
| 1070 |
+
moe_layer.w1 = nn.Parameter(torch.stack([moe_layer.experts[i].gate_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
|
| 1071 |
+
moe_layer.w2 = nn.Parameter(torch.stack([moe_layer.experts[i].down_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
|
| 1072 |
+
moe_layer.w3 = nn.Parameter(torch.stack([moe_layer.experts[i].up_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
|
| 1073 |
del moe_layer.experts
|
| 1074 |
gc.collect()
|
| 1075 |
moe_layer.experts = None
|