lvyufeng committed on
Commit
87108be
·
verified ·
1 Parent(s): 785205e

Update modeling_deepseekocr.py

Browse files
Files changed (1) hide show
  1. modeling_deepseekocr.py +3 -3
modeling_deepseekocr.py CHANGED
@@ -1067,9 +1067,9 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
1067
  if isinstance(layer.mlp, DeepseekV2MoE):
1068
  moe_layer = layer.mlp
1069
  # combine experts
1070
- moe_layer.w1 = nn.Parameter(torch.stack([moe_layer.experts[i].gate_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
1071
- moe_layer.w2 = nn.Parameter(torch.stack([moe_layer.experts[i].down_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
1072
- moe_layer.w3 = nn.Parameter(torch.stack([moe_layer.experts[i].up_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
1073
  del moe_layer.experts
1074
  gc.collect()
1075
  moe_layer.experts = None
 
1067
  if isinstance(layer.mlp, DeepseekV2MoE):
1068
  moe_layer = layer.mlp
1069
  # combine experts
1070
+ moe_layer.w1 = nn.Parameter(torch.stack([moe_layer.experts[i].gate_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
1071
+ moe_layer.w2 = nn.Parameter(torch.stack([moe_layer.experts[i].down_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
1072
+ moe_layer.w3 = nn.Parameter(torch.stack([moe_layer.experts[i].up_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]))
1073
  del moe_layer.experts
1074
  gc.collect()
1075
  moe_layer.experts = None