| run: | |
| run_dir: ./runs/instruct_run_24b | |
| seed: 42 | |
| wandb: | |
| enabled: true | |
| project: sft-training | |
| entity: null | |
| name: null | |
| tags: | |
| - sft-lora | |
| - 24b-Devstral | |
| notes: null | |
| model: | |
| repo_id: ./CPT/runs/cpt_run_v1/merged_24b_cpt_lora | |
| revision: null | |
| base_local_dir: base_model | |
| trust_remote_code: true | |
| tokenizer_use_fast: true | |
| device_map: auto | |
| torch_dtype: bfloat16 | |
| use_4bit: false | |
| bnb_4bit_quant_type: nf4 | |
| bnb_4bit_use_double_quant: false | |
| bnb_4bit_compute_dtype: bfloat16 | |
| attn_implementation: null | |
| data: | |
| train_jsonl: ../sft_dataset.jsonl | |
| eval_jsonl: null | |
| eval_split_ratio: 0.1 | |
| instruction_field: instruction | |
| input_field: input | |
| output_field: output | |
| format_type: custom | |
| system_prompt: "You are a Hyperswitch Rust code analyzer. Identify functions/structs\ | |
| \ that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain\ | |
| \ the data flow and why each component must change:\n- Flow: [Input \u2192 Processing\ | |
| \ \u2192 Output with arrows]\n- For each component: \"The [ComponentName] ([path])\ | |
| \ must [action] because [reason]\u2014without this, [consequence]\"\n- Explain\ | |
| \ coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\n\ | |
| add::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n\ | |
| 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for\ | |
| \ nested items: `status::StructName::Type::Name`\n3. Always explain \"must change\ | |
| \ because\" and \"without this\"\n4. Types of components: function, struct, enum,\ | |
| \ impl, trait\n5. If there is extra information (e.g., enum variants), include\ | |
| \ that too.\n6. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\ | |
| \n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook\ | |
| \ system routes events via EventClass enum. Flow: webhook \u2192 EventClass \u2192\ | |
| \ handler \u2192 processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass)\ | |
| \ must add Subscriptions variant because it defines event routing\u2014without\ | |
| \ this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus)\ | |
| \ must map to EventType because it converts status to events\u2014without this,\ | |
| \ status changes don't trigger webhooks. These are coupled: EventClass routes\ | |
| \ to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\n\ | |
| crates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n" | |
| custom_template: '##INSTRUCTION | |
| {instruction}<|im_end|> | |
| ##TASK | |
| {input}<|im_end|> | |
| ##OUTPUT | |
| {output}<|im_end|>' | |
| max_length: 2048 | |
| shuffle: true | |
| num_proc: 4 | |
| peft: | |
| enabled: true | |
| r: 8 | |
| lora_alpha: 16 | |
| lora_dropout: 0.05 | |
| bias: none | |
| target_modules: auto | |
| train: | |
| num_train_epochs: 6 | |
| per_device_train_batch_size: 1 | |
| per_device_eval_batch_size: 1 | |
| gradient_accumulation_steps: 8 | |
| learning_rate: 1e-4 | |
| weight_decay: 0.0 | |
| warmup_ratio: 0.08 | |
| lr_scheduler_type: cosine | |
| optim: adamw_torch | |
| max_grad_norm: 0.8 | |
| gradient_checkpointing: true | |
| logging_steps: 2 | |
| save_strategy: steps | |
| save_steps: 500 | |
| save_total_limit: 20 | |
| evaluation_strategy: steps | |
| eval_steps: 100 | |
| load_best_model_at_end: true | |
| early_stopping: | |
| enabled: true | |
| patience: 3 | |
| min_delta: 0.001 | |
| metric: eval_loss | |
| mode: min | |
| resume_from_checkpoint: auto | |
| merge: | |
| enabled: true | |
| merged_dtype: float16 | |
| max_shard_size: 2GB | |
| output_dir: ./merged_24b_instruct_lora | |