# python -m lingua.stool config=apps/main/configs/llama2_7B.yaml nodes=32 account=fair_amaia_cw_codegen qos=lowest
# dump_dir: !!!CHANGE_THIS!!!
name: "7b_baseline"
steps: 100_000
grad_acc_steps: 1
probe_freq: 100
seed: 777

optim:
  lr: 1.0e-3
  weight_decay: 0.1
  warmup: 2000
  lr_min_ratio: 0.000001
  clip: 1.0

distributed:
  fsdp_type: full_shard
  compile: true
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

model:
  dim: 4096
  n_layers: 32
  n_heads: 32
  rope_theta: 100_000
  ffn_dim_multiplier: 1.0
  multiple_of: 256

data:
  root_dir: data/shuffled
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 1024
  seq_len: 4096
  n_views: 2
  load_async: true
  tokenizer:
    name: tiktoken
    path: tokenizers/cl_toplang_128k.tiktoken

profiling:
  run: true
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 1000
    keep: 3

logging:
  freq: 1

async_eval_gpus: 8
eval:
  dataset_dir: datasets/eval
  harness:
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      # - coqa
      - copa
      - mmlu
      - mmlu_pro
      # - task: nq_open
      #   num_fewshot: 5
      # - triviaqa
      # - gsm8k
      # - bbh
  validation:
    max_steps: 1000
  generator:
    max_tokens: 8192
    dtype: bf16
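
# Usage sketch: the stool command at the top of this file submits a 32-node
# Slurm job. For a quick single-node smoke test, something like the following
# should work, assuming apps.main.train is the training entrypoint that stool
# wraps and that config fields accept CLI overrides (both are assumptions;
# check the repo README). The name/steps overrides here are illustrative only:
#
#   torchrun --nproc-per-node 8 -m apps.main.train \
#       config=apps/main/configs/llama2_7B.yaml \
#       name=7b_smoke steps=100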