# dump_dir: !!!CHANGE_THIS!!!
name: large_lm
steps: 60_000
probe_freq: null
seed: 777

optim:
    lr: 3e-3
    weight_decay: 0.033
    warmup: 5000
    lr_min_ratio: 0.000001
    clip: 1.0

distributed:
    fsdp_type: full_shard
    compile: true
    model_dtype: bf16
    matmul_allow_tf32: false
    selective_activation_checkpointing: false
    tp_size: 1

model:
    dim: 2048
    n_layers: 25
    n_heads: 16

data:
    root_dir: data/shuffled
    sources:
        dclm_baseline_1.0: 100.0
    batch_size: 4
    prefetch_size: 1024
    seq_len: 4096
    n_views: 2
    load_async: true
    add_bos: true
    add_eos: true
    tokenizer:
        name: tiktoken
        path: tokenizers/cl_toplang_128k.tiktoken

profiling:
    run: true
    mem_warmup: 0
    mem_steps: 4
    profile_warmup: 100
    profile_steps: 4

checkpoint:
    dump:
        every: 2500
        keep: 3
    eval:
        every: 5000
        keep: -1

logging:
    freq: 1

async_eval_gpus: 8
eval:
    harness:
        tasks:
            - hellaswag
            - task: boolq
              dataset_kwargs:
                  trust_remote_code: true
            - piqa
            - task: social_iqa
              dataset_kwargs:
                  trust_remote_code: true
            - winogrande
            - openbookqa
            - arc_easy
            - arc_challenge
            - race
            - commonsense_qa
            - copa
            # - coqa
            # - task: nq_open
            #   num_fewshot: 5
            # - triviaqa
    validation:
        max_steps: 1000
    generator:
        max_tokens: 16384
        dtype: bf16
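
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the config): this layout matches a
# Meta Lingua-style TrainArgs schema, where a run is typically launched by
# passing the config path plus any overrides on the command line, e.g.
#
#   torchrun --nproc-per-node 8 -m apps.main.train \
#       config=path/to/this_config.yaml dump_dir=/path/to/dump
#
# `apps.main.train`, `path/to/this_config.yaml`, and `/path/to/dump` are
# placeholders for your actual entry point and paths; dump_dir must be set
# before training (see the !!!CHANGE_THIS!!! comment at the top of the file).
# ---------------------------------------------------------------------------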