# dump_dir: !!!CHANGE_THIS!!!
name: large_lm
steps: 60_000
probe_freq: null
seed: 777

optim:
    lr: 3e-3
    weight_decay: 0.033
    warmup: 5000
    lr_min_ratio: 0.000001
    clip: 1.0

distributed:
    fsdp_type: full_shard
    compile: true
    model_dtype: bf16
    matmul_allow_tf32: false
    selective_activation_checkpointing: false
    tp_size: 1

model:
    dim: 2048
    n_layers: 25
    n_heads: 16

data:
    root_dir: data/shuffled
    sources:
        dclm_baseline_1.0: 100.0
    batch_size: 4
    prefetch_size: 1024
    seq_len: 4096
    n_views: 2
    load_async: true
    add_bos: true
    add_eos: true
    tokenizer:
        name: tiktoken
        path: tokenizers/cl_toplang_128k.tiktoken

profiling:
    run: true
    mem_warmup: 0
    mem_steps: 4
    profile_warmup: 100
    profile_steps: 4

checkpoint:
    dump:
        every: 2500
        keep: 3
    eval:
        every: 5000
        keep: -1

logging:
    freq: 1

async_eval_gpus: 8
eval:
    harness:
        tasks:
            - hellaswag
            - task: boolq
              dataset_kwargs:
                  trust_remote_code: true
            - piqa
            - task: social_iqa
              dataset_kwargs:
                  trust_remote_code: true
            - winogrande
            - openbookqa
            - arc_easy
            - arc_challenge
            - race
            - commonsense_qa
            - copa
            # - coqa
            # - task: nq_open
            #   num_fewshot: 5
            # - triviaqa
    validation:
        max_steps: 1000
    generator:
        max_tokens: 16384
        dtype: bf16
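
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the config): this layout matches a
# Meta Lingua-style TrainArgs schema, where a run is typically launched by
# passing the config path plus any overrides on the command line, e.g.
#
#   torchrun --nproc-per-node 8 -m apps.main.train \
#       config=path/to/this_config.yaml dump_dir=/path/to/dump
#
# `apps.main.train`, `path/to/this_config.yaml`, and `/path/to/dump` are
# placeholders for your actual entry point and paths; dump_dir must be set
# before training (see the !!!CHANGE_THIS!!! comment at the top of the file).
# ---------------------------------------------------------------------------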