# mirror of https://github.com/facebookresearch/blt.git (synced 2025-01-19 00:47:44 +00:00)
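# Training configuration for a 7B Llama-2-style baseline (lingua/BLT training stack).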
#python -m lingua.stool config=apps/main/configs/llama2_7B.yaml nodes=32 account=fair_amaia_cw_codegen qos=lowest
# dump_dir: !!!CHANGE_THIS!!!
name: "7b_baseline"
steps: 100_000
grad_acc_steps: 1
probe_freq: 100

seed: 777
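# Optimizer: peak LR, weight decay, warmup steps, the floor of the LR schedule
# as a fraction of the peak (lr_min_ratio), and gradient-norm clipping.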
optim:
    lr: 1.0e-3
    weight_decay: 0.1
    warmup: 2000
    lr_min_ratio: 0.000001
    clip: 1.0

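# Distributed setup: fully sharded FSDP, compiled model (presumably torch.compile),
# bf16 weights, TF32 matmuls off, no selective activation checkpointing, and no
# tensor parallelism (tp_size: 1).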
distributed:
    fsdp_type: full_shard
    compile: true
    model_dtype: bf16
    matmul_allow_tf32: false
    selective_activation_checkpointing: false
    tp_size: 1

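# Architecture: Llama-2-7B-like transformer (4096-dim, 32 layers, 32 heads);
# ffn_dim_multiplier and multiple_of shape the FFN hidden size.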
model:
    dim: 4096
    n_layers: 32
    n_heads: 32
    rope_theta: 100_000
    ffn_dim_multiplier: 1.0
    multiple_of: 256

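# Data: a single DCLM-baseline source with sampling weight 1.0, 4096-token
# sequences, and a tiktoken tokenizer; batch_size is presumably per GPU.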
data:
    root_dir: data/shuffled
    sources:
        dclm_baseline_1.0: 1.0
    batch_size: 2
    prefetch_size: 1024
    seq_len: 4096
    n_views: 2
    load_async: true
    tokenizer:
        name: tiktoken
        path: tokenizers/cl_toplang_128k.tiktoken

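# Profiling: memory snapshots and profiler traces, each captured for a few steps
# after their respective warmups.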
profiling:
    run: true
    mem_warmup: 0
    mem_steps: 4
    profile_warmup: 100
    profile_steps: 4

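# Checkpointing: full dumps every 10k steps (keep: -1 presumably keeps them all)
# and evaluation checkpoints every 1k steps, keeping the last 3.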
checkpoint:
    dump:
        every: 10000
        keep: -1
    eval:
        every: 1000
        keep: 3

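# Logging: metrics emitted every step.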
logging:
    freq: 1

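# Evaluation: 8 GPUs reserved for asynchronous evals; downstream tasks run through
# an lm-eval-harness-style interface, plus held-out validation and a generation pass.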
async_eval_gpus: 8
eval:
    dataset_dir: datasets/eval
    harness:
        tasks:
            - hellaswag
            - task: boolq
              dataset_kwargs:
                  trust_remote_code: true
            - piqa
            - task: social_iqa
              dataset_kwargs:
                  trust_remote_code: true
            - winogrande
            - openbookqa
            - arc_easy
            - arc_challenge
            - race
            - commonsense_qa
            # - coqa
            - copa
            - mmlu
            - mmlu_pro
            # - task: nq_open
            #   num_fewshot: 5
            # - triviaqa
            # - gsm8k
            # - bbh
    validation:
        max_steps: 1000
    generator:
        max_tokens: 8192
        dtype: bf16
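
# Launch on SLURM via the stool command at the top of this file (after setting
# dump_dir); individual fields can likely be overridden from the command line as
# OmegaConf dot-list arguments (e.g. optim.lr=3.0e-4), assuming lingua's
# OmegaConf-based CLI.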