# Template config: change dump_dir, data.root_dir, and tokenizer.path for your setup.
# Evals can be activated by uncommenting their config.
# Example SLURM launch via the stool launcher:
# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest

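# For a quick local run without SLURM, something along these lines should work
# (the training module path is an assumption; adjust to this repo's entry point):
# torchrun --nproc_per_node=8 -m apps.main.train config=apps/main/configs/debug.yaml
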
dump_dir: /tmp/
name: "debug"
steps: 100_000
probe_freq: null
seed: 777
optim:
  lr: 4e-04
  warmup: 500
  lr_min_ratio: 0.1
  clip: 10.0

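# Note on optim: with lr: 4e-04 and lr_min_ratio: 0.1, the learning rate
# presumably decays from 4e-04 down to 4e-05 (lr * lr_min_ratio) after the
# 500-step warmup; the exact decay shape is an assumption.
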
distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

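# fsdp_type: full_shard presumably maps to PyTorch FSDP's FULL_SHARD strategy
# (parameters, gradients, and optimizer state sharded across ranks), and
# tp_size: 1 means tensor parallelism is disabled.
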
train_entropy_model: true
model: null
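# Only the entropy model below is trained here (model: null skips the main
# model config); this byte-level model presumably supplies the entropies
# later used for patching.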
entropy_model:
  dim: 768
  n_layers: 14
  n_heads: 12
  max_seqlen: 8192
  # vocab_size: -1
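  # 260 presumably covers the 256 possible byte values plus a handful of
  # special tokens; the exact offset is an assumption, check the bytes
  # tokenizer.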
  vocab_size: 260
  ffn_dim_multiplier: 1.0
  sliding_window: 512
  attn_bias_type: "local_block_causal"
  attn_impl: "xformers"

data:
  s3_profile: blt
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
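  # Values under sources are presumably relative sampling weights; more
  # datasets could be mixed in with their own weights, e.g. (illustrative
  # name only):
  # some_other_source: 0.5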
  batch_size: 2
  prefetch_size: 64
  # seq_len is in terms of patches and max_encoder_seq_length is in terms of
  # bytes. For the entropy model these are the same, since 1 patch = 1 byte.
  seq_len: 8192
  max_encoder_seq_length: 8192
  load_async: true
  preprocess_dir: ???
  # We don't need patches for this model.
  add_patches: false
  patcher_args:
    # The patching mode doesn't matter here, since the byte entropy model
    # doesn't use patching; pick the most efficient mode.
    patching_mode: byte
  tokenizer_args:
    name: bytes

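# Rough arithmetic for the data block above: batch_size: 2 * seq_len: 8192 =
# 16,384 bytes per step per data-parallel rank (assuming batch_size counts
# sequences per rank).
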
profiling:
  run: false

checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1

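# keep: 3 retains only the 3 most recent training dumps, while keep: -1 for
# eval checkpoints presumably means keep all of them (negative-keep semantics
# are an assumption).
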
logging:
  freq: 10

eval_on_gpus: 8
eval: null
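
# A hypothetical sketch of an activated eval block, replacing eval: null above
# (field names are illustrative assumptions; check the eval config schema):
# eval:
#   generator:
#     max_tokens: 8192
#     dtype: bf16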