blt/bytelatent/configs/entropy_model.yaml
# Template config; change dump_dir, data.root_dir, and tokenizer.path for your setup.
# Evals can be activated by uncommenting the eval config.
# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest
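# For a single-node run, a minimal sketch (assuming the repo's bytelatent.train
# entrypoint; adjust paths to your checkout):
# torchrun --nproc-per-node=8 -m bytelatent.train config=bytelatent/configs/entropy_model.yaml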
dump_dir: /tmp/
name: "debug"
steps: 100_000
probe_freq: null
seed: 777
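# LR warms up over 500 steps to a peak of 4e-4; lr_min_ratio is, we assume,
# the decay floor as a fraction of peak LR (i.e. a final LR of 4e-5 here).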
optim:
  lr: 4e-04
  warmup: 500
  lr_min_ratio: 0.1
  clip: 10.0
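# full_shard shards parameters, gradients, and optimizer state across ranks
# (ZeRO-3-style FSDP); tp_size: 1 disables tensor parallelism.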
distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1
train_entropy_model: true
model: null
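# A small byte-level transformer (roughly 100M parameters at these settings)
# trained on next-byte prediction; its per-byte entropies drive BLT's
# entropy-based patching.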
entropy_model:
  dim: 768
  n_layers: 14
  n_heads: 12
  max_seqlen: 8192
  # vocab_size: -1
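  # 260 = 256 byte values + 4 special-token ids (our reading of the "bytes"
  # tokenizer's offset; verify against the tokenizer code).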
  vocab_size: 260
  ffn_dim_multiplier: 1.0
  sliding_window: 512
  attn_bias_type: "local_block_causal"
  attn_impl: "xformers"
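# With batch_size 2 and seq_len 8192, each rank consumes 16,384 bytes per step.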
data:
  s3_profile: blt
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 64
  # seq_len is in terms of patches and max_encoder_seq_length is in terms
  # of bytes. For the entropy model they are equal, since 1 patch = 1 byte.
  seq_len: 8192
  max_encoder_seq_length: 8192
  load_async: true
  preprocess_dir: ???
  # We don't need patches for this model
  add_patches: false
  patcher_args:
    # The patching mode doesn't matter here since the byte entropy model
    # doesn't use patching, so pick the most efficient one.
    patching_mode: byte
  tokenizer_args:
    name: bytes
profiling:
  run: false
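# Keep the 3 most recent training dumps; eval snapshots every 1000 steps are
# all retained (keep: -1 presumably means keep everything).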
checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1
logging:
  freq: 10
eval_on_gpus: 8
eval: null
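# To enable evals, replace `eval: null` with an eval config block. A
# hypothetical sketch (field names unverified; check the eval args in the
# bytelatent source for the real schema):
# eval:
#   generator:
#     max_tokens: 8192
#   harness:
#     tasks: ["hellaswag"]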