# Template config, need to change dump_dir, data.root_dir and tokenizer.path
# Evals can be activated by uncommenting its config
# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest

dump_dir: /tmp/
name: "debug"
steps: 100_000
probe_freq: null
seed: 777
optim:
  lr: 4e-04
  warmup: 500
  lr_min_ratio: 0.1
  clip: 10.0

distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

train_entropy_model: true
model: null
entropy_model:
  dim: 768
  n_layers: 14
  n_heads: 12
  max_seqlen: 8192
  # vocab_size: -1
  vocab_size: 260
  ffn_dim_multiplier: 1.0
  sliding_window: 512
  attn_bias_type: "local_block_causal"
  attn_impl: "xformers"

data:
  s3_profile: blt
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 64
  # seqlen is in terms of patches and
  # max_encoder_seq_length is in terms of bytes.
  # For entropy model, these are the same since 1 patch=1 byte
  seq_len: 8192
  max_encoder_seq_length: 8192
  load_async: true
  preprocess_dir: ???
  # We don't need patches for this model
  add_patches: false
  patcher_args:
    # This doesn't matter since byte entropy model doesn't use patching,
    # so pick the most efficient, so static
    patching_mode: byte
  tokenizer_args:
    name: bytes

profiling:
  run: false

checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1

logging:
  freq: 10

eval_on_gpus: 8
eval:
  dataset_dir: ???
  tasks: ???
  generator:
    max_tokens: 65536
    dtype: bf16

  # NOTE(review): placed under `eval` based on its position after `generator`
  # in the source — confirm against the consumer's eval-args schema.
  mp_size: 1