# Template config: set dump_dir, data.root_dir and
# data.tokenizer_args.init_kwargs.bpe_tokenizer_path before running.
# Evals can be deactivated by commenting out the eval section below.
# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest

dump_dir: /tmp/
name: "debug"
steps: 100_000
probe_freq: null
seed: 777

optim:
  lr: 4e-04
  warmup: 500
  lr_min_ratio: 0.1
  clip: 10.0

distributed:
  fsdp_type: full_shard
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

model:
  n_heads: 8
  dim: 512
  vocab_size: 260
  dim_token: 256
  patch_size: 6
  tokenization_mode: "bytes"
  patching_mode: "space"
  tie_local_encoder_decoder_logits: false
  data_loader_patching: true
  max_encoder_seq_length: 12288
  pad_to_max_length: true
  patching_threshold: 3.1439168453216553
  encoder_hash_byte_group_size: [4]
  encoder_hash_byte_group_vocab: 50002
  encoder_hash_byte_group_nb_functions: 3
  encoder_enable_byte_ngrams: false
  cross_attn_encoder: true # assuming cross_attention is true
  cross_attn_decoder: true # assuming cross_attention is true
  cross_attn_window_encoder: 512
  cross_attn_window_decoder: 512
  dim_local_encoder: 256
  dim_local_decoder: 256
  cross_attn_k: 8
  cross_attn_nheads: 4
  cross_attn_all_layers_decoder: true
  cross_attn_all_layers_encoder: true
  cross_attn_use_flex_attention: true
  cross_attn_init_by_pooling: true
  log_patch_lengths: true
  non_linearity: "swiglu"
  use_rope: true
  recompute_fc1_out: false
  recompute_fc3_out: false
  recompute_attn: false
  custom_bwd: false
  layer_ckpt: "none"
  patch_only_encoder: false
  patch_only_decoder: false
  use_local_encoder_transformer: true
  init_use_gaussian: true
  init_use_depth: "current"
  attn_bias_type: "block_causal"
  attn_impl: "xformers"
  alpha_depth: "disabled"
  max_length: 256
  local_attention_window_len: 512
  max_seqlen: 12288
  downsampling_by_pooling: "max"

data:
  root_dir: ???
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 64
  seq_len: 4096
  load_async: true
  preprocess_dir: ???
  tokenizer_args:
    name: blt
    init_kwargs:
      bpe_tokenizer_path: ???

profiling:
  run: false

checkpoint:
  dump:
    every: 500
    keep: 3
  eval:
    every: 1000
    keep: -1

logging:
  freq: 10

eval_on_gpus: 8
eval:
  dataset_dir: /checkpoint/amaia/codegen/datasets/eval
  tasks: boolq,hellaswag,nq,piqa,siqa,tqa,winogrande,obqa,arc_easy,arc_challenge,race.middle,race.high,gsm8k,math,bbh,copa,human_eval_plus,mbpp,mmlu
  generator:
    max_tokens: 65536
    dtype: bf16
  mp_size: 1
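
# A minimal single-node run sketch, for debugging without the stool launcher.
# Assumptions (not confirmed by this file): the training entry point is
# apps.main.train and it accepts OmegaConf-style dotlist overrides, matching
# the cluster launch line in the header; check the repo README before use.
# python -m apps.main.train config=apps/main/configs/debug.yaml \
#     dump_dir=/tmp/debug data.root_dir=/path/to/data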