diff --git a/conversion/gemma.py b/conversion/gemma.py index 1b427a30c..76beedcf0 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -786,14 +786,15 @@ class Gemma4VisionAudioModel(MmprojModel): super().set_gguf_parameters() # vision params + assert self.hparams_vision is not None self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) # audio params - if self.hparams_audio: - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) def is_audio_tensor(self, name: str) -> bool: return "audio_tower" in name or "embed_audio" in name diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 5fd583d40..bbcae7609 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1552,6 +1552,9 @@ struct clip_model_loader { hparams.audio_n_fft = 512; hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400) hparams.audio_hop_len = 160; + // due to a mistake in the original conversion code, rms_norm_eps is set to a wrong value + // since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion + hparams.eps = 1e-6f; } break; case PROJECTOR_TYPE_GRANITE_SPEECH: {