Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2026-04-29 20:29:48 +00:00)
add ci (#1642)
Some checks failed
Book-CI / test (push) Has been cancelled
Book-CI / test-1 (push) Has been cancelled
Book-CI / test-2 (push) Has been cancelled
Deploy / deploy (macos-latest) (push) Has been cancelled
Deploy / deploy (ubuntu-latest) (push) Has been cancelled
Deploy / deploy (windows-latest) (push) Has been cancelled
parent 2cffdf7033
commit 51745a9ea1
14 changed files with 845 additions and 48 deletions
@@ -34,42 +34,63 @@ from datasets import load_dataset
 def parse_args():
     parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization")
 
     # Required arguments
-    parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory")
-    parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model")
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        required=True,
+        help="Path to the input model directory"
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help="Path to save the quantized model"
+    )
 
     # Optional arguments
     parser.add_argument(
         "--quant_type",
         type=str,
         choices=["W4A16", "W8A16"],
         default="W8A16",
-        help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16",
+        help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16"
     )
     parser.add_argument(
-        "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512"
+        "--num_calibration_samples",
+        type=int,
+        default=512,
+        help="Number of calibration samples. Default: 512"
     )
     parser.add_argument(
-        "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. Default: 2048"
+        "--max_sequence_length",
+        type=int,
+        default=2048,
+        help="Maximum sequence length for calibration. Default: 2048"
     )
     parser.add_argument(
         "--dampening_frac",
         type=float,
         default=0.1,
-        help="Dampening fraction to mitigate quantization noise. Default: 0.1",
+        help="Dampening fraction to mitigate quantization noise. Default: 0.1"
     )
     parser.add_argument(
         "--dataset",
         type=str,
         default="HuggingFaceH4/ultrachat_200k",
-        help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k",
+        help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k"
     )
     parser.add_argument(
-        "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft"
+        "--dataset_split",
+        type=str,
+        default="train_sft",
+        help="Dataset split to use. Default: train_sft"
     )
     parser.add_argument(
-        "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
+        "--force_cpu",
+        action="store_true",
+        help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
     )
     parser.add_argument(
         "--ignore_patterns",
@@ -82,22 +103,44 @@ def parse_args():
             r"re:.*\.shared_expert\..*$",
             r"re:.*\.shared_experts\..*$",
             r"re:.*\.mlp\.shared_expert_gate$",
-            r"re:.*\.linear_attn\..*$",
+            r"re:.*\.linear_attn\..*$"
         ],
-        help="Regex patterns for layers to ignore during quantization",
+        help="Regex patterns for layers to ignore during quantization"
     )
     parser.add_argument(
         "--torch_dtype",
         type=str,
         choices=["bfloat16", "float16", "float32"],
         default="bfloat16",
-        help="PyTorch dtype for model loading. Default: bfloat16",
+        help="PyTorch dtype for model loading. Default: bfloat16"
     )
     parser.add_argument(
-        "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)"
+        "--trust_remote_code",
+        action="store_true",
+        help="Allow loading of remote code (required for some models)"
     )
-    parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42")
+    parser.add_argument(
+        "--random_seed",
+        type=int,
+        default=42,
+        help="Random seed for dataset shuffling. Default: 42"
+    )
+    parser.add_argument(
+        "--max_gpu_memory",
+        type=str,
+        default=None,
+        help="Maximum GPU memory for model weights per device (e.g., '40GiB'). "
+        "GPTQ quantization requires additional GPU memory for Hessian matrix computation, "
+        "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. "
+        "Remaining layers will be offloaded to CPU. Default: use all available"
+    )
+    parser.add_argument(
+        "--max_cpu_memory",
+        type=str,
+        default=None,
+        help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available"
+    )
 
     return parser.parse_args()
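For context (not part of the diff): argparse runs help strings through %-style formatting, which is why the new --max_gpu_memory help text escapes the percent sign as "40-50%%". A minimal, self-contained sketch of that behavior (the help text below is abridged, not copied from the diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max_gpu_memory",
    type=str,
    default=None,
    help="Reserve 40-50%% of total VRAM for Hessian computation. Default: use all available"
)
# format_help() expands the escaped %% into a literal % in the rendered help text
print(parser.format_help())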
@@ -124,7 +167,11 @@ def get_torch_dtype(dtype_str):
     Returns:
         torch.dtype: Corresponding PyTorch dtype
     """
-    dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}
+    dtype_map = {
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+        "float32": torch.float32
+    }
     return dtype_map[dtype_str]
@@ -144,18 +191,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
         Updated ignore_patterns list with dense layer patterns added
     """
     print("🔍 Checking model configuration for dense layers...")
 
     try:
         # Load model configuration
         config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
 
         # Check if the model has first_k_dense_replace parameter
-        first_k_dense_replace = getattr(config, "first_k_dense_replace", None)
+        first_k_dense_replace = getattr(config, 'first_k_dense_replace', None)
 
         if first_k_dense_replace is not None and first_k_dense_replace > 0:
             print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}")
             print(f" Adding first {first_k_dense_replace} layers to ignore list...")
 
             # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1)
             if first_k_dense_replace == 1:
                 dense_pattern = r"re:model\.layers\.0\.mlp\..*$"
@@ -163,18 +210,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
                 # For multiple layers, use range pattern
                 layer_range = f"[0-{first_k_dense_replace-1}]"
                 dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$"
 
             # Add the dense layer pattern to ignore list
             updated_ignore_patterns = ignore_patterns + [dense_pattern]
 
             print(f" Dense layer pattern added: {dense_pattern}")
             print(f" This will ignore MLP components in layers 0-{first_k_dense_replace-1}")
 
             return updated_ignore_patterns
         else:
             print("ℹ️ No dense layers detected (first_k_dense_replace not found or is 0)")
             return ignore_patterns
 
     except Exception as e:
         print(f"⚠️ Warning: Could not check model config for dense layers: {e}")
         print(" Proceeding with original ignore patterns...")
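To make the pattern construction above concrete (a sketch, not part of the diff; first_k_dense_replace = 3 and the module names are illustrative examples):

import re

first_k_dense_replace = 3  # hypothetical value read from the model config
layer_range = f"[0-{first_k_dense_replace - 1}]"
dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$"
print(dense_pattern)  # re:model\.layers\.[0-2]\.mlp\..*$

# Everything after the "re:" prefix is a plain Python regex; note that a character
# class like [0-2] only spans single-digit layer indices.
regex = re.compile(dense_pattern[len("re:"):])
print(bool(regex.match("model.layers.2.mlp.gate_proj")))   # True  -> dense layer, ignored
print(bool(regex.match("model.layers.12.mlp.gate_proj")))  # False -> MoE layer, quantized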
@@ -214,7 +261,11 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt
     # Tokenize the data
     def tokenize(sample):
         return tokenizer(
-            sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False
+            sample["text"],
+            padding=False,
+            max_length=max_length,
+            truncation=True,
+            add_special_tokens=False
         )
 
     ds = ds.map(tokenize, remove_columns=ds.column_names)
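A generic sketch of the tokenizer call reformatted above (not part of the diff; "gpt2" is only a small placeholder checkpoint and the sample text is made up):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint for illustration
sample = {"text": "Hello world, this is a calibration sample."}
encoded = tok(
    sample["text"],
    padding=False,
    max_length=8,
    truncation=True,           # clip the sample to max_length tokens
    add_special_tokens=False   # use the calibration text as-is, without adding BOS/EOS
)
print(len(encoded["input_ids"]))  # <= 8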
@@ -255,32 +306,97 @@ def main():
     # 0) Check for dense layers and update ignore patterns
     # Dense layers in the first few layers should not be quantized
     updated_ignore_patterns = check_dense_layers_and_update_ignore(
-        args.model_id, args.ignore_patterns, args.trust_remote_code
+        args.model_id,
+        args.ignore_patterns,
+        args.trust_remote_code
     )
 
     # --------------------------------------------------------------------
     # 1) Build a dummy model (no weights) to infer a device map
     # This determines optimal device placement for each module
-    print("🔍 Inferring device map...")
-    with init_empty_weights():
-        dummy = AutoModelForCausalLM.from_pretrained(
-            args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
-        )
-    device_map = infer_auto_device_map(dummy, no_split_module_classes=dummy._no_split_modules)
-    del dummy
-
-    # Force all modules to CPU for quantization
     if args.force_cpu:
-        device_map = {name: "cpu" for name in device_map}
+        # In force_cpu mode, directly get module names without calling infer_auto_device_map
+        # to avoid GPU memory allocation
+        print("🔍 Building CPU-only device map...")
+        with init_empty_weights():
+            dummy = AutoModelForCausalLM.from_pretrained(
+                args.model_id,
+                torch_dtype=torch_dtype,
+                trust_remote_code=args.trust_remote_code
+            )
+        device_map = {name: "cpu" for name, _ in dummy.named_modules() if name}
+        del dummy
+    else:
+        print("🔍 Inferring device map...")
+        with init_empty_weights():
+            dummy = AutoModelForCausalLM.from_pretrained(
+                args.model_id,
+                torch_dtype=torch_dtype,
+                trust_remote_code=args.trust_remote_code
+            )
+        # Build max_memory dict if specified
+        max_memory = None
+        if args.max_gpu_memory or args.max_cpu_memory:
+            max_memory = {}
+            if args.max_gpu_memory:
+                # Apply to all available GPUs
+                num_gpus = torch.cuda.device_count()
+                for i in range(num_gpus):
+                    max_memory[i] = args.max_gpu_memory
+                print(f" GPU memory limit: {args.max_gpu_memory} per device ({num_gpus} GPUs)")
+
+            # Always set CPU memory when max_memory is used
+            # Otherwise infer_auto_device_map may trigger disk offloading
+            if args.max_cpu_memory:
+                max_memory["cpu"] = args.max_cpu_memory
+                print(f" CPU memory limit: {args.max_cpu_memory}")
+            else:
+                # Use a very large value to allow using all available CPU memory
+                # This prevents disk offloading when user has enough RAM
+                max_memory["cpu"] = "1000GiB"
+                print(f" CPU memory limit: 1000GiB (default, to prevent disk offloading)")
+
+        device_map = infer_auto_device_map(
+            dummy,
+            no_split_module_classes=dummy._no_split_modules,
+            max_memory=max_memory
+        )
+
+        # Check if disk offloading was triggered (not supported by llmcompressor)
+        disk_modules = [k for k, v in device_map.items() if v == "disk"]
+        if disk_modules:
+            print(f"❌ Error: {len(disk_modules)} modules would be offloaded to disk.")
+            print(" llmcompressor does not support disk offloading.")
+            print(" Solutions:")
+            print(" 1. Increase --max_gpu_memory to use more GPU memory")
+            print(" 2. Add --max_cpu_memory with higher value (e.g., '200GiB')")
+            print(" 3. Ensure your machine has enough GPU + CPU memory")
+            raise RuntimeError("Disk offloading is not supported by llmcompressor. "
+                               "Please ensure you have enough GPU + CPU memory.")
+
+        del dummy
 
     # --------------------------------------------------------------------
     # 2) Load the full model weights with device mapping
+    # Note: offload_folder=None disables disk offloading (not supported by llmcompressor)
     print("📥 Loading model...")
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model_id,
-        device_map=device_map,
-        torch_dtype=torch_dtype,
-        trust_remote_code=args.trust_remote_code,
-    )
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_id,
+            device_map=device_map,
+            torch_dtype=torch_dtype,
+            trust_remote_code=args.trust_remote_code,
+            offload_folder=None,  # Disable disk offloading (not supported by llmcompressor)
+        )
+    except Exception as e:
+        if "disk" in str(e).lower() or "offload" in str(e).lower():
+            print(f"❌ Error: Not enough GPU + CPU memory to load the model.")
+            print(" llmcompressor does not support disk offloading.")
+            print(" Solutions:")
+            print(" 1. Increase --max_gpu_memory to use more GPU memory")
+            print(" 2. Ensure you have enough CPU RAM for remaining layers")
+            print(" 3. Use a machine with more memory")
+            raise
+        raise
 
     tokenizer = AutoTokenizer.from_pretrained(args.model_id)
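A minimal sketch of the accelerate calls the new device-map path relies on (not part of the diff; the tiny nn.Sequential stands in for the real model and the memory limit is a placeholder chosen to force an overflow):

from torch import nn
from accelerate import infer_auto_device_map, init_empty_weights

# Build a throwaway module with meta (unallocated) weights, as the script does for the real model
with init_empty_weights():
    dummy = nn.Sequential(*[nn.Linear(4096, 4096) for _ in range(8)])

# Cap placement; modules that do not fit under these limits are assigned to "disk",
# which the script above treats as a hard error.
max_memory = {"cpu": "200MB"}  # deliberately tight placeholder limit
device_map = infer_auto_device_map(dummy, max_memory=max_memory)

disk_modules = [name for name, device in device_map.items() if device == "disk"]
print(device_map)
print("disk offload needed:", bool(disk_modules))  # True with this tight limit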
@@ -293,7 +409,7 @@ def main():
         args.num_calibration_samples,
         args.max_sequence_length,
         tokenizer,
-        args.random_seed,
+        args.random_seed
     )
 
     # --------------------------------------------------------------------
@@ -331,4 +447,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()