Qwen3量化实践

PyTorch实现Qwen3的AWQ以及GPTQ量化。

AWQ

使用AutoAWQ

但AutoAWQ目前已不再支持,不知道后续新出的模型是否还能用这个方法。

1
2
3
4
5
Important Notice:

* AutoAWQ is officially deprecated and will no longer be maintained.
* The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
* If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# coding=utf-8
# Quantize a causal LM with AutoAWQ, calibrating on a labeled JSONL dataset.

import argparse
import json
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import prompt_utils

parser = argparse.ArgumentParser(description="LLM_CLS")
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--quant_path", type=str, required=True)
parser.add_argument("--data_file", type=str, required=True)

args = parser.parse_args()

system_prompt = prompt_utils.system_prompt_v2.system_prompt
label_map = prompt_utils.system_prompt_v2.label_map

# AWQ settings: 4-bit weights, group size 128, GEMM kernel variant.
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
model = AutoAWQForCausalLM.from_pretrained(
    args.model_path, device_map="auto", use_cache=False
)

# Build calibration texts from the dataset (one JSON object per line with
# "text" and "label" keys). Each sample is rendered as a full chat turn so
# calibration statistics match the deployment prompt format.
data = []
with open(args.data_file, "r", encoding="utf-8") as fi:
    for line in fi:
        line_js = json.loads(line)
        text = line_js["text"]
        label = line_js["label"]
        if label not in label_map:
            raise ValueError(f"Unknown label: {label}")
        label = label_map[label]
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
            {"role": "assistant", "content": label},
        ]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False, enable_thinking=False
        )
        data.append(text.strip())

model.quantize(
    tokenizer,
    quant_config=quant_config,
    calib_data=data,
    n_parallel_calib_samples=1,  # number of calibration samples processed in parallel
    max_calib_samples=256,  # maximum number of calibration samples used
    max_calib_seq_len=10240,  # maximum calibration sequence length in tokens
)
model.save_quantized(args.quant_path, safetensors=True, shard_size="4GB")
tokenizer.save_pretrained(args.quant_path)
print("Quantization completed.")

GPTQ

使用GPTQModel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# coding=utf-8
# Quantize a causal LM with GPTQ (via GPTQModel), calibrating on a labeled JSONL dataset.

import argparse
import json
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer
import prompt_utils

parser = argparse.ArgumentParser(description="LLM_CLS")
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--quant_path", type=str, required=True)
parser.add_argument("--data_file", type=str, required=True)

args = parser.parse_args()

system_prompt = prompt_utils.system_prompt_v2.system_prompt
label_map = prompt_utils.system_prompt_v2.label_map

# GPTQ settings: 4-bit weights, group size 128.
quantize_config = QuantizeConfig(bits=4, group_size=128)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
model = GPTQModel.load(args.model_path, quantize_config)

# Build calibration texts from the dataset (one JSON object per line with
# "text" and "label" keys). Each sample is rendered as a full chat turn so
# calibration statistics match the deployment prompt format.
data = []
with open(args.data_file, "r", encoding="utf-8") as fi:
    for line in fi:
        line_js = json.loads(line)
        text = line_js["text"]
        label = line_js["label"]
        if label not in label_map:
            raise ValueError(f"Unknown label: {label}")
        label = label_map[label]
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
            {"role": "assistant", "content": label},
        ]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False, enable_thinking=False
        )
        data.append(text.strip())

# Calibrate on at most 1024 samples, two at a time.
model.quantize(data[:1024], batch_size=2)
model.save(args.quant_path)
# NOTE(review): tokenizer was deliberately not re-saved here in the original;
# confirm whether GPTQModel.save already persists it to quant_path.
# tokenizer.save_pretrained(args.quant_path)
print("Quantization completed.")