Qwen3 development
Qwen docs: https://qwen.readthedocs.io/zh-cn/latest/
Examples: https://github.com/QwenLM/Qwen-Agent/tree/main/examples
MCP server list: https://github.com/modelcontextprotocol/servers
pip install "qwen-agent[mcp,code_interpreter]"
Disabling thinking
Appending /no_think to the end of the prompt or the user message disables thinking mode.
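A minimal sketch of this soft switch, assuming the OpenAI-compatible endpoint used in the example below (the base_url, model name, and prompt are illustrative):

from openai import OpenAI

client = OpenAI(base_url="http://192.168.1.163:11434/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="qwen3:32b",
    messages=[{"role": "user", "content": "介绍一下Qwen3 /no_think"}],  # /no_think appended to the user turn
)
print(resp.choices[0].message.content)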
# Pass the enable_thinking parameter through the OpenAI-compatible ChatOpenAI wrapper
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model_name="qwen3:32b",
    temperature=0.7,
    model_kwargs={  # forwarded via model_kwargs
        "chat_template_kwargs": {  # Qwen3 reads this field in vLLM / the chat template
            "enable_thinking": False  # disable thinking mode
        }
    }
)
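A quick smoke test (the prompt is illustrative). Note that depending on the langchain-openai version, fields the OpenAI client does not recognize may have to be forwarded through extra_body rather than model_kwargs:

print(llm.invoke("介绍一下Qwen3").content)  # should answer without a <think> block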
Example
from qwen_agent.agents import Assistant

# LLM configuration
llm_cfg = {
    'model': 'qwen3:32b',  # model name to use
    # Using the endpoint provided by Alibaba's DashScope model service:
    # 'model_type': 'qwen_dashscope',
    # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
    # Using a custom OpenAI-compatible endpoint:
    'model_server': 'http://192.168.1.163:11434/v1',  # API base URL
    'api_key': 'EMPTY',  # API key; 'EMPTY' means no authentication is required
    # Other parameters:
    # 'generate_cfg': {
    #     # Set this when the response arrives as `<think>reasoning</think>answer`
    #     # Omit it when the response is already split into reasoning_content and content
    #     'thought_in_content': True,
    # },
}

# Tool definitions
tools = [
    {'mcpServers': {  # an MCP configuration can be specified here
        'time': {  # time tool: returns current time information
            'command': 'uvx',  # command to execute
            'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai']  # arguments; timezone set to Shanghai
        },
        'fetch': {  # fetch tool: retrieves web page content
            'command': 'uvx',
            'args': ['mcp-server-fetch']
        }
    }},
    'code_interpreter',  # built-in tool: code interpreter for running code
]

# Define the agent
bot = Assistant(llm=llm_cfg, function_list=tools)

# Stream the response
messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ 介绍qwen的最新发展'}]
for responses in bot.run(messages=messages):
    pass  # drain the stream; `responses` keeps the final chunk
print(responses)
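For a multi-turn conversation, Qwen-Agent's convention is to append the returned messages to the history before the next call; a minimal sketch (the follow-up prompt is illustrative):

messages.extend(responses)  # responses is a list of assistant/tool messages
messages.append({'role': 'user', 'content': '用一句话总结'})
for responses in bot.run(messages=messages):
    pass
print(responses)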
Function calling (function_call)
functions = [{
    'name': 'get_current_weather',
    'description': 'Get the current weather in a given location',
    'parameters': {
        'type': 'object',
        'properties': {
            'location': {
                'type': 'string',
                'description': 'The city and state, e.g. San Francisco, CA',
            },
            'unit': {
                'type': 'string',
                'enum': ['celsius', 'fahrenheit']
            },
        },
        'required': ['location'],
    },
}]
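The schema by itself does not execute anything; below is a hedged sketch of passing it through Qwen-Agent's LLM interface, following the function-calling example in the Qwen-Agent repo (endpoint reused from above, prompt illustrative):

from qwen_agent.llm import get_chat_model

llm = get_chat_model({
    'model': 'qwen3:32b',
    'model_server': 'http://192.168.1.163:11434/v1',
    'api_key': 'EMPTY',
})
messages = [{'role': 'user', 'content': "What's the weather like in San Francisco?"}]
responses = llm.chat(messages=messages, functions=functions, stream=False)
print(responses)  # expect a message carrying a function_call for get_current_weather

Per that example, if the model emits a function_call, the application runs the real function itself and appends the result as a {'role': 'function', ...} message before calling chat again.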
Qwen3-VL model tuning: FlashAttention acceleration
https://github.com/Dao-AILab/flash-attention/releases
Check the latest binary release for a wheel matching your Python, CUDA, and torch versions.
For example: uv pip install "torch==2.8.0+cu129" "torchvision==0.23.0+cu129" "torchaudio==2.8.0+cu129" --index-url https://download.pytorch.org/whl/cu129
Then download the matching flash-attn wheel from the GitHub releases page and install it: uv pip install ./flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
uv pip install "qwen-vl-utils[decord]" transformers
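A quick sanity check that the wheel imports cleanly against the installed torch/CUDA (version numbers will differ):
python -c "import torch, flash_attn; print(torch.__version__, torch.version.cuda, flash_attn.__version__)"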
Code:
import time

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
MODEL_NAME = "/home/ubuntu/ai_model/Qwen3-VL-8B-Instruct"

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# especially in multi-image and video scenarios.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "data_video/WPT_1.mp4",
            },
            # {"type": "text", "text": "解说这个视频,用中文回答,要有细节,回答不超过100字"},
            {"type": "text", "text": "在适当的时候给出或犀利或幽默的像真人的评论。返回json格式,key有timestamp和comment,timestamp是视频的秒数,comment是评论"},
            # {"type": "text", "text": "生成详细的一系列事件描述,返回json格式,每个事件的key有start和end和desc,start是事件开始时间,end是事件结束时间,desc是事件描述,每个事件描述不超过100字"},
        ],
    }
]

# Start time (string + timestamp, for display and elapsed-time calculation)
start_time_ts = time.time()
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time_ts))
print(f"Start time: {start_time}")

# Preparation for inference
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
)
inputs = inputs.to(model.device)

# Inference: generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

# End time (string + timestamp, for display and elapsed-time calculation)
end_time_ts = time.time()
end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time_ts))
print(f"\nEnd time: {end_time}")
elapsed_seconds = end_time_ts - start_time_ts
print(f"Total time: {elapsed_seconds:.2f} s")