Qwen3 Development

Qwen docs: https://qwen.readthedocs.io/zh-cn/latest/
Examples: https://github.com/QwenLM/Qwen-Agent/tree/main/examples
MCP server list: https://github.com/modelcontextprotocol/servers

pip install "qwen-agent[mcp,code_interpreter]"

Disabling thinking

Appending /no_think to the end of the prompt or of a user message disables thinking.
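For example, with the plain OpenAI client (a minimal sketch; it assumes the same local OpenAI-compatible endpoint used in the examples below is running, and the prompt is illustrative):

from openai import OpenAI

client = OpenAI(base_url="http://192.168.1.163:11434/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="qwen3:32b",
    messages=[{"role": "user", "content": "Briefly introduce yourself /no_think"}],
)
print(resp.choices[0].message.content)  # expect no <think>...</think> block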

# Pass enable_thinking via the OpenAI-compatible ChatOpenAI wrapper
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="qwen3:32b",
    base_url="http://192.168.1.163:11434/v1",  # same local endpoint as in the example below
    api_key="EMPTY",
    temperature=0.7,
    # use extra_body (rather than model_kwargs) so the OpenAI SDK forwards the
    # unknown parameter in the request body instead of rejecting it
    extra_body={
        "chat_template_kwargs": {    # Qwen3 reads this field in vLLM / the chat template
            "enable_thinking": False # turn thinking mode off
        }
    },
)
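A quick check that the flag took effect (hypothetical prompt):

print(llm.invoke("Who are you?").content)  # with thinking disabled, no <think>...</think> block appears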

Example

from qwen_agent.agents import Assistant

# LLM configuration
llm_cfg = {
    'model': 'qwen3:32b',  # model name to use

    # Use the endpoint provided by Alibaba's DashScope model service:
    # 'model_type': 'qwen_dashscope',
    # 'api_key': os.getenv('DASHSCOPE_API_KEY'),

    # Use a custom OpenAI-compatible endpoint:
    'model_server': 'http://192.168.1.163:11434/v1',  # API base URL
    'api_key': 'EMPTY',  # API key; 'EMPTY' means no authentication is required

    # Other parameters:
    # 'generate_cfg': {
    #     # Set this when responses arrive as `<think>reasoning</think>answer`
    #     # Leave it unset when responses are already split into reasoning_content and content
    #     'thought_in_content': True,
    # },
}

# Tool definitions
tools = [
    {'mcpServers': {  # an MCP configuration can be given inline like this
        'time': {  # time tool: returns the current time
            'command': 'uvx',  # command to run
            'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai']  # arguments; timezone set to Shanghai
        },
        'fetch': {  # fetch tool: retrieves web page content
            'command': 'uvx',  # command to run
            'args': ['mcp-server-fetch']  # arguments
        }
    }},
    'code_interpreter',  # built-in tool: code interpreter for running code
]

# Define the agent
bot = Assistant(llm=llm_cfg, function_list=tools)

# Generate the response (streaming)
messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen'}]

# bot.run yields the accumulated response list at each step; keep only the final one
for responses in bot.run(messages=messages):
    pass
print(responses)
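To print the stream incrementally instead of only the final result, newer Qwen-Agent releases ship a typewriter_print helper (assumed present in the installed version; it prints only the newly generated part on each iteration):

from qwen_agent.utils.output_beautify import typewriter_print

response_plain_text = ''
for responses in bot.run(messages=messages):
    response_plain_text = typewriter_print(responses, response_plain_text)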

function_call tool invocation

functions = [{
    'name': 'get_current_weather',
    'description': 'Get the current weather in a given location',
    'parameters': {
        'type': 'object',
        'properties': {
            'location': {
                'type': 'string',
                'description': 'The city and state, e.g. San Francisco, CA',
            },
            'unit': {
                'type': 'string',
                'enum': ['celsius', 'fahrenheit'],
            },
        },
        'required': ['location'],
    },
}]
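A minimal sketch of driving this schema, following the pattern of the official Qwen-Agent function-calling example (get_chat_model reuses the llm_cfg defined above; the weather prompt is illustrative):

from qwen_agent.llm import get_chat_model

llm = get_chat_model(llm_cfg)
messages = [{'role': 'user', 'content': "What's the weather like in San Francisco?"}]

# each iteration yields the accumulated response list; keep the final one
responses = []
for responses in llm.chat(messages=messages, functions=functions, stream=True):
    pass

for msg in responses:
    if msg.get('function_call'):
        # the model requests a tool call: {'name': ..., 'arguments': '{...}'}
        print('function requested:', msg['function_call'])
    else:
        print(msg.get('content'))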

Qwen3-VL tuning: flash-attention acceleration

https://github.com/Dao-AILab/flash-attention/releases
Check the latest binary wheels there for ones matching your Python, CUDA, and torch versions.

For example:
uv pip install "torch==2.8.0+cu129" "torchvision==0.23.0+cu129" "torchaudio==2.8.0+cu129" --index-url https://download.pytorch.org/whl/cu129

Then download the matching flash-attention wheel from the GitHub releases page and install it:
uv pip install ./flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
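To verify the wheel actually loads in this environment (flash_attn exposes __version__):

python -c "import flash_attn; print(flash_attn.__version__)"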

uv pip install "qwen-vl-utils[decord]" transformers

Code:

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
import time

# MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"
MODEL_NAME = "/home/ubuntu/ai_model/Qwen3-VL-8B-Instruct"


# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "data_video/WPT_1.mp4",
            },
            # {"type": "text", "text": "解说这个视频,用中文回答,要有细节,回答不超过100字"},
            {"type": "text", "text": "在适当的时候给出或犀利或幽默的像真人的评论。返回json格式,key有timestamp和comment,timestamp是视频的秒数,comment是评论"},
            # {"type": "text", "text": "生成详细的一系列事件描述,返回json格式,每个事件的key有start和end和desc,start是事件开始时间,end是事件结束时间,desc是事件描述,每个事件描述不超过100字"},
        ],
    }
]

# Start time (string + timestamp, for display and elapsed-time calculation)
start_time_ts = time.time()
start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time_ts))
print(f"Start time: {start_time}")

# Preparation for inference
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
)
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


# End time (string + timestamp, for display and elapsed-time calculation)
end_time_ts = time.time()
end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time_ts))
print(f"\nEnd time: {end_time}")
elapsed_seconds = end_time_ts - start_time_ts
print(f"Total time: {elapsed_seconds:.2f} s")