Here is the local deployment script:
import os

# vLLM/runtime configuration must be set before torch and vllm are imported
os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"  # expose only the first GPU

import torch
import warnings
import numpy as np

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

from qwen_omni_utils import process_mm_info
from transformers import Qwen3OmniMoeProcessor


def _load_model_processor():
    # Expects the globals MODEL_PATH, USE_TRANSFORMERS and
    # TRANSFORMERS_USE_FLASH_ATTN2 to be defined (see step 5 below).
    if USE_TRANSFORMERS:
        from transformers import Qwen3OmniMoeForConditionalGeneration
        if TRANSFORMERS_USE_FLASH_ATTN2:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH,
                dtype='auto',
                attn_implementation='flash_attention_2',
                device_map="auto")
        else:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH, device_map="auto", dtype='auto')
    else:
        from vllm import LLM
        model = LLM(
            model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.95,
            tensor_parallel_size=torch.cuda.device_count(),
            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},
            max_num_seqs=1,
            max_model_len=32768,
            seed=1234,
        )
    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
    return model, processor


def run_model(model, processor, messages, return_audio, use_audio_in_video):
    if USE_TRANSFORMERS:
        # Render the chat template, then collect audio/image/video inputs
        text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = processor(text=text, audio=audios, images=images, videos=videos,
                           return_tensors="pt", padding=True,
                           use_audio_in_video=use_audio_in_video)
        inputs = inputs.to(model.device).to(model.dtype)
        text_ids, audio = model.generate(**inputs,
                                         thinker_return_dict_in_generate=True,
                                         thinker_max_new_tokens=8192,
                                         thinker_do_sample=False,
                                         speaker="Ethan",
                                         use_audio_in_video=use_audio_in_video,
                                         return_audio=return_audio)
        # Decode only the newly generated tokens, skipping the prompt
        response = processor.batch_decode(text_ids.sequences[:, inputs["input_ids"].shape[1]:],
                                          skip_special_tokens=True,
                                          clean_up_tokenization_spaces=False)[0]
        if audio is not None:
            # Scale the float waveform in [-1, 1] to 16-bit PCM samples
            audio = np.array(audio.reshape(-1).detach().cpu().numpy() * 32767).astype(np.int16)
        return response, audio
    else:
        from vllm import SamplingParams
        # Near-greedy sampling for stable, reproducible text output
        sampling_params = SamplingParams(temperature=1e-2, top_p=0.1, top_k=1, max_tokens=8192)
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = {'prompt': text, 'multi_modal_data': {},
                  "mm_processor_kwargs": {"use_audio_in_video": use_audio_in_video}}
        if images is not None:
            inputs['multi_modal_data']['image'] = images
        if videos is not None:
            inputs['multi_modal_data']['video'] = videos
        if audios is not None:
            inputs['multi_modal_data']['audio'] = audios
        outputs = model.generate(inputs, sampling_params=sampling_params)
        response = outputs[0].outputs[0].text
        # The vLLM path returns text only; no audio is generated here
        return response, None
Deployment process: model files you need to download
Yes, you need to download the Qwen3OmniMoe model files first. Qwen3-Omni is a large-scale multimodal model (text, image, audio, video) developed by Alibaba's Qwen team.
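One way to fetch the weights is with huggingface_hub (a sketch; the repo id "Qwen/Qwen3-Omni-30B-A3B-Instruct" and the local directory are assumptions, check the Qwen collection on Hugging Face for the checkpoint you actually want):

from huggingface_hub import snapshot_download  # pip install huggingface_hub

# Download the full checkpoint to a local folder;
# repo id and target path below are placeholders, not prescriptions
local_dir = snapshot_download(
    repo_id="Qwen/Qwen3-Omni-30B-A3B-Instruct",  # hypothetical: pick your checkpoint
    local_dir=r"D:\models\Qwen3-Omni",           # hypothetical Windows path
)
print("Model downloaded to:", local_dir)

Point MODEL_PATH (step 3 below) at this directory.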
Windows setup steps
1. Environment preparation
Install Python 3.8+
Install the NVIDIA driver and a CUDA toolkit version that supports your GPU
Install PyTorch with CUDA support (a quick sanity check follows below)
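Once PyTorch is installed, a minimal check that the GPU is actually visible (nothing here is model-specific):

import torch

# Verify that CUDA is available and which device the script will use
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device count:", torch.cuda.device_count())
    print("Device 0:", torch.cuda.get_device_name(0))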
2. Install required dependencies

pip install torch torchvision torchaudio
pip install transformers
pip install accelerate       # required for device_map="auto"
pip install numpy
pip install qwen-omni-utils  # provides process_mm_info for multimodal preprocessing
For vLLM support (optional):

pip install vllm
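After installing, a quick import smoke test (a sketch; it only verifies that the packages resolve, not that the model runs):

# Confirm the core imports used by the deployment script resolve
import torch
import transformers
from qwen_omni_utils import process_mm_info

print("torch:", torch.__version__, "| CUDA build:", torch.version.cuda)
print("transformers:", transformers.__version__)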
3. Configure variables and parameters
On Windows, you need to set the following key variables before calling the functions above:
MODEL_PATH: path to your downloaded model files
USE_TRANSFORMERS: set to True to use HuggingFace Transformers, or False to use vLLM
TRANSFORMERS_USE_FLASH_ATTN2: (optional) whether to use Flash Attention 2 (see the sketch after this list)
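One way to set these without editing the script each time is to read them from environment variables (a sketch; the variable names QWEN3_OMNI_* are made up for illustration):

import os

# Hypothetical environment-variable names; adjust to taste
MODEL_PATH = os.environ.get("QWEN3_OMNI_MODEL_PATH", r"D:\models\Qwen3-Omni")
USE_TRANSFORMERS = os.environ.get("QWEN3_OMNI_BACKEND", "transformers") == "transformers"
TRANSFORMERS_USE_FLASH_ATTN2 = os.environ.get("QWEN3_OMNI_FLASH_ATTN2", "0") == "1"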
4. Adapting the code for a single GPU
The script already restricts itself to the first GPU via os.environ['CUDA_VISIBLE_DEVICES'] = "0", which matches your single-GPU environment. With only one device visible, torch.cuda.device_count() returns 1, so the vLLM path's tensor_parallel_size resolves to 1 as well.
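If the machine has several GPUs and you want a different one, change the index before torch is imported (a minimal sketch):

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"  # e.g. use the second GPU instead

import torch
assert torch.cuda.device_count() == 1  # only the selected device is visible now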
5. Running the program

# Set the required global variables
MODEL_PATH = "path/to/your/model"      # change to your actual model path
USE_TRANSFORMERS = True                # or False to use vLLM
TRANSFORMERS_USE_FLASH_ATTN2 = False   # set as needed

# Load the model and run inference; content is a list of typed parts,
# the format process_mm_info expects
model, processor = _load_model_processor()
messages = [{"role": "user", "content": [{"type": "text", "text": "your question"}]}]
response, audio = run_model(model, processor, messages, return_audio=False, use_audio_in_video=False)
print(response)
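If you set return_audio=True (Transformers backend only in this script), run_model returns 16-bit PCM samples; a sketch for saving them, assuming the 24 kHz sample rate used in the official Qwen3-Omni examples:

import soundfile as sf  # pip install soundfile

response, audio = run_model(model, processor, messages,
                            return_audio=True, use_audio_in_video=False)
print(response)
if audio is not None:
    # audio is an int16 numpy array; 24000 Hz matches the official examples
    sf.write("output.wav", audio, samplerate=24000, subtype="PCM_16")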
6. Notes
Make sure you have enough disk space for the model files (typically tens of GB)
A single GPU's memory may be tight; consider adjusting (e.g. lowering) the gpu_memory_utilization parameter (see the pre-flight check below)
Some dependencies have limited native Windows support, vllm and flash-attn in particular; WSL2 is a common workaround
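A small pre-flight check for the first two points (a sketch; the model path is a placeholder, check the actual checkpoint size yourself):

import shutil
import torch

# Free disk space where the model will be stored (hypothetical path)
free_gb = shutil.disk_usage(r"D:\models").free / 1024**3
print(f"Free disk space: {free_gb:.0f} GB")  # the checkpoint can run to tens of GB

# Total VRAM on the visible GPU, to judge gpu_memory_utilization headroom
if torch.cuda.is_available():
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU VRAM: {vram_gb:.0f} GB")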