[ PROMPT_NODE_22470 ]

部署

[ SKILL_DOCUMENTATION ]

# 生产环境部署指南

在生产环境中部署 SGLang 的完整指南。

## 服务器部署

### 基础服务器

```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000 --mem-fraction-static 0.9
```

### 多 GPU (张量并行)

```bash
# 在 4 个 GPU 上运行 Llama 3-70B
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-70B-Instruct --tp 4 --port 30000
```

### 量化

```bash
# FP8 量化 (H100)
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-70B-Instruct --quantization fp8 --tp 4

# INT4 AWQ 量化
python -m sglang.launch_server --model-path TheBloke/Llama-2-70B-AWQ --quantization awq --tp 2

# INT4 GPTQ 量化
python -m sglang.launch_server --model-path TheBloke/Llama-2-70B-GPTQ --quantization gptq --tp 2
```

## Docker 部署

### Dockerfile

```dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# 安装 Python
RUN apt-get update && apt-get install -y python3.10 python3-pip git

# 安装 SGLang
RUN pip3 install "sglang[all]" flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/

# 复制模型 (或在运行时下载)
WORKDIR /app

# 暴露端口
EXPOSE 30000

# 启动服务器
CMD ["python3", "-m", "sglang.launch_server", "--model-path", "meta-llama/Meta-Llama-3-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
```

### 构建与运行

```bash
# 构建镜像
docker build -t sglang:latest .

# 使用 GPU 运行
docker run --gpus all -p 30000:30000 sglang:latest

# 使用指定 GPU 运行
docker run --gpus '"device=0,1,2,3"' -p 30000:30000 sglang:latest

# 使用自定义模型运行
docker run --gpus all -p 30000:30000 -e MODEL_PATH="meta-llama/Meta-Llama-3-70B-Instruct" -e TP_SIZE="4" sglang:latest
```

## Kubernetes 部署

### 部署 YAML

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-llama3-70b
spec:
  replicas: 2
  selector:
    matchLabels:
      app: sglang
  template:
    metadata:
      labels:
        app: sglang
    spec:
      containers:
        - name: sglang
          image: sglang:latest
          command:
            - python3
            - -m
            - sglang.launch_server
            - --model-path=meta-llama/Meta-Llama-3-70B-Instruct
            - --tp=4
            - --host=0.0.0.0
            - --port=30000
            - --mem-fraction-static=0.9
```

数据来源：claude-code-templates（MIT），中文翻译由 AI 生成。详见关于我们。

BAGUA AI