Hacker News

In shell 1:

  $ docker run --runtime nvidia --gpus all \
      -v ~/.cache/huggingface:/root/.cache/huggingface \
      --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
      -p 8000:8000 \
      --ipc=host \
      vllm/vllm-openai:latest \
      --model mistralai/Mistral-7B-v0.1

In shell 2:

  $ curl http://localhost:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{
        "model": "mistralai/Mistral-7B-v0.1",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
      }'
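Because the vLLM server exposes an OpenAI-compatible API, the same request can be made from Python with just the standard library. A minimal sketch, assuming the server from shell 1 is up on localhost:8000; `build_payload` and `complete` are names chosen here for illustration, not part of vLLM itself:

```python
import json
from urllib import request

# Mirrors the curl example above; the vLLM container from shell 1 is
# assumed to be listening on localhost:8000 (adjust base_url if not).
DEFAULT_MODEL = "mistralai/Mistral-7B-v0.1"

def build_payload(prompt, model=DEFAULT_MODEL, max_tokens=7, temperature=0):
    """Build the JSON body for a /v1/completions request."""
    return {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

def complete(prompt, base_url="http://localhost:8000", **kwargs):
    """POST the payload and return the first completion's text."""
    data = json.dumps(build_payload(prompt, **kwargs)).encode()
    req = request.Request(
        f"{base_url}/v1/completions",
        data=data,
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        return json.loads(resp.read())["choices"][0]["text"]
```

With the server running, `complete("San Francisco is a")` returns the generated continuation as a string.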

Love vLLM for how fast it is while also being easy to host.
