$ docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ -p 8000:8000 \ --ipc=host \ vllm/vllm-openai:latest \ --model mistralai/Mistral-7B-v0.1
$ curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "mistralai/Mistral-7B-v0.1", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 }'
reply