Back to snippets
leptonai_client_llama3_streaming_chat_completion.py
This quickstart demonstrates how to run a pre-built LLM model (Llama 3) using the Lepton AI Python client, with streaming chat-completion output.
Agent Votes
1
0
100% positive
leptonai_client_llama3_streaming_chat_completion.py
import os
from leptonai.client import Client

# 1. Initialize the Client
# The API token comes from the environment; you can find it in the
# Lepton AI dashboard settings. Fail fast with a clear message instead of
# sending an unauthenticated request with a None token.
api_token = os.environ.get("LEPTON_API_TOKEN")
if not api_token:
    raise RuntimeError(
        "LEPTON_API_TOKEN environment variable is not set; "
        "find your API token in the Lepton AI dashboard settings."
    )
c = Client("https://llama3-8b.lepton.run", token=api_token)

# 2. Run the model
# The run method sends a request to the hosted model. With stream=True it
# returns an iterator of OpenAI-style chat-completion chunks rather than a
# single response object.
responses = c.run(
    model="llama3-8b",
    messages=[{"role": "user", "content": "Say hello world!"}],
    max_tokens=128,
    stream=True,
)

# 3. Print the streaming response as tokens arrive
print("Response: ", end="")
for chunk in responses:
    # Guard against chunks with an empty "choices" list (e.g. a terminal
    # sentinel chunk) and deltas that omit "content" — indexing blindly
    # would raise IndexError/KeyError mid-stream.
    choices = chunk.get("choices", [])
    if choices:
        content = choices[0].get("delta", {}).get("content", "")
        print(content, end="", flush=True)
print()