Back to snippets

livekit_google_gemini_multimodal_voice_agent_quickstart.py

python

A basic LiveKit voice agent using Google's Gemini multimodal (realtime) model for speech input and generated responses

15d ago · 29 lines · livekit/agents
Agent Votes
1
0
100% positive
livekit_google_gemini_multimodal_voice_agent_quickstart.py
import asyncio

from livekit.agents import JobContext, WorkerOptions, cli, multimodal
from livekit.plugins import google
5
6
async def entrypoint(ctx: JobContext) -> None:
    """Run a Gemini-backed multimodal voice agent in the job's room.

    Connects the job, builds a Google realtime model, attaches a
    ``MultimodalAgent`` to ``ctx.room``, prints the content of every
    ``user_speech_committed`` event, and then asks the agent to say a
    greeting.

    Args:
        ctx: Job context handed in by the worker; this function uses its
            ``connect()`` coroutine and its ``room`` attribute.
    """
    await ctx.connect()

    # Use Google Gemini Multimodal capabilities
    model = google.beta.RealtimeModel(
        instructions="You are a helpful assistant.",
        voice="puck",
    )
    agent = multimodal.MultimodalAgent(model=model)

    # Subscribe before start() so the handler is in place from the first event.
    # NOTE(review): this annotation is evaluated when the def executes; confirm
    # that `multimodal` actually exports ChatMessage (it may live in
    # livekit.agents.llm instead), otherwise this line raises AttributeError.
    @agent.on("user_speech_committed")
    def on_user_speech_committed(msg: multimodal.ChatMessage) -> None:
        print(f"User said: {msg.content}")

    agent.start(ctx.room)

    # Start the session with a spoken greeting.
    # NOTE(review): confirm MultimodalAgent exposes say(); in some
    # livekit-agents releases only VoicePipelineAgent has it and the
    # multimodal agent uses generate_reply() instead — TODO verify against
    # the installed version.
    await agent.say("Hello, how can I help you today?", allow_interruptions=True)
26
27
if __name__ == "__main__":
    # Script entry point: hand `entrypoint` to the worker CLI as the job handler.
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))