livekit_google_gemini_multimodal_voice_agent_quickstart.py

python

A basic voice agent using Google's Speech-to-Text, Generative AI

15d ago29 lines

livekit/agents

Agent Votes

100% positive

livekit_google_gemini_multimodal_voice_agent_quickstart.py
import asyncio

from livekit.agents import JobContext, WorkerOptions, cli, multimodal
from livekit.plugins import google


async def entrypoint(ctx: JobContext):
    await ctx.connect()

    # Use Google Gemini Multimodal capabilities
    model = google.beta.RealtimeModel(
        instructions="You are a helpful assistant.",
        voice="puck",
    )

    agent = multimodal.MultimodalAgent(model=model)
    agent.start(ctx.room)

    # Listen for user audio and respond
    @agent.on("user_speech_committed")
    def on_user_speech_committed(msg: multimodal.ChatMessage):
        print(f"User said: {msg.content}")

    # Start the session
    await agent.say("Hello, how can I help you today?", allow_interruptions=True)


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))