AI Agent with iOS - Quick Start
VideoSDK empowers you to integrate an AI voice agent into your iOS app within minutes. The agent joins the same meeting room and interacts over voice using the Google Gemini Live API.
Prerequisites
- iOS 13.0+
- Xcode 13.0+
- Swift 5.0+
- VideoSDK Developer Account (get token from the dashboard)
- Python 3.12+
- Google API Key with Gemini Live API access
You need a VideoSDK account to generate a token and a Google API key for the Gemini Live API. Visit the VideoSDK dashboard to generate a token and the Google AI Studio for Google API key.
Project Structure
.
├── videosdk-agents-quickstart-ios/
│ ├── JoinScreenView.swift
│ ├── MeetingView.swift
│ ├── MeetingViewController.swift
│ ├── RoomsStruct.swift
│ └── videosdk_agents_quickstart_iosApp.swift
├── videosdk-agents-quickstart-ios.xcodeproj/
├── agent-ios.py
└── .env
You will work on:
- JoinScreenView.swift: Join screen UI
- MeetingView.swift: Meeting interface with audio controls
- MeetingViewController.swift: Handles meeting logic and events
- agent-ios.py: Python AI agent (Gemini Live)
- .env: For storing API keys
1. iOS Frontend
Step 1: Create App and Install VideoSDK
Create a new iOS app in Xcode:
- Create a new Xcode project
- Choose App template
- Add Product Name and save the project
Install VideoSDK using Swift Package Manager:
- In Xcode, go to
File > Add Packages...
- Enter the repository URL:
https://github.com/videosdk-live/videosdk-rtc-ios-sdk.git
- Choose the latest version and click
Add Package
Add permissions to Info.plist
:
<key>NSCameraUsageDescription</key>
<string>Camera permission description</string>
<key>NSMicrophoneUsageDescription</key>
<string>Microphone permission description</string>
Step 2: Create Models and Views
Create Swift models and views for the meeting interface:
/// Decoded response from the VideoSDK "create room" REST API.
/// All fields are optional because the API may omit them.
struct RoomsStruct: Codable {
    let createdAt: String?
    let updatedAt: String?
    let roomID: String?
    let links: Links?
    let id: String?

    /// Explicit key mapping; only `roomId` differs from the property name.
    enum CodingKeys: String, CodingKey {
        case createdAt = "createdAt"
        case updatedAt = "updatedAt"
        case roomID = "roomId"
        case links = "links"
        case id = "id"
    }
}
/// Hypermedia links returned with a room: URLs to inspect the room and its session.
struct Links: Codable {
    let getRoom: String?
    let getSession: String?

    /// Maps the API's snake_case keys onto camelCase properties.
    enum CodingKeys: String, CodingKey {
        case getRoom = "get_room"
        case getSession = "get_session"
    }
}
import SwiftUI
/// Join screen: collects the user's display name and navigates into the meeting.
struct JoinScreenView: View {
    /// Static meeting ID shared with the Python agent's RoomOptions.
    /// Replace with a real room ID before running.
    let meetingId: String = "YOUR_MEETING_ID"
    /// User-entered display name, bound to the text field below.
    @State var name: String

    var body: some View {
        NavigationView {
            VStack {
                Text("VideoSDK")
                    .font(.largeTitle)
                    .fontWeight(.bold)
                Text("AI Agent Quickstart")
                    .font(.largeTitle)
                    .fontWeight(.semibold)
                    .padding(.bottom)
                TextField("Enter Your Name", text: $name)
                    .foregroundColor(Color.black)
                    .autocorrectionDisabled()
                    .font(.headline)
                    .overlay(
                        // Clear button: shown only while the field has text.
                        Image(systemName: "xmark.circle.fill")
                            .padding()
                            .offset(x: 10)
                            .foregroundColor(Color.gray)
                            .opacity(name.isEmpty ? 0.0 : 1.0)
                            .onTapGesture {
                                UIApplication.shared.endEditing()
                                name = ""
                            }
                        , alignment: .trailing)
                    .padding()
                    .background(
                        RoundedRectangle(cornerRadius: 25)
                            .fill(Color.secondary.opacity(0.5))
                            .shadow(color: Color.gray.opacity(0.10), radius: 10))
                    .padding()
                // Fix: `name` is a non-optional String, so the original
                // `name ?? "Guest"` could never produce "Guest". Fall back
                // explicitly when the field is left empty instead.
                NavigationLink(destination: MeetingView(meetingId: self.meetingId, userName: name.isEmpty ? "Guest" : name)
                    .navigationBarBackButtonHidden(true)) {
                    Text("Join Meeting")
                        .foregroundColor(Color.white)
                        .padding()
                        .background(
                            RoundedRectangle(cornerRadius: 25.0)
                                .fill(Color.blue))
                }
            }
        }
    }
}
// Keyboard helper used by the join screen's clear button.
extension UIApplication {
    /// Dismisses any active keyboard by asking the current first responder
    /// (whatever view it is) to resign, without needing a reference to it.
    func endEditing() {
        sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
    }
}
import SwiftUI
import VideoSDKRTC
/// Meeting screen: shows the meeting ID, the participant list (the local user
/// and the AI agent once it joins), and mic / end-call controls.
struct MeetingView: View {
    @Environment(\.presentationMode) var presentationMode
    @ObservedObject var meetingViewController = MeetingViewController()
    /// Meeting ID handed over from the join screen.
    @State var meetingId: String?
    /// Display name handed over from the join screen.
    @State var userName: String?
    /// Local mic state; true means the mic is currently live.
    @State var isUnMute: Bool = true

    var body: some View {
        VStack {
            // No participants yet means the join has not completed.
            if meetingViewController.participants.isEmpty {
                Text("Meeting Initializing")
            } else {
                VStack {
                    VStack(spacing: 20) {
                        Text("Meeting ID: \(meetingViewController.meetingID)")
                            .padding(.vertical)
                        // One row per participant, including the Python agent.
                        List {
                            ForEach(meetingViewController.participants.indices, id: \.self) { index in
                                Text("Participant Name: \(meetingViewController.participants[index].displayName)")
                            }
                        }
                    }
                    VStack {
                        HStack(spacing: 15) {
                            // Mic toggle: mutes when live, unmutes when muted.
                            Button {
                                if isUnMute {
                                    isUnMute = false
                                    meetingViewController.meeting?.muteMic()
                                } else {
                                    isUnMute = true
                                    meetingViewController.meeting?.unmuteMic()
                                }
                            } label: {
                                Text("Toggle Mic")
                                    .foregroundStyle(Color.white)
                                    .font(.caption)
                                    .padding()
                                    .background(
                                        RoundedRectangle(cornerRadius: 25)
                                            .fill(Color.blue))
                            }
                            // Ends the meeting for everyone and pops back to the join screen.
                            Button {
                                meetingViewController.meeting?.end()
                                presentationMode.wrappedValue.dismiss()
                            } label: {
                                Text("End Call")
                                    .foregroundStyle(Color.white)
                                    .font(.caption)
                                    .padding()
                                    .background(
                                        RoundedRectangle(cornerRadius: 25)
                                            .fill(Color.red))
                            }
                        }
                        .padding(.bottom)
                    }
                }
            }
        }
        .onAppear {
            // MARK: - Configure the SDK with the auth token, then join.
            VideoSDK.config(token: meetingViewController.token)
            // Fix: guard the optional meeting ID instead of force-unwrapping,
            // and drop the garbled debug prints the original left in.
            if let meetingId = meetingId, !meetingId.isEmpty {
                meetingViewController.joinMeeting(meetingId: meetingId, userName: userName ?? "Guest")
            }
        }
    }
}
Step 3: Implement Meeting Logic
Create the main meeting view controller to handle events:
import Foundation
import VideoSDKRTC
/// Owns the active VideoSDK meeting and publishes its state for SwiftUI views.
class MeetingViewController: ObservableObject {
    /// VideoSDK auth token; generate one from the VideoSDK dashboard.
    var token = "YOUR_VIDEOSDK_AUTH_TOKEN" // Add Your token here
    var meetingId: String = ""
    var name: String = ""

    /// The live meeting object, nil until `initializeMeeting` runs.
    @Published var meeting: Meeting? = nil
    /// Everyone currently in the room (local user plus the AI agent).
    @Published var participants: [Participant] = []
    /// Meeting ID displayed in the UI.
    @Published var meetingID: String = ""

    /// Creates the meeting (mic on, camera off), registers this controller
    /// for meeting events, and joins the room.
    func initializeMeeting(meetingId: String, userName: String) {
        let room = VideoSDK.initMeeting(
            meetingId: meetingId,
            participantName: userName,
            micEnabled: true,
            webcamEnabled: false
        )
        meeting = room
        room.addEventListener(self)
        room.join()
    }
}
// MARK: - MeetingEventListener
extension MeetingViewController: MeetingEventListener {
    /// Local join succeeded: track the local participant and listen for its events.
    func onMeetingJoined() {
        guard let localParticipant = self.meeting?.localParticipant else { return }
        participants.append(localParticipant)
        localParticipant.addEventListener(self)
    }

    /// A remote participant (e.g. the Python AI agent) joined the room.
    func onParticipantJoined(_ participant: Participant) {
        participants.append(participant)
        participant.addEventListener(self)
    }

    /// Remove a departed participant from the published list.
    func onParticipantLeft(_ participant: Participant) {
        participants.removeAll { $0.id == participant.id }
    }

    /// Local user left: detach all listeners this controller registered.
    func onMeetingLeft() {
        meeting?.localParticipant.removeEventListener(self)
        meeting?.removeEventListener(self)
    }

    /// Clear the roster on disconnect; other state changes need no action
    /// (the original printed an empty line here, which was just console noise).
    func onMeetingStateChanged(meetingState: MeetingState) {
        switch meetingState {
        case .DISCONNECTED:
            participants.removeAll()
        default:
            break
        }
    }
}
// MARK: - ParticipantEventListener
// Intentionally empty: this audio-only quickstart renders no video streams,
// but the conformance is required because participants register this
// controller as their event listener in the meeting callbacks above.
extension MeetingViewController: ParticipantEventListener {
}
// MARK: - Join flow
extension MeetingViewController {
    /// Validates that an auth token is present, records the meeting ID for
    /// display, and kicks off meeting initialization.
    func joinMeeting(meetingId: String, userName: String) {
        guard !token.isEmpty else {
            print("Auth token required")
            return
        }
        meetingID = meetingId
        initializeMeeting(meetingId: meetingId, userName: userName)
    }
}
Step 4: App Entry Point
Configure the main app entry point:
import SwiftUI
@main
struct videosdk_agents_quickstart_iosApp: App {
    /// App entry point: launches directly into the join screen with an
    /// empty name for the user to fill in.
    var body: some Scene {
        WindowGroup {
            JoinScreenView(name: "")
        }
    }
}
2. Python AI Agent
Step 1: Configure Environment
Create a .env
file in the mobile-quickstarts/ios
directory to store your API keys securely.
# Google API Key for Gemini Live API
GOOGLE_API_KEY=your_google_api_key_here
# VideoSDK Authentication Token
VIDEOSDK_AUTH_TOKEN=your_videosdk_auth_token_here
Step 2: Create the Python AI Agent
Create the Python AI agent that will join the same meeting room and interact with users through voice.
from videosdk.agents import Agent, AgentSession, RealTimePipeline, JobContext, RoomOptions, WorkerJob
from videosdk.plugins.google import GeminiRealtime, GeminiLiveConfig
import logging
logging.getLogger().setLevel(logging.INFO)
class MyVoiceAgent(Agent):
    """Voice agent persona: a game-show host running a 1-to-100 number-guessing game."""

    def __init__(self):
        # The instructions string is the system prompt that shapes every reply.
        super().__init__(
            instructions="You are a high-energy game-show host guiding the caller to guess a secret number from 1 to 100 to win 1,000,000$.",
        )

    async def on_enter(self) -> None:
        # Spoken greeting, played as soon as the agent joins the room.
        await self.session.say("Welcome to the Videosdk's AI Agent game show! I'm your host, and we're about to play for 1,000,000$. Are you ready to play?")

    async def on_exit(self) -> None:
        # Farewell line spoken when the session is shutting down.
        await self.session.say("Goodbye!")
async def start_session(context: JobContext):
    """Wire the agent, Gemini Live model, and pipeline together, then run until shutdown.

    Args:
        context: Job context carrying the room options from make_context().
    """
    agent = MyVoiceAgent()
    model = GeminiRealtime(
        model="gemini-2.0-flash-live-001",
        # When GOOGLE_API_KEY is set in .env - DON'T pass api_key parameter
        # api_key="AIXXXXXXXXXXXXXXXXXXXX",
        config=GeminiLiveConfig(
            voice="Leda",  # Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr.
            response_modalities=["AUDIO"]
        )
    )
    pipeline = RealTimePipeline(model=model)
    session = AgentSession(
        agent=agent,
        pipeline=pipeline
    )

    def on_transcription(data: dict):
        # Console log of live transcripts; data presumably carries
        # "role" and "text" keys — confirm against the plugin's event payload.
        role = data.get("role")
        text = data.get("text")
        print(f"[TRANSCRIPT][{role}]: {text}")

    pipeline.on("realtime_model_transcription", on_transcription)

    # Blocks here; wait_for_participant delays the agent until the iOS user joins.
    await context.run_until_shutdown(session=session, wait_for_participant=True)
def make_context() -> JobContext:
    """Build the job context pointing the agent at the same room as the iOS app.

    The room_id here must match the meetingId hard-coded in JoinScreenView.swift.
    """
    room_options = RoomOptions(
        # Static meeting ID - same as used in frontend
        room_id="YOUR_MEETING_ID",  # Replace it with your actual room_id
        name="Gemini Agent",
        playground=True,
    )
    return JobContext(room_options=room_options)
if __name__ == "__main__":
    # Entrypoint: start the worker, which calls start_session with a fresh
    # context produced by make_context.
    job = WorkerJob(entrypoint=start_session, jobctx=make_context)
    job.start()
3. Run the Application
Step 1: Run the iOS App
Build and run the app from Xcode on a simulator or physical device.
Step 2: Run the AI Agent
Open a new terminal and run the Python agent:
pip install videosdk-agents
pip install "videosdk-plugins-google"
python agent-ios.py
Step 3: Connect and Interact
- Run the iOS app on a simulator or device
- Join the meeting and allow microphone permissions
- When you join, the Python agent detects your participation and starts speaking
- Talk to the agent in real time and play the number guessing game
Troubleshooting
- Ensure the same room_id is set in both the iOS app and the agent's RoomOptions
- Verify microphone permissions in iOS Settings > Privacy & Security > Microphone
- Confirm your VideoSDK token is valid and Google API key is set
- For simulator issues, ensure you're using a physical device for microphone testing
Next Steps
Clone repo for quick implementation
Got a question? Ask us on Discord.