AI Agent with IoT - Quick Start
VideoSDK lets you add AI agents with real-time voice interaction to your IoT device within minutes.
In this quickstart, you'll explore how to create an AI agent that connects with an IoT device and interacts with users through voice using Google Gemini Live API.
Prerequisites
Before you begin, ensure you have the following:
- ESP-IDF v5.3: Installed and configured for your ESP32-S3 board.
- Python: Version 3.12 or higher.
- VideoSDK Account: If you don't have one, sign up at the VideoSDK Dashboard.
- Google API Key: For using the Gemini Live API.
You need a VideoSDK account to generate a token and a Google API key for the Gemini Live API. Generate the token from the VideoSDK dashboard and create the Google API key in Google AI Studio.
Project Structure
IoT-quickstart/
├── main/
│ ├── ai-demo.c
│ ├── CMakeLists.txt
│ ├── idf_component.yml
│ └── Kconfig.projbuild
├── agent-iot.py
├── partitions.csv
├── sdkconfig.defaults
└── README.md
You will be working with the following files:
- main/ai-demo.c: Main application logic for the ESP32 firmware.
- agent-iot.py: The Python AI agent that joins the meeting.
- Configuration files: main/idf_component.yml, main/CMakeLists.txt, main/Kconfig.projbuild, partitions.csv, and sdkconfig.defaults for project setup.
1. ESP32-S3 Firmware Setup
Step 1: Create a Meeting Room
First, create a meeting room using the VideoSDK API. This will provide a static roomId
that both the ESP32 device and the AI agent will use to connect.
curl -X POST https://api.videosdk.live/v2/rooms \
-H "Authorization: YOUR_JWT_TOKEN_HERE" \
-H "Content-Type: application/json"
Replace YOUR_JWT_TOKEN_HERE
with your VideoSDK auth token. Copy the roomId
from the response to use in the following steps.
Step 2: Configure the Project
Update the configuration files to set up your project dependencies, build settings, and hardware specifics.
- Dependencies
- CMake
- Hardware Config
- Partitions
- SDK Config
## IDF Component Manager Manifest File
dependencies:
iot-sdk:
path: /path/to/your/IoTSdk # Replace with the absolute path to your cloned IoTSdk
protocol_examples_common:
path: ${IDF_PATH}/examples/common_components/protocol_examples_common
idf:
version: =5.3.0
mdns: '*'
espressif/esp_audio_codec: ~2.3.0
espressif/esp_codec_dev: ~1.3.4
espressif/esp_audio_effects: ~1.1.0
sepfy/srtp: ^2.3.0
idf_component_register(SRCS
"ai-demo.c"
INCLUDE_DIRS "."
REQUIRES mbedtls REQUIRES json REQUIRES esp_netif REQUIRES fatfs REQUIRES vfs REQUIRES esp_common REQUIRES esp_timer REQUIRES esp_lcd REQUIRES nvs_flash REQUIRES bt
)
target_compile_options(${COMPONENT_LIB} PRIVATE "-Wno-format")
menu "SET Microcontroller"
choice AUDIO_BOARD
prompt "Audio hardware board"
default ESP32S3_XIAO
help
Select an audio board to use
config ESP32_S3_KORVO_2_V3_0_BOARD
bool "ESP32-S3-Korvo-2"
depends on IDF_TARGET_ESP32S3
config ESP32S3_XIAO
bool "ESP32-S3-XIAO"
endchoice
endmenu
# ESP-IDF Partition Table
# Name, Type, SubType, Offset, Size, Flags
nvs, data, nvs, 0x9000, 0x6000,
phy_init, data, phy, 0xf000, 0x1000,
factory, app, factory, 0x10000, 4M,
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.2.2 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHSIZE_4MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_ESP32S3_XIAO_SENSE=y
CONFIG_EXAMPLE_WIFI_SSID="myssid"
CONFIG_EXAMPLE_WIFI_PASSWORD="mypassword"
CONFIG_EXAMPLE_CONNECT_IPV6=n
CONFIG_ESP_PHY_REDUCE_TX_POWER=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
CONFIG_ESP_SYSTEM_EVENT_TASK_STACK_SIZE=2048
CONFIG_ESP_MAIN_TASK_STACK_SIZE=4096
CONFIG_ESP_TASK_WDT_CHECK_IDLE_TASK_CPU1=n
CONFIG_ESP_IPC_TASK_STACK_SIZE=2048
CONFIG_ESP_WIFI_DYNAMIC_RX_BUFFER_NUM=16
CONFIG_ESP_WIFI_STATIC_TX_BUFFER_NUM=32
CONFIG_ESP_WIFI_CACHE_TX_BUFFER_NUM=64
CONFIG_LWIP_IPV6_AUTOCONFIG=y
CONFIG_LWIP_IPV6_DHCP6=y
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744
CONFIG_MBEDTLS_EXTERNAL_MEM_ALLOC=y
CONFIG_MBEDTLS_SSL_PROTO_DTLS=y
CONFIG_PTHREAD_TASK_STACK_SIZE_DEFAULT=8192
Step 3: Implement the Firmware Logic
Update main/ai-demo.c
with your VideoSDK token and the roomId
you created.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/param.h>
#include <sys/time.h>
#include "esp_event.h"
#include "esp_log.h"
#include "esp_mac.h"
#include "esp_netif.h"
#include "esp_partition.h"
#include "esp_system.h"
#include "freertos/FreeRTOS.h"
#include "nvs_flash.h"
#include "protocol_examples_common.h"
#include "videosdk.h"
static const char *TAG = "Videosdk";
const char *token = "YOUR_VIDEOSDK_AUTH_TOKEN"; // Replace with your VideoSDK auth token
/*
 * FreeRTOS task body: calls create_meeting() with the file-scope `token`,
 * logs whether a room id came back, then deletes the calling task.
 *
 * pvParameters is not read by this function.
 */
static void meeting_task(void *pvParameters)
{
    create_meeting_result_t meeting = create_meeting(token);

    if (!meeting.room_id) {
        ESP_LOGE(TAG, "Failed to create meeting");
    } else {
        ESP_LOGI(TAG, "Created meeting roomId = %s", meeting.room_id);
        /* The room id is only needed for the log line above; release it here. */
        free(meeting.room_id);
    }

    ESP_LOGI(TAG, "meeting_task finished, deleting self");
    /* Passing NULL deletes the task that is currently running. */
    vTaskDelete(NULL);
}
/*
 * Firmware entry point.
 *
 * Sets log verbosity, initialises NVS, the network interface layer and the
 * default event loop, connects through example_connect(), spawns
 * meeting_task, then initialises the VideoSDK session and starts publishing
 * and subscribing audio. The results of init(), startPublishAudio() and
 * startSubscribeAudio() are printed. Never returns: it idles in a delay loop.
 */
void app_main(void)
{
    /* INFO everywhere, VERBOSE for the listed tags. */
    esp_log_level_set("*", ESP_LOG_INFO);
    esp_log_level_set("esp-tls", ESP_LOG_VERBOSE);
    esp_log_level_set("MQTT_CLIENT", ESP_LOG_VERBOSE);
    esp_log_level_set("MQTT_EXAMPLE", ESP_LOG_VERBOSE);
    esp_log_level_set("TRANSPORT_BASE", ESP_LOG_VERBOSE);
    esp_log_level_set("TRANSPORT", ESP_LOG_VERBOSE);
    esp_log_level_set("OUTBOX", ESP_LOG_VERBOSE);

    ESP_ERROR_CHECK(nvs_flash_init());
    ESP_ERROR_CHECK(esp_netif_init());
    ESP_ERROR_CHECK(esp_event_loop_create_default());
    ESP_ERROR_CHECK(example_connect());

    /* A failure to spawn the task is logged but does not stop start-up. */
    BaseType_t ok = xTaskCreate(meeting_task, "meeting_task", 16384, (void *)token, 5, NULL);
    if (ok != pdPASS)
    {
        ESP_LOGE(TAG, "Failed to create meeting_task");
    }

    init_config_t init_cfg = {
        .meetingID = "YOUR_MEETING_ID", // Replace with your meeting ID
        .token = token,
        .displayName = "ESP32-Device",
        .audioCodec = AUDIO_CODEC_OPUS,
    };
    result_t init_result = init(&init_cfg);
    printf("Result: %d\n", init_result);

    result_t result_publish = startPublishAudio("");
    result_t result_subscribe = startSubscribeAudio("", NULL);
    printf("Result:%d\n", result_publish);
    /* Previously stored and never reported; surface it like the other results. */
    printf("Subscribe result: %d\n", result_subscribe);

    /* Keep the main task alive; the delay yields to other tasks. */
    while (1)
    {
        vTaskDelay(pdMS_TO_TICKS(10));
    }
}
2. Python AI Agent
Step 1: Configure Environment and Credentials
Create a .env
file in the IoT-quickstart
directory to store your API keys securely.
# Google API Key for Gemini Live API
GOOGLE_API_KEY="your_google_api_key_here"
# VideoSDK Authentication Token
VIDEOSDK_AUTH_TOKEN="your_videosdk_auth_token_here"
Step 2: Create the Python AI Agent
The Python agent joins the same meeting room and uses the Gemini Live API to interact with the user. Update agent-iot.py
with the roomId
you created earlier.
from videosdk.agents import Agent, AgentSession, RealTimePipeline,JobContext, RoomOptions, WorkerJob
from videosdk.plugins.google import GeminiRealtime, GeminiLiveConfig
import logging

# Show INFO-level records. basicConfig() installs a stderr handler only when
# the root logger has none; with no handler at all, Python's last-resort
# handler emits WARNING and above only, so INFO records would be dropped.
logging.basicConfig(level=logging.INFO)
# basicConfig() does nothing when handlers already exist, so the level is
# still set explicitly.
logging.getLogger().setLevel(logging.INFO)
class MyVoiceAgent(Agent):
    """Agent configured with game-show-host instructions for a number-guessing game."""

    def __init__(self):
        host_prompt = "You are a high-energy game-show host guiding the caller to guess a secret number from 1 to 100 to win 1,000,000$."
        super().__init__(instructions=host_prompt)

    async def on_enter(self) -> None:
        """Speak the welcome line through the session."""
        greeting = "Welcome to the Videosdk's AI Agent game show! I'm your host, and we're about to play for 1,000,000$. Are you ready to play?"
        await self.session.say(greeting)

    async def on_exit(self) -> None:
        """Speak the farewell line through the session."""
        await self.session.say("Goodbye!")
async def start_session(context: JobContext):
    """Wire the agent to a Gemini realtime pipeline and run it until shutdown.

    Builds the agent, the Gemini realtime model, the pipeline and the session,
    registers a handler that prints transcription events, then awaits
    ``context.run_until_shutdown`` with ``wait_for_participant=True``.
    """
    voice_agent = MyVoiceAgent()

    live_config = GeminiLiveConfig(
        voice="Leda",  # Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr.
        response_modalities=["AUDIO"],
    )
    # When GOOGLE_API_KEY is set in .env - DON'T pass api_key parameter
    # api_key="AIXXXXXXXXXXXXXXXXXXXX",
    realtime_model = GeminiRealtime(
        model="gemini-2.0-flash-live-001",
        config=live_config,
    )

    pipeline = RealTimePipeline(model=realtime_model)
    session = AgentSession(agent=voice_agent, pipeline=pipeline)

    def print_transcript(event: dict):
        # Missing keys print as None because .get() is used.
        print(f"[TRANSCRIPT][{event.get('role')}]: {event.get('text')}")

    pipeline.on("realtime_model_transcription", print_transcript)

    await context.run_until_shutdown(session=session, wait_for_participant=True)
def make_context() -> JobContext:
    """Return a JobContext whose room options point at the static meeting room."""
    # Static meeting ID - same as used in IoT
    return JobContext(
        room_options=RoomOptions(
            room_id="YOUR_MEETING_ID",  # Replace it with your actual room_id
            name="Gemini Agent",
            playground=True,
        )
    )
if __name__ == "__main__":
    # Give the worker job the session entrypoint and the context factory, then start it.
    WorkerJob(entrypoint=start_session, jobctx=make_context).start()
3. Run the Application
Step 1: Run the ESP32 Firmware
Configure, build, and flash the firmware onto your ESP32 board.
-
Set the target board:
idf.py set-target esp32s3
-
Run menuconfig to set WiFi and other board settings:
idf.py menuconfig
Inside menuconfig, navigate to:
- Component config -> mbedtls -> Enable Support DTLS and Support TLS.
- Example Connection Configuration -> Set your WIFI SSID and WIFI Password.
- Partition table -> Enable Custom partition table CSV.
- Serial flasher config -> Adjust the flash size for your board. The provided partitions.csv places a 4M factory app at offset 0x10000, so the selected flash size must be larger than 4 MB (for example, 8 MB).
- SET Microcontroller -> Select your audio hardware board.
-
Build and flash the project:
idf.py build
idf.py flash monitor
Step 2: Run the Python AI Agent
Open a new terminal, navigate to the IoT-quickstart
directory, and run the Python agent.
# Install Python dependencies
pip install videosdk-agents "videosdk-plugins-google"
# Run the AI agent
python agent-iot.py
Once the ESP32 device joins the meeting, the AI agent will detect it and begin the interactive game show.
Next Steps
Clone repo for quick implementation
Got a question? Ask us on Discord.