Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions app/src/main/java/com/kernel/ai/navigation/KernelNavHost.kt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import com.kernel.ai.feature.settings.ScheduledAlarmsScreen
import com.kernel.ai.feature.settings.SettingsScreen
import com.kernel.ai.feature.settings.SidePanelScreen
import com.kernel.ai.feature.settings.UserProfileScreen
import com.kernel.ai.feature.settings.VoiceScreen
import kotlinx.coroutines.launch

private const val ROUTE_LIST = "conversation_list"
Expand All @@ -61,6 +62,7 @@ private const val ROUTE_CHAT = "chat"
private const val ROUTE_SETTINGS = "settings"
private const val ROUTE_USER_PROFILE = "settings/user_profile"
private const val ROUTE_MEMORY = "settings/memory"
private const val ROUTE_VOICE = "settings/voice"
private const val ROUTE_MODEL_SETTINGS = "settings/model_settings"
private const val ROUTE_MODEL_MANAGEMENT = "settings/model_management?scrollTo={scrollTo}"
private const val ARG_SCROLL_TO = "scrollTo"
Expand Down Expand Up @@ -357,6 +359,9 @@ fun KernelNavHost(
onNavigateToMemory = {
navController.navigate(ROUTE_MEMORY)
},
onNavigateToVoice = {
navController.navigate(ROUTE_VOICE)
},
onNavigateToModelSettings = {
navController.navigate(ROUTE_MODEL_SETTINGS)
},
Expand All @@ -382,6 +387,12 @@ fun KernelNavHost(
)
}

composable(ROUTE_VOICE) {
VoiceScreen(
onBack = { navController.popBackStack() },
)
}

composable(ROUTE_MODEL_SETTINGS) {
ModelSettingsScreen(
onBack = { navController.popBackStack() },
Expand Down
1 change: 1 addition & 0 deletions core/voice/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ android {
dependencies {
implementation(libs.core.ktx)
implementation(libs.coroutines.android)
implementation(libs.datastore.preferences)
implementation(libs.vosk.android)

implementation(libs.hilt.android)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package com.kernel.ai.core.voice

import android.media.AudioAttributes
import android.media.AudioFocusRequest
import android.media.AudioManager
import android.content.Context
import android.os.Bundle
import android.speech.tts.TextToSpeech
Expand Down Expand Up @@ -29,6 +32,9 @@ class AndroidTextToSpeechController @Inject constructor(
private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Main.immediate)
private val _events = MutableSharedFlow<VoiceOutputEvent>(extraBufferCapacity = 8)
override val events: Flow<VoiceOutputEvent> = _events.asSharedFlow()
private val audioManager by lazy {
context.getSystemService(Context.AUDIO_SERVICE) as AudioManager
}

@Volatile
private var textToSpeech: TextToSpeech? = null
Expand All @@ -42,6 +48,17 @@ class AndroidTextToSpeechController @Inject constructor(
@Volatile
private var activeUtteranceText: String? = null

@Volatile
private var audioFocusRequest: AudioFocusRequest? = null

/**
 * Pre-initializes the TextToSpeech engine so the first real speak() call does not
 * pay the engine start-up latency.
 *
 * Returns [VoiceOutputResult.Spoken] when the engine initialized successfully
 * (no audio is actually produced by warm-up), or [VoiceOutputResult.Unavailable]
 * when a TTS engine cannot be created on this device.
 */
override suspend fun warmUp(): VoiceOutputResult {
return if (ensureReady() != null) {
VoiceOutputResult.Spoken
} else {
VoiceOutputResult.Unavailable("Text-to-speech is unavailable on this device.")
}
}

override suspend fun speak(request: VoiceSpeakRequest): VoiceOutputResult {
val engine = ensureReady() ?: return VoiceOutputResult.Unavailable(
"Text-to-speech is unavailable on this device."
Expand All @@ -56,6 +73,7 @@ class AndroidTextToSpeechController @Inject constructor(
}

engine.language = locale
requestAudioFocus()
val utteranceId = request.utteranceId ?: "kernel-voice-${System.nanoTime()}"
activeUtteranceId = utteranceId
activeUtteranceText = request.text
Expand All @@ -68,6 +86,7 @@ class AndroidTextToSpeechController @Inject constructor(
return if (result == TextToSpeech.ERROR) {
activeUtteranceId = null
activeUtteranceText = null
releaseAudioFocus()
VoiceOutputResult.Unavailable("Text-to-speech failed to start.")
} else {
VoiceOutputResult.Spoken
Expand All @@ -80,6 +99,7 @@ class AndroidTextToSpeechController @Inject constructor(
textToSpeech?.stop()
activeUtteranceId = null
activeUtteranceText = null
releaseAudioFocus()
if (hadActiveUtterance) {
_events.emit(VoiceOutputEvent.SpeakingStopped)
}
Expand All @@ -100,6 +120,12 @@ class AndroidTextToSpeechController @Inject constructor(
val engine = TextToSpeech(context) { status ->
if (!deferred.isCompleted) deferred.complete(status)
}
engine.setAudioAttributes(
AudioAttributes.Builder()
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.build()
)
engine.setOnUtteranceProgressListener(
object : UtteranceProgressListener() {
override fun onStart(utteranceId: String?) {
Expand Down Expand Up @@ -149,6 +175,31 @@ class AndroidTextToSpeechController @Inject constructor(
if (utteranceId != activeUtteranceId) return
activeUtteranceId = null
activeUtteranceText = null
releaseAudioFocus()
_events.tryEmit(VoiceOutputEvent.SpeakingStopped)
}

/**
 * Requests transient, may-duck audio focus for an assistant speech utterance.
 *
 * Any focus request still held from a previous utterance is abandoned first:
 * without that, back-to-back speak() calls overwrite [audioFocusRequest] and the
 * earlier granted request is never abandoned, leaking a stacked focus claim in
 * AudioManager. A denied request is logged but not treated as fatal — speech
 * proceeds regardless.
 */
private fun requestAudioFocus() {
    // Release whatever we currently hold before asking again, so exactly one
    // request is outstanding at a time.
    releaseAudioFocus()
    val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK)
        .setAudioAttributes(
            AudioAttributes.Builder()
                .setUsage(AudioAttributes.USAGE_ASSISTANT)
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
        )
        .setAcceptsDelayedFocusGain(false)
        .setWillPauseWhenDucked(false)
        // No-op listener: TTS playback is short-lived, so focus-change events
        // are intentionally ignored rather than pausing/resuming speech.
        .setOnAudioFocusChangeListener { }
        .build()
    audioFocusRequest = request
    val result = audioManager.requestAudioFocus(request)
    if (result != AudioManager.AUDIOFOCUS_REQUEST_GRANTED) {
        Log.w(TAG, "TextToSpeech audio focus request not granted: $result")
    }
}

/**
 * Abandons the currently held audio focus request, if any, and clears the
 * stored reference. Safe to call when no focus is held.
 */
private fun releaseAudioFocus() {
    val held = audioFocusRequest ?: return
    audioManager.abandonAudioFocusRequest(held)
    audioFocusRequest = null
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ sealed interface VoiceOutputEvent {
interface VoiceOutputController {
val events: Flow<VoiceOutputEvent> get() = emptyFlow()

suspend fun warmUp(): VoiceOutputResult = VoiceOutputResult.Spoken

suspend fun speak(request: VoiceSpeakRequest): VoiceOutputResult

fun stop()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package com.kernel.ai.core.voice

import android.content.Context
import android.util.Log
import androidx.datastore.preferences.core.booleanPreferencesKey
import androidx.datastore.preferences.core.edit
import androidx.datastore.preferences.core.emptyPreferences
import androidx.datastore.preferences.preferencesDataStore
import dagger.hilt.android.qualifiers.ApplicationContext
import java.io.IOException
import javax.inject.Inject
import javax.inject.Singleton
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.flow.map

private const val TAG = "VoiceOutputPrefs"
private val Context.voiceOutputPrefsDataStore by preferencesDataStore(name = "voice_output_preferences")

/**
 * Process-wide store for the user's spoken-response (TTS) preference, backed by
 * Jetpack DataStore ("voice_output_preferences").
 *
 * Shared by the Quick Actions / QIR voice path and the Settings -> Voice screen
 * so both observe and mutate the same toggle.
 */
@Singleton
class VoiceOutputPreferences @Inject constructor(
@ApplicationContext private val context: Context,
) {
// DataStore key backing the "speak Quick Action responses aloud" toggle.
private val spokenResponsesEnabledKey =
booleanPreferencesKey("quick_actions_spoken_responses_enabled")

// Emits the current toggle value; defaults to true when no value is stored.
// IO failures while reading are logged and treated as "no stored preferences"
// (defaults apply); any other error is rethrown to the collector.
val spokenResponsesEnabled: Flow<Boolean> = context.voiceOutputPrefsDataStore.data
.catch { e ->
if (e is IOException) {
Log.e(TAG, "Failed reading voice output preferences; using defaults", e)
emit(emptyPreferences())
} else {
throw e
}
}
.map { prefs -> prefs[spokenResponsesEnabledKey] ?: true }

// Persists the toggle; suspends until the DataStore write completes.
suspend fun setSpokenResponsesEnabled(enabled: Boolean) {
context.voiceOutputPrefsDataStore.edit { prefs ->
prefs[spokenResponsesEnabledKey] = enabled
}
}
}
4 changes: 2 additions & 2 deletions docs/ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,8 @@ Lower-priority skill additions — third-party integrations and local utilities.

| Sub-Issue | Title | Status | Priority |
|-----------|-------|--------|----------|
| [#671](https://github.com/NickMonrad/kernel-ai-assistant/issues/671) | Offline push-to-talk voice input foundation | ⬜ Pending | 🟡 Medium |
| [#672](https://github.com/NickMonrad/kernel-ai-assistant/issues/672) | Generic spoken response / TTS foundation | ⬜ Pending | 🟡 Medium |
| [#671](https://github.com/NickMonrad/kernel-ai-assistant/issues/671) | Offline push-to-talk voice input foundation | ✅ Done — PR #711 | 🟡 Medium |
| [#672](https://github.com/NickMonrad/kernel-ai-assistant/issues/672) | Generic spoken response / TTS foundation | ✅ Done — PR #711 | 🟡 Medium |
| [#678](https://github.com/NickMonrad/kernel-ai-assistant/issues/678) | Optional native Android STT engine alongside Vosk | ⬜ Pending | 🟡 Medium |
| [#700](https://github.com/NickMonrad/kernel-ai-assistant/issues/700) | Parakeet CTC STT evaluation | ⬜ Pending | 🟡 Medium |
| [#703](https://github.com/NickMonrad/kernel-ai-assistant/issues/703) | Whisper.cpp vs Vosk + staged vision follow-up | ⬜ Pending | 🟡 Medium |
Expand Down
8 changes: 4 additions & 4 deletions docs/SPECIFICATION.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Technical Specification: Jandal AI — Local-First Android AI Assistant

> **Last updated:** 2026-04-22 (docs refresh for #523 / PR #682; prompt/tool-routing docs updated)
> **Last updated:** 2026-05-02 (voice foundations and roadmap summary refresh)
>
> This is the authoritative technical specification for Jandal AI. For feature status and
> delivery timeline, see [`ROADMAP.md`](./ROADMAP.md).
Expand Down Expand Up @@ -575,7 +575,7 @@ Community-extensible skills run sandboxed via **Chicory** (pure JVM Wasm runtime
- **Navigation:** Bottom nav bar — Chats tab (conversations list) + Actions tab (quick commands)
- **Chat:** Streaming token display, thinking mode indicator, markdown rendering, multi-conversation
- **Actions tab:** History list, FAB (⚡) for new commands, bottom sheet input, Room-persisted history
- **Voice:** Tap-to-toggle with auto-stop on silence (future: "Hey Jandal" wake word)
- **Voice:** Quick Actions push-to-talk with offline STT and spoken QIR responses; chat voice and wake word remain future work
- **Skill results:** Inline rich cards in the conversation stream, with expandable list previews and link surfacing for fallback/plain-text results
- **Persona:** Friendly, concise, dry-humoured Kiwi — see §7 for full identity details

Expand Down Expand Up @@ -805,9 +805,9 @@ for the larger planned coverage matrix.
|-------|-------------|--------|
| 1 | Core LiteRT-LM chat + GPU/NPU + GPU alignment fixes + OOM protection | ✅ Complete |
| 2 | sqlite-vec RAG + EmbeddingGemma + episodic distillation + memory UI | ✅ Complete |
| 3 | Resident Agent Architecture: QIR + native SDK tool calling, rich tool results, weather/list/date/media skills, and broader multi-turn support | 🔄 In Progress |
| 3 | Resident Agent Architecture: QIR + native SDK tool calling, rich tool results, voice foundations, weather/list/date/media skills, and broader multi-turn support | 🔄 In Progress |
| 4 | Dreaming Engine (overnight distillation) + Semantic Cache + Self-Healing Identity | ⬜ Planned |
| 5 | Chicory Wasm Runtime + GitHub Skill Store | ⬜ Planned |
| 6 | 8GB device optimisation (dynamic weight loading, E2B auto-select) + wake word | ⬜ Planned |
| 6 | 8GB device optimisation (dynamic weight loading, E2B auto-select) | ⬜ Planned |

See [`ROADMAP.md`](./ROADMAP.md) for full task-level detail.
14 changes: 14 additions & 0 deletions docs/phase-3f-voice-foundations-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ Baseline target:
- transcript flows through the same routing path as typed input
- empty/silence/error states are visible and recoverable

Current branch status:

- Quick Actions already ships with a push-to-talk mic entry point, microphone permission flow, and offline Vosk-backed transcription
- transcripts route back through the existing `ActionsViewModel` Quick Actions path
- the current manual device pass was sufficient to treat `#671` as complete on merge alongside `#672`

#### 4. Implement spoken responses

Primary issue: `#672`
Expand All @@ -81,6 +87,14 @@ Baseline target:
- slot-fill prompts can be spoken
- interruption / stop behavior is predictable
- typed Quick Actions remain silent by default
- spoken-response controls live under **Settings -> Voice** so future STT/TTS options can expand in one place

Current branch status:

- `AndroidTextToSpeechController` has warm-up support and explicit assistant-style audio attributes / transient audio focus handling
- Quick Actions / QIR spoken responses are controlled by a shared `VoiceOutputPreferences` preference
- the user-facing spoken-response toggle has been moved out of About into a dedicated **Settings -> Voice** screen
- the new Voice screen is the intended future home for additional STT/TTS settings, engine choices, and voice model controls

#### 5. Connect the voice session loop

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import com.kernel.ai.core.voice.VoiceInputEvent
import com.kernel.ai.core.voice.VoiceInputStartResult
import com.kernel.ai.core.voice.VoiceOutputController
import com.kernel.ai.core.voice.VoiceOutputEvent
import com.kernel.ai.core.voice.VoiceOutputPreferences
import com.kernel.ai.core.voice.VoiceOutputResult
import com.kernel.ai.core.voice.VoiceSpeakRequest
import dagger.hilt.android.lifecycle.HiltViewModel
Expand Down Expand Up @@ -57,6 +58,7 @@ class ActionsViewModel @Inject constructor(
private val quickActionDao: QuickActionDao,
private val voiceInputController: VoiceInputController,
private val voiceOutputController: VoiceOutputController,
private val voiceOutputPreferences: VoiceOutputPreferences,
) : ViewModel() {

// ── Action history ──────────────────────────────────────────────────────
Expand Down Expand Up @@ -129,8 +131,19 @@ class ActionsViewModel @Inject constructor(
val voicePlaybackState: StateFlow<VoicePlaybackState> = _voicePlaybackState.asStateFlow()
private var shouldAutoStartVoiceSlotReply = false
private var pendingPhonePermissionAction: PendingPhonePermissionAction? = null
private var spokenResponsesEnabled = true

init {
viewModelScope.launch {
voiceOutputPreferences.spokenResponsesEnabled.collect { enabled ->
spokenResponsesEnabled = enabled
if (enabled) {
voiceOutputController.warmUp()
} else {
voiceOutputController.stop()
}
}
}
viewModelScope.launch {
voiceInputController.events.collect { event ->
when (event) {
Expand Down Expand Up @@ -558,6 +571,7 @@ class ActionsViewModel @Inject constructor(

private fun speakForVoice(inputMode: InputMode, text: String) {
if (inputMode != InputMode.Voice) return
if (!spokenResponsesEnabled) return
val summary = toSpokenSummary(text)
if (summary.isBlank()) return
viewModelScope.launch {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import com.kernel.ai.core.voice.VoiceInputEvent
import com.kernel.ai.core.voice.VoiceInputStartResult
import com.kernel.ai.core.voice.VoiceOutputController
import com.kernel.ai.core.voice.VoiceOutputEvent
import com.kernel.ai.core.voice.VoiceOutputPreferences
import com.kernel.ai.core.voice.VoiceOutputResult
import com.kernel.ai.core.voice.VoiceSpeakRequest
import io.mockk.Runs
Expand All @@ -28,6 +29,7 @@ import io.mockk.unmockkStatic
import io.mockk.verify
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.ExperimentalCoroutinesApi
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.MutableSharedFlow
import kotlinx.coroutines.flow.emptyFlow
import kotlinx.coroutines.flow.flowOf
Expand Down Expand Up @@ -55,8 +57,10 @@ class ActionsViewModelVoiceTest {
private val quickActionDao: QuickActionDao = mockk()
private val voiceInputController: VoiceInputController = mockk()
private val voiceOutputController: VoiceOutputController = mockk()
private val voiceOutputPreferences: VoiceOutputPreferences = mockk()
private val voiceInputEvents = MutableSharedFlow<VoiceInputEvent>()
private val voiceOutputEvents = MutableSharedFlow<VoiceOutputEvent>()
private val spokenResponsesEnabled = MutableStateFlow(true)

private lateinit var viewModel: ActionsViewModel

Expand All @@ -71,15 +75,18 @@ class ActionsViewModelVoiceTest {
coEvery { quickActionDao.insert(any()) } just Runs
every { voiceInputController.events } returns voiceInputEvents
every { voiceInputController.stopListening() } just Runs
coEvery { voiceOutputController.warmUp() } returns VoiceOutputResult.Spoken
coEvery { voiceOutputController.speak(any()) } returns VoiceOutputResult.Spoken
every { voiceOutputController.events } returns voiceOutputEvents
every { voiceOutputController.stop() } just Runs
every { voiceOutputPreferences.spokenResponsesEnabled } returns spokenResponsesEnabled
viewModel = ActionsViewModel(
quickIntentRouter = quickIntentRouter,
skillRegistry = skillRegistry,
quickActionDao = quickActionDao,
voiceInputController = voiceInputController,
voiceOutputController = voiceOutputController,
voiceOutputPreferences = voiceOutputPreferences,
)
}

Expand Down Expand Up @@ -259,6 +266,29 @@ class ActionsViewModelVoiceTest {
}
}

@Test
fun `voice mode does not speak when spoken responses disabled`() = runTest(dispatcher) {
// Arrange: disable the spoken-responses preference, then wire the router and
// registry so "turn on flashlight" resolves to a successful direct skill.
val directSkill = mockk<Skill>()
spokenResponsesEnabled.value = false
every { quickIntentRouter.route("turn on flashlight") } returns
QuickIntentRouter.RouteResult.RegexMatch(
QuickIntentRouter.MatchedIntent(
intentName = "toggle_flashlight_on",
params = emptyMap(),
),
)
every { skillRegistry.get("toggle_flashlight_on") } returns directSkill
every { directSkill.name } returns "toggle_flashlight_on"
every { directSkill.description } returns "Toggle flashlight"
every { directSkill.schema } returns SkillSchema()
coEvery { directSkill.execute(any()) } returns SkillResult.Success("Flashlight on")

// Act: run the action in Voice mode, which would normally trigger TTS.
viewModel.executeAction("turn on flashlight", InputMode.Voice)
advanceUntilIdle()

// Assert: the preference suppresses speech even for voice-initiated input.
coVerify(exactly = 0) { voiceOutputController.speak(any()) }
}

@Test
fun `make call permission flow emits event and retries after grant`() = runTest(dispatcher) {
val runIntentSkill = mockk<Skill>()
Expand Down
Loading
Loading