ChatScreen 마운트 시 백그라운드 native init 으로 첫 send 시점에 native load 지연을 안 보이게 한다. 12개 AC + UX-Reviewer 의 6개 권고 모두 코드 반영.

핵심 변경:
- `chat_warmup_provider.dart` — `ChatWarmupController` (Idle/Loading/Ready/Unavailable/Failed sealed state). fast path (`llm.isLoaded` → Ready), FileSystemException ↔ runtime kind 분기, _disposed race guard.
- `model_lifecycle.dart` — `quickCheck()`: 2.4GB SHA-256 hashing 없이 meta_kv + 파일 존재만 보고 ready 추정 (R4 UX 권고).
- `gemma_llm_service.dart` + `llm_service.dart` — `_loadingFuture` 동시 호출 가드. 두 caller 가 동시에 load() 해도 native init 은 1 회만.
- `chat_screen.dart` — initState postFrameCallback 에서 warmup.start(). warmup 상태에 따라 hintText / spinner / 실패 banner 분기.

AC coverage (12개):
- AC1~AC8: ChatWarmupController unit (chat_warmup_test.dart 8 tests).
- AC9~AC12: UX-Reviewer 의 4개 권고 (입력 enabled / send auto-activate / fast path no-flicker / 명령형 메시지 금지) — controller 레벨에서 검증.

테스트: 167 passed (1 pre-existing skip). `flutter analyze` clean.

Refs #311

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
271 lines
8.3 KiB
Dart
271 lines
8.3 KiB
Dart
import 'dart:io';
|
|
|
|
import 'package:flutter/foundation.dart';
|
|
import 'package:flutter_gemma/flutter_gemma.dart';
|
|
|
|
import '../../ai/tools/tool_definition.dart' as tools;
|
|
import 'llm_service.dart';
|
|
|
|
/// HuggingFace access token, read from the compile-time environment.
///
/// Supply it with `--dart-define=HF_TOKEN=hf_xxx`. When the define is
/// absent the value is the empty string, which is an accepted
/// configuration: per the original design note, flutter_gemma only needs
/// the token for an initial network download (handled separately by
/// `ModelLifecycle`), and activating a model from a local file path
/// generally does not require it.
const String _hfToken = String.fromEnvironment(
  'HF_TOKEN',
  defaultValue: '',
);
|
|
|
|
/// Whether [FlutterGemma.initialize] has already completed in this isolate.
///
/// Acts as a one-shot latch: it is flipped to `true` only after the
/// initialize call has finished and is never reset, because the underlying
/// plugin does not support re-initialization.
bool _initialized = false;
|
|
|
|
/// Real on-device LLM backend using flutter_gemma 0.16.5 + Gemma 4 E2B.
///
/// Wired into the existing #215 pipeline: `ModelLifecycle` downloads and
/// SHA-verifies the .litertlm file, then [load] registers that file with
/// flutter_gemma as the active model. [generateStructured] opens a
/// short-lived chat with a single [Tool] (Gemma 4 native function
/// calling) and returns the first matching [FunctionCallResponse]'s args.
///
/// Function-calling design notes (see fn-gemma_llm_service.md §B v2):
///  - Gemma 4 SDK injects the tool declaration via its chat template, so
///    we pass [Tool] to `createChat(tools: ...)` rather than appending a
///    schema instruction to the prompt (double-wrap risk).
///  - `ToolChoice.required` forces the model to emit a function call.
class GemmaLlmService implements LlmService {
  /// Path of the local model file that [load] installs.
  final String modelPath;

  /// Token budget forwarded to `FlutterGemma.getActiveModel` by [load].
  final int maxTokens;

  /// Creates a service for the model file at [modelPath].
  ///
  /// [maxTokens] defaults to 2048, the value this service previously
  /// hard-coded, so existing call sites behave exactly as before.
  GemmaLlmService({required this.modelPath, this.maxTokens = 2048});

  InferenceModel? _model;
  bool _loaded = false;

  /// The in-flight [_doLoad] future, or `null` when no load is running.
  Future<void>? _loadingFuture;

  /// Whether a model is currently loaded and ready for chat creation.
  @override
  bool get isLoaded => _loaded;

  /// Loads the model, sharing one native load between concurrent callers.
  ///
  /// #311 AC7: if a load is already in flight on this instance (e.g. the
  /// `ChatScreen` warm-up plus a racing `userTurn` lazy load), the same
  /// future is returned instead of starting a second native init.
  /// Returns immediately when the model is already loaded.
  /// See `docs/design/311-llm-warmup/fn-concurrent_load_guard.md`.
  ///
  /// Completes with a [FileSystemException] when [modelPath] is missing,
  /// or with whatever error the flutter_gemma calls raise.
  @override
  Future<void> load() {
    if (_loaded) return Future.value();
    final inFlight = _loadingFuture;
    if (inFlight != null) return inFlight;
    final started = _doLoad();
    _loadingFuture = started;
    // Clear the guard on success *and* failure so a failed load can be
    // retried by a later call.
    return started.whenComplete(() {
      _loadingFuture = null;
    });
  }

  /// Performs the actual load; callers must go through [load].
  ///
  /// Throws [FileSystemException] when [modelPath] does not exist.
  Future<void> _doLoad() async {
    if (!await File(modelPath).exists()) {
      throw FileSystemException('model file missing', modelPath);
    }
    if (!_initialized) {
      await FlutterGemma.initialize(huggingFaceToken: _hfToken);
      _initialized = true;
    }
    await FlutterGemma.installModel(
      modelType: ModelType.gemma4,
      fileType: ModelFileType.litertlm,
    ).fromFile(modelPath).install();
    _model = await FlutterGemma.getActiveModel(maxTokens: maxTokens);
    _loaded = true;
  }

  /// Releases the loaded model, if any, and marks the service unloaded.
  ///
  /// If a [load] is still in flight, this waits for it to settle first.
  /// Otherwise the pending load would finish *after* the state was cleared
  /// here, flip the service back to loaded and leave a model that nobody
  /// closes. Errors from the pending load and from closing the model are
  /// swallowed: unloading is best-effort and load failures are already
  /// delivered to the callers of [load].
  @override
  Future<void> unload() async {
    final pending = _loadingFuture;
    if (pending != null) {
      try {
        await pending;
      } catch (_) {
        // Reported to whoever awaited load(); nothing to release here.
      }
    }
    final model = _model;
    _model = null;
    _loaded = false;
    if (model == null) return;
    try {
      await model.close();
    } catch (_) {
      // Best-effort cleanup — runtime may already be torn down.
    }
  }

  /// Runs [prompt] through a one-shot chat that must call the function
  /// described by [schema], and returns that call's arguments.
  ///
  /// [schema] must contain a non-empty String `name` and a Map
  /// `parameters`; `description` is optional but must be a String when
  /// present.
  ///
  /// Throws [StateError] when the service is not loaded, [ArgumentError]
  /// when [schema] is malformed, and [FormatException] (from
  /// [collectFunctionCall]) when the model output is unusable.
  @override
  Future<Map<String, dynamic>> generateStructured(
    String prompt,
    Map<String, dynamic> schema,
  ) async {
    final model = _model;
    if (!_loaded || model == null) {
      throw StateError('LlmService not loaded');
    }
    final fnName = schema['name'];
    final fnParams = schema['parameters'];
    if (fnName is! String || fnName.isEmpty) {
      throw ArgumentError('schema.name missing');
    }
    if (fnParams is! Map) {
      throw ArgumentError('schema.parameters missing');
    }
    // Validate instead of casting so a wrongly typed description surfaces
    // as the same ArgumentError family as the other schema checks.
    final Object? rawDesc = schema['description'];
    if (rawDesc != null && rawDesc is! String) {
      throw ArgumentError('schema.description must be a String');
    }
    final fnDesc = rawDesc is String ? rawDesc : '';
    final tool = Tool(
      name: fnName,
      description: fnDesc,
      parameters: Map<String, dynamic>.from(fnParams),
    );

    final chat = await model.createChat(
      modelType: ModelType.gemma4,
      supportsFunctionCalls: true,
      toolChoice: ToolChoice.required,
      tools: [tool],
    );
    try {
      await chat.addQueryChunk(Message.text(text: prompt, isUser: true));
      final stream = chat.generateChatResponseAsync();
      return await collectFunctionCall(stream, fnName);
    } finally {
      try {
        await chat.close();
      } catch (_) {
        // Native session close failure is non-fatal and is deliberately
        // ignored so it cannot mask the result or error from above.
      }
    }
  }

  /// Opens a multi-turn chat session exposing [tools] to the model.
  ///
  /// Each [tools.ToolDefinition] is converted to a flutter_gemma [Tool].
  /// The caller owns the returned session and is responsible for closing
  /// it.
  ///
  /// Throws [StateError] when the service is not loaded.
  @override
  Future<LlmChatSession> startChat({
    required List<tools.ToolDefinition> tools,
  }) async {
    final model = _model;
    if (!_loaded || model == null) {
      throw StateError('LlmService not loaded');
    }
    final gemmaTools = tools
        .map((t) => Tool(
              name: t.name,
              description: t.description,
              parameters: Map<String, dynamic>.from(t.parametersSchema),
            ))
        .toList();
    final chat = await model.createChat(
      modelType: ModelType.gemma4,
      supportsFunctionCalls: true,
      // ToolChoice.auto = the model decides on its own (supports both
      // multi-tool and reply-only turns).
      toolChoice: ToolChoice.auto,
      tools: gemmaTools,
    );
    return _GemmaChatSession(chat);
  }
}
|
|
|
|
/// [LlmChatSession] adapter around a flutter_gemma chat object.
///
/// The chat handle is stored as `dynamic`, so `addQueryChunk`,
/// `generateChatResponseAsync` and `close` are resolved at runtime.
class _GemmaChatSession implements LlmChatSession {
  _GemmaChatSession(this._chat);

  final dynamic _chat;

  /// Set by [close]; once true, new turns are rejected.
  bool _closed = false;

  /// Sends a user message and streams the model's reaction to it.
  ///
  /// Throws [StateError] synchronously if the session is closed.
  @override
  Stream<LlmChatEvent> sendUser(String text) {
    _ensureOpen();
    return _run(Message.text(text: text, isUser: true));
  }

  /// Feeds the [result] of tool [toolName] back to the model and streams
  /// the model's follow-up.
  ///
  /// Throws [StateError] synchronously if the session is closed.
  @override
  Stream<LlmChatEvent> sendToolResult({
    required String toolName,
    required Map<String, dynamic> result,
  }) {
    _ensureOpen();
    return _run(Message.toolResponse(toolName: toolName, response: result));
  }

  /// Rejects use of a session that has already been closed.
  void _ensureOpen() {
    if (_closed) {
      throw StateError('LlmChatSession is closed');
    }
  }

  /// Queues [msg] on the chat and translates the response stream.
  ///
  /// Text tokens are forwarded as [LlmTextChunk]s. The first function call
  /// ends the stream so the caller can execute the tool; any other
  /// response kind (e.g. thinking output) is dropped.
  Stream<LlmChatEvent> _run(Message msg) async* {
    await _chat.addQueryChunk(msg);
    final Stream<ModelResponse> responses = _chat.generateChatResponseAsync();
    await for (final response in responses) {
      if (response is TextResponse) {
        yield LlmTextChunk(response.token);
        continue;
      }
      final call = _toFunctionCall(response);
      if (call != null) {
        yield call;
        // Model hands control back to the caller for tool execution.
        return;
      }
    }
  }

  /// Maps [response] to a function-call event, or `null` if it carries
  /// no call.
  ///
  /// ADR-0005: a non-empty parallel call batch is collapsed to its first
  /// entry, so tools are dispatched sequentially.
  static LlmChatEvent? _toFunctionCall(ModelResponse response) {
    if (response is FunctionCallResponse) {
      return LlmFunctionCall(
        response.name,
        Map<String, dynamic>.from(response.args),
      );
    }
    if (response is ParallelFunctionCallResponse &&
        response.calls.isNotEmpty) {
      final head = response.calls.first;
      return LlmFunctionCall(
        head.name,
        Map<String, dynamic>.from(head.args),
      );
    }
    return null;
  }

  /// Closes the underlying chat. Safe to call more than once; only the
  /// first call reaches the chat object, and its errors are ignored.
  @override
  Future<void> close() async {
    if (!_closed) {
      _closed = true;
      try {
        await _chat.close();
      } catch (_) {
        // Best-effort cleanup.
      }
    }
  }
}
|
|
|
|
/// Returns the arguments of the first function call found in [stream].
///
/// The first [FunctionCallResponse], or the first entry of the first
/// non-empty [ParallelFunctionCallResponse], decides the outcome and stops
/// consumption of the stream. Every other event (text, thinking, empty
/// parallel batches) is skipped.
///
/// Throws [FormatException] in three cases:
///  - `'stream error'` when reading the stream or copying the arguments
///    fails; the underlying error is intentionally dropped,
///  - `'unexpected function: <name>'` when the call's name differs from
///    [expectedName],
///  - `'no function call emitted'` when the stream ends without a call.
///
/// Top-level and `@visibleForTesting` so unit tests can feed synthetic
/// streams (see fn-spec §D, 8 test cases).
@visibleForTesting
Future<Map<String, dynamic>> collectFunctionCall(
  Stream<ModelResponse> stream,
  String expectedName,
) async {
  Map<String, dynamic>? args;
  String? unexpectedName;

  try {
    await for (final response in stream) {
      // Pick the call this event contributes, if any.
      final candidate = response is FunctionCallResponse
          ? response
          : (response is ParallelFunctionCallResponse &&
                  response.calls.isNotEmpty
              ? response.calls.first
              : null);
      if (candidate == null) {
        // TextResponse / ThinkingResponse: skip.
        continue;
      }
      if (candidate.name == expectedName) {
        args = Map<String, dynamic>.from(candidate.args);
      } else {
        unexpectedName = candidate.name;
      }
      break;
    }
  } catch (_) {
    // Discard raw error to avoid leaking prompt content in logs/crash
    // reports — the caller surfaces a generic message.
    throw const FormatException('stream error');
  }

  // Outcome errors are raised outside the try block on purpose, so they
  // are not rewritten into the generic 'stream error'.
  if (unexpectedName != null) {
    throw FormatException('unexpected function: $unexpectedName');
  }
  if (args == null) {
    throw const FormatException('no function call emitted');
  }
  return args;
}
|