life-helper/app/lib/data/ai/gemma_llm_service.dart

import 'dart:io';

import 'package:flutter/foundation.dart';
import 'package:flutter_gemma/flutter_gemma.dart';

import '../../ai/tools/tool_definition.dart' as tools;
import 'llm_service.dart';

/// HuggingFace access token supplied at build time through
/// `--dart-define=HF_TOKEN=hf_xxx`.
///
/// Resolves to the empty string when the define is absent, which is
/// allowed: flutter_gemma is only expected to need the token for the
/// initial network download, and that step is handled separately by
/// `ModelLifecycle`. Activating a model from a local file path generally
/// does not require it.
const _hfToken = String.fromEnvironment('HF_TOKEN');

/// Whether [FlutterGemma.initialize] has already completed in this isolate.
///
/// Acts as a one-shot guard: the underlying plugin does not support being
/// initialized a second time.
var _initialized = false;

/// Real on-device LLM backend using flutter_gemma 0.16.5 + Gemma 4 E2B.
///
/// Wired into the existing #215 pipeline: `ModelLifecycle` downloads and
/// SHA-verifies the .litertlm file, then [load] registers that file with
/// flutter_gemma as the active model. [generateStructured] opens a
/// short-lived chat with a single [Tool] (Gemma 4 native function
/// calling) and returns the first matching [FunctionCallResponse]'s args.
/// [startChat] opens a caller-owned, multi-tool session instead.
///
/// Function-calling design notes (see fn-gemma_llm_service.md §B v2):
/// - Gemma 4 SDK injects the tool declaration via its chat template, so
///   we pass [Tool] to `createChat(tools: ...)` rather than appending a
///   schema instruction to the prompt (double-wrap risk).
/// - `ToolChoice.required` forces the model to emit a function call.
class GemmaLlmService implements LlmService {
  /// Filesystem path of the model file that [load] registers.
  final String modelPath;

  GemmaLlmService({required this.modelPath});

  /// Active model handle; `null` until [load] succeeds and after [unload].
  InferenceModel? _model;

  /// Set once [_doLoad] has finished; cleared by [unload].
  bool _loaded = false;

  /// The load currently in flight, if any. Shared by concurrent [load] calls.
  Future<void>? _loadingFuture;

  @override
  bool get isLoaded => _loaded;

  /// Loads the model, or joins a load that is already running.
  ///
  /// #311 AC7: concurrent-call guard. If a load is already in-flight (e.g.
  /// `ChatScreen` warm-up + a racing `userTurn` lazy load), return the same
  /// Future so native init runs at most once per process.
  /// See `docs/design/311-llm-warmup/fn-concurrent_load_guard.md`.
  ///
  /// Completes immediately when the model is already loaded. A failed load
  /// clears the in-flight marker, so a later call starts a fresh attempt.
  @override
  Future<void> load() {
    if (_loaded) return Future.value();
    final existing = _loadingFuture;
    if (existing != null) return existing;
    final future = _doLoad();
    _loadingFuture = future;
    // Reset on success *and* failure so an error does not wedge the guard.
    return future.whenComplete(() {
      _loadingFuture = null;
    });
  }

  /// Performs the actual load: checks the file, initializes the plugin
  /// (once per isolate), installs the file as the active model and fetches
  /// the model handle.
  ///
  /// Throws [FileSystemException] when [modelPath] does not exist.
  Future<void> _doLoad() async {
    if (!await File(modelPath).exists()) {
      throw FileSystemException('model file missing', modelPath);
    }
    if (!_initialized) {
      await FlutterGemma.initialize(huggingFaceToken: _hfToken);
      // Flipped only after a successful init so a failure can be retried.
      _initialized = true;
    }
    await FlutterGemma.installModel(
      modelType: ModelType.gemma4,
      fileType: ModelFileType.litertlm,
    ).fromFile(modelPath).install();
    final model = await FlutterGemma.getActiveModel(maxTokens: 2048);
    _model = model;
    _loaded = true;
  }

  /// Releases the model handle and marks the service as not loaded.
  ///
  /// State is cleared before the native close is awaited, and a failing
  /// close is ignored. Note that this does not cancel or wait for a [load]
  /// that is still in flight: such a load will still publish its model
  /// when it finishes.
  @override
  Future<void> unload() async {
    final m = _model;
    _model = null;
    _loaded = false;
    if (m != null) {
      try {
        await m.close();
      } catch (_) {
        // Best-effort cleanup — runtime may already be torn down.
      }
    }
  }

  /// Runs [prompt] through a one-off chat that must call the single
  /// function described by [schema], and returns that call's arguments.
  ///
  /// [schema] must contain a non-empty String `name` and a Map
  /// `parameters`; `description` is optional and defaults to `''`.
  ///
  /// Throws [StateError] when the service is not loaded, [ArgumentError]
  /// when [schema] is malformed, and [FormatException] (from
  /// [collectFunctionCall]) when the model does not produce the expected
  /// call. The chat is always closed before returning.
  @override
  Future<Map<String, dynamic>> generateStructured(
    String prompt,
    Map<String, dynamic> schema,
  ) async {
    // Copy to a local so the null check promotes (fields do not).
    final model = _model;
    if (!_loaded || model == null) {
      throw StateError('LlmService not loaded');
    }
    final fnName = schema['name'];
    final fnParams = schema['parameters'];
    final rawDesc = schema['description'];
    if (fnName is! String || fnName.isEmpty) {
      throw ArgumentError('schema.name missing');
    }
    if (fnParams is! Map) {
      throw ArgumentError('schema.parameters missing');
    }
    // Validate like the other keys instead of failing on a bare cast.
    if (rawDesc != null && rawDesc is! String) {
      throw ArgumentError('schema.description must be a String');
    }
    final fnDesc = rawDesc is String ? rawDesc : '';
    final tool = Tool(
      name: fnName,
      description: fnDesc,
      parameters: Map<String, dynamic>.from(fnParams),
    );

    final chat = await model.createChat(
      modelType: ModelType.gemma4,
      supportsFunctionCalls: true,
      toolChoice: ToolChoice.required,
      tools: [tool],
    );
    try {
      await chat.addQueryChunk(Message.text(text: prompt, isUser: true));
      final stream = chat.generateChatResponseAsync();
      return await collectFunctionCall(stream, fnName);
    } finally {
      try {
        await chat.close();
      } catch (_) {
        // Native session close failure is non-fatal: it is ignored so the
        // generation result (or its error) is what the caller sees.
      }
    }
  }

  /// Opens a chat session exposing [tools] to the model.
  ///
  /// Each definition is converted to a flutter_gemma [Tool]. The returned
  /// session is owned by the caller, who is responsible for closing it.
  ///
  /// Throws [StateError] when the service is not loaded.
  @override
  Future<LlmChatSession> startChat({
    required List<tools.ToolDefinition> tools,
  }) async {
    final model = _model;
    if (!_loaded || model == null) {
      throw StateError('LlmService not loaded');
    }
    final gemmaTools = tools
        .map((t) => Tool(
              name: t.name,
              description: t.description,
              parameters: Map<String, dynamic>.from(t.parametersSchema),
            ))
        .toList();
    final chat = await model.createChat(
      modelType: ModelType.gemma4,
      supportsFunctionCalls: true,
      // ToolChoice.auto lets the model decide on its own, which supports
      // both multi-tool turns and reply-only turns.
      toolChoice: ToolChoice.auto,
      tools: gemmaTools,
    );
    return _GemmaChatSession(chat);
  }
}

/// [LlmChatSession] adapter around a flutter_gemma chat object.
///
/// Every send appends one message to the wrapped chat and relays the
/// model's reply as [LlmChatEvent]s. A reply stream ends either when the
/// model finishes or as soon as it requests a function call.
class _GemmaChatSession implements LlmChatSession {
  _GemmaChatSession(this._chat);

  // Typed `dynamic`, so calls on it are resolved at runtime.
  final dynamic _chat;

  // Flipped by [close]; further sends are rejected afterwards.
  bool _closed = false;

  /// Throws [StateError] once the session has been closed.
  void _ensureOpen() {
    if (_closed) {
      throw StateError('LlmChatSession is closed');
    }
  }

  /// Sends a user message and streams the model's reply.
  ///
  /// Throws [StateError] synchronously if the session is closed.
  @override
  Stream<LlmChatEvent> sendUser(String text) {
    _ensureOpen();
    final message = Message.text(text: text, isUser: true);
    return _run(message);
  }

  /// Feeds the [result] of tool [toolName] back to the model and streams
  /// its follow-up reply.
  ///
  /// Throws [StateError] synchronously if the session is closed.
  @override
  Stream<LlmChatEvent> sendToolResult({
    required String toolName,
    required Map<String, dynamic> result,
  }) {
    _ensureOpen();
    final message = Message.toolResponse(toolName: toolName, response: result);
    return _run(message);
  }

  /// Builds the event for a function call, copying [args] into a fresh map.
  LlmFunctionCall _functionCall(String name, Map<dynamic, dynamic> args) {
    return LlmFunctionCall(name, Map<String, dynamic>.from(args));
  }

  /// Appends [msg] to the chat and translates the response stream.
  ///
  /// Text tokens become [LlmTextChunk]s. The first function call becomes a
  /// single [LlmFunctionCall] and ends the stream; any other event kind
  /// (e.g. thinking output) is dropped.
  Stream<LlmChatEvent> _run(Message msg) async* {
    await _chat.addQueryChunk(msg);
    final Stream<ModelResponse> responses = _chat.generateChatResponseAsync();
    await for (final response in responses) {
      if (response is TextResponse) {
        yield LlmTextChunk(response.token);
        continue;
      }
      if (response is FunctionCallResponse) {
        yield _functionCall(response.name, response.args);
        // Stop here: the caller executes the tool before the chat resumes.
        return;
      }
      if (response is ParallelFunctionCallResponse &&
          response.calls.isNotEmpty) {
        // ADR-0005: a parallel batch is reduced to its first call so tools
        // are dispatched sequentially.
        final head = response.calls.first;
        yield _functionCall(head.name, head.args);
        return;
      }
      // Anything else (ThinkingResponse, empty batch, ...) is ignored.
    }
  }

  /// Closes the wrapped chat. Safe to call more than once; a failure while
  /// closing is swallowed.
  @override
  Future<void> close() async {
    if (!_closed) {
      _closed = true;
      try {
        await _chat.close();
      } catch (_) {
        // Best-effort cleanup.
      }
    }
  }
}

/// Returns the arguments of the first function call found in [stream],
/// provided that call is named [expectedName].
///
/// Reading stops at the first [FunctionCallResponse] or non-empty
/// [ParallelFunctionCallResponse] (only its first call is considered).
/// All other events, such as text or thinking output, are skipped.
///
/// Throws a [FormatException] with message:
/// - `stream error` if anything is thrown while reading the stream or
///   copying the arguments (the original error is deliberately dropped);
/// - `unexpected function: <name>` if the first call has another name;
/// - `no function call emitted` if the stream ends without a call.
///
/// Kept top-level and marked `@visibleForTesting` so unit tests can feed
/// synthetic streams (see fn-spec §D).
@visibleForTesting
Future<Map<String, dynamic>> collectFunctionCall(
  Stream<ModelResponse> stream,
  String expectedName,
) async {
  Map<String, dynamic>? matchedArgs;
  String? unexpectedName;
  try {
    await for (final response in stream) {
      if (response is FunctionCallResponse) {
        if (response.name != expectedName) {
          unexpectedName = response.name;
        } else {
          matchedArgs = Map<String, dynamic>.from(response.args);
        }
        break;
      } else if (response is ParallelFunctionCallResponse &&
          response.calls.isNotEmpty) {
        final head = response.calls.first;
        if (head.name != expectedName) {
          unexpectedName = head.name;
        } else {
          matchedArgs = Map<String, dynamic>.from(head.args);
        }
        break;
      }
      // Text / thinking / empty parallel batch: keep reading.
    }
  } catch (_) {
    // The underlying error is discarded on purpose: it could carry prompt
    // content into logs or crash reports. Callers show a generic message.
    throw const FormatException('stream error');
  }
  // Raised outside the try block so the catch-all above cannot rewrite
  // these messages into 'stream error'.
  if (unexpectedName != null) {
    throw FormatException('unexpected function: $unexpectedName');
  }
  return matchedArgs ??
      (throw const FormatException('no function call emitted'));
}