Add knowledge structuring feature with incremental LLM processing

- Add structured_content column and STRUCTURING pipeline step
- Split LLM structuring into TOC + per-section calls to avoid token limit
- Save intermediate results to DB for real-time frontend polling (3s)
- Add manual "정리하기" button with async processing
- Fix browser login modal by customizing authentication entry point
- Fix standalone build symlinks for server.js and static files
- Add troubleshooting guide

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:43:21 +00:00
parent afc9cdcde6
commit 9abb770e37
8 changed files with 547 additions and 31 deletions
--- a/sundol-backend/src/main/java/com/sundol/config/SecurityConfig.java
+++ b/sundol-backend/src/main/java/com/sundol/config/SecurityConfig.java
@@ -9,6 +9,8 @@ import org.springframework.security.config.annotation.web.reactive.EnableWebFlux
 import org.springframework.security.config.web.server.SecurityWebFiltersOrder;
 import org.springframework.security.config.web.server.ServerHttpSecurity;
 import org.springframework.security.web.server.SecurityWebFilterChain;
+import org.springframework.security.web.server.ServerAuthenticationEntryPoint;
+import org.springframework.http.HttpStatus;
 import org.springframework.web.cors.CorsConfiguration;
 import org.springframework.web.cors.reactive.CorsConfigurationSource;
 import org.springframework.web.cors.reactive.UrlBasedCorsConfigurationSource;
@@ -41,6 +43,12 @@ public class SecurityConfig {
                .addFilterAt(jwtAuthenticationFilter, SecurityWebFiltersOrder.AUTHENTICATION)
                .httpBasic(ServerHttpSecurity.HttpBasicSpec::disable)
                .formLogin(ServerHttpSecurity.FormLoginSpec::disable)
+                .exceptionHandling(exceptions -> exceptions
+                        .authenticationEntryPoint((exchange, ex) -> {
+                            exchange.getResponse().setStatusCode(HttpStatus.UNAUTHORIZED);
+                            return exchange.getResponse().setComplete();
+                        })
+                )
                .build();
    }

--- a/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java
+++ b/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java
@@ -86,6 +86,16 @@ public class KnowledgeController {
                .then(Mono.just(ResponseEntity.ok().<Void>build()));
    }

+    /** Triggers structuring of one item; the optional JSON body may carry a "modelId". */
+    @PostMapping("/{id}/structure")
+    public Mono<ResponseEntity<Map<String, Object>>> structure(
+            @AuthenticationPrincipal String userId,
+            @PathVariable String id,
+            @RequestBody(required = false) Map<String, String> body) {
+        String requestedModel = (body == null) ? null : body.get("modelId");
+        return knowledgeService.structureContent(userId, id, requestedModel).map(ResponseEntity::ok);
+    }
+
    @GetMapping("/{id}/chunks")
    public Mono<ResponseEntity<List<Map<String, Object>>>> getChunks(
            @AuthenticationPrincipal String userId,
--- a/sundol-backend/src/main/java/com/sundol/repository/KnowledgeRepository.java
+++ b/sundol-backend/src/main/java/com/sundol/repository/KnowledgeRepository.java
@@ -55,7 +55,7 @@ public class KnowledgeRepository {

    public Map<String, Object> findById(String userId, String id) {
        var results = jdbcTemplate.queryForList(
-            "SELECT RAWTOHEX(id) AS id, RAWTOHEX(user_id) AS user_id, type, title, source_url, raw_text, status, created_at, updated_at " +
+            "SELECT RAWTOHEX(id) AS id, RAWTOHEX(user_id) AS user_id, type, title, source_url, raw_text, structured_content, status, created_at, updated_at " +
            "FROM knowledge_items WHERE RAWTOHEX(id) = ? AND user_id = HEXTORAW(?)",
            id, userId
        );
@@ -64,7 +64,7 @@ public class KnowledgeRepository {

    public Map<String, Object> findByIdInternal(String id) {
        var results = jdbcTemplate.queryForList(
-            "SELECT RAWTOHEX(id) AS id, RAWTOHEX(user_id) AS user_id, type, title, source_url, raw_text, status, created_at, updated_at " +
+            "SELECT RAWTOHEX(id) AS id, RAWTOHEX(user_id) AS user_id, type, title, source_url, raw_text, structured_content, status, created_at, updated_at " +
            "FROM knowledge_items WHERE RAWTOHEX(id) = ?",
            id
        );
@@ -85,6 +85,13 @@ public class KnowledgeRepository {
        );
    }

+    public void updateStructuredContent(String id, String structuredContent) {
+        jdbcTemplate.update(
+            "UPDATE knowledge_items SET structured_content = ?, updated_at = SYSTIMESTAMP WHERE RAWTOHEX(id) = ?",
+            structuredContent, id
+        );
+    }
+
    public void delete(String userId, String id) {
        jdbcTemplate.update(
            "DELETE FROM knowledge_items WHERE RAWTOHEX(id) = ? AND user_id = HEXTORAW(?)",
--- a/sundol-backend/src/main/java/com/sundol/service/IngestPipelineService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/IngestPipelineService.java
@@ -50,6 +50,140 @@ public class IngestPipelineService {

    private static final int TITLE_MAX_LENGTH = 80;
    private static final int TEXT_PREVIEW_LENGTH = 3000;
+    private static final int STRUCTURING_MIN_LENGTH = 1000;
+
+    /**
+     * Structures content with the LLM: an abstract and table of contents (TOC), then a detailed
+     * write-up per TOC entry. The first call produces abstract + TOC, one further call per entry
+     * produces its section, and the parts are concatenated. Only the first 30000 characters are used.
+     *
+     * @param text            source text; null or shorter than {@code STRUCTURING_MIN_LENGTH} is skipped
+     * @param modelId         model id passed through to the GenAI chat calls
+     * @param knowledgeItemId item whose structured_content receives intermediate results; null = no saving
+     * @return the assembled result, or null when structuring was skipped or failed
+     */
+    public String structureContent(String text, String modelId, String knowledgeItemId) {
+        if (!genAiService.isConfigured()) {
+            log.info("OCI GenAI not configured, skipping structuring");
+            return null;
+        }
+        // Null is treated like too-short input so the "null when skipped" contract also covers it.
+        if (text == null || text.length() < STRUCTURING_MIN_LENGTH) {
+            log.info("Content too short for structuring ({} chars), skipping", text == null ? 0 : text.length());
+            return null;
+        }
+        try {
+            String content = text.length() > 30000 ? text.substring(0, 30000) : text;
+            // === Call 1: generate abstract + TOC ===
+            String tocSystemMsg =
+                    "당신은 콘텐츠 분석 전문가입니다. 주어진 원본 텍스트를 분석하여 요약과 목차만 생성해주세요.\n\n" +
+                    "## 규칙\n" +
+                    "1. 원본 언어와 같은 언어로 작성하세요.\n" +
+                    "2. Markdown 형식으로 작성하세요.\n" +
+                    "3. 아래 구조를 반드시 따르세요:\n\n" +
+                    "```\n" +
+                    "# 요약 (Abstract)\n" +
+                    "(핵심 내용을 3~5문장으로 요약)\n\n" +
+                    "# 목차\n" +
+                    "1. 첫 번째 주제\n" +
+                    "2. 두 번째 주제\n" +
+                    "...\n" +
+                    "```\n\n" +
+                    "4. 목차 항목은 원본 내용의 흐름에 맞게 논리적으로 나누세요.\n" +
+                    "5. 목차는 5~15개 사이로 적절히 나누세요.\n" +
+                    "6. 목차에는 번호와 제목만 넣고, 상세 내용은 넣지 마세요.\n" +
+                    "7. 원본에 없는 내용을 추가하지 마세요.";
+
+            String tocUserMsg = "아래 원본 텍스트의 요약과 목차를 생성해주세요:\n\n" + content;
+            String tocResult = genAiService.chat(tocSystemMsg, tocUserMsg, modelId).strip();
+            log.info("Phase 1 - TOC generated: {} chars", tocResult.length());
+            // Save the phase-1 result right away as an intermediate result.
+            if (knowledgeItemId != null) {
+                knowledgeRepository.updateStructuredContent(knowledgeItemId, tocResult);
+            }
+            // Parse the TOC entries; without any, the abstract + TOC text is the final result.
+            List<String> tocItems = parseTocItems(tocResult);
+            if (tocItems.isEmpty()) {
+                log.warn("No TOC items parsed, returning TOC-only result");
+                return tocResult;
+            }
+            log.info("Parsed {} TOC items: {}", tocItems.size(), tocItems);
+
+            // === Calls 2..n: detailed write-up per TOC entry ===
+            StringBuilder fullResult = new StringBuilder(tocResult).append("\n\n");
+
+            String sectionSystemMsg =
+                    "당신은 콘텐츠 정리 전문가입니다. 주어진 원본 텍스트에서 지정된 섹션에 해당하는 내용만 상세히 정리해주세요.\n\n" +
+                    "## 규칙\n" +
+                    "1. 원본의 의미를 절대 왜곡하거나 생략하지 마세요. 디테일을 최대한 살려주세요.\n" +
+                    "2. 원본 언어와 같은 언어로 작성하세요.\n" +
+                    "3. Markdown 형식으로 작성하세요.\n" +
+                    "4. 불릿 포인트, 번호 매기기, 굵은 글씨 등을 활용하여 가독성을 높이세요.\n" +
+                    "5. 원본에 없는 내용을 추가하지 마세요.\n" +
+                    "6. 해당 섹션과 관련 없는 내용은 포함하지 마세요.\n" +
+                    "7. 섹션 제목은 '# 번호. 제목' 형식으로 시작하세요.";
+
+            for (int i = 0; i < tocItems.size(); i++) {
+                String tocItem = tocItems.get(i);
+                String sectionUserMsg = "원본 텍스트에서 아래 섹션에 해당하는 내용을 상세히 정리해주세요.\n\n" +
+                        "## 정리할 섹션\n" +
+                        (i + 1) + ". " + tocItem + "\n\n" +
+                        "## 원본 텍스트\n" + content;
+
+                try {
+                    String sectionResult = genAiService.chat(sectionSystemMsg, sectionUserMsg, modelId).strip();
+                    fullResult.append(sectionResult).append("\n\n");
+                    log.info("Phase 2 - Section {} '{}' generated: {} chars", i + 1, tocItem, sectionResult.length());
+                } catch (Exception e) {
+                    log.warn("Failed to generate section {}: {}", i + 1, e.getMessage(), e);
+                    fullResult.append("# ").append(i + 1).append(". ").append(tocItem).append("\n\n")
+                              .append("(정리 실패)\n\n");
+                }
+                // Save after every section, failed or not, so frontend polling sees the progress.
+                if (knowledgeItemId != null) {
+                    knowledgeRepository.updateStructuredContent(knowledgeItemId, fullResult.toString().strip());
+                }
+            }
+
+            String result = fullResult.toString().strip();
+            log.info("Structured content generated: {} chars ({} sections)", result.length(), tocItems.size());
+            return result;
+        } catch (Exception e) {
+            log.warn("Content structuring failed", e);
+            return null;
+        }
+    }
+
+    /**
+     * 목차 텍스트에서 항목들을 파싱한다.
+     * "1. 첫 번째 주제" 형태의 줄을 추출.
+     */
+    private List<String> parseTocItems(String tocText) {
+        List<String> items = new java.util.ArrayList<>();
+        java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("^\\d+\\.\\s+(.+)$", java.util.regex.Pattern.MULTILINE);
+        java.util.regex.Matcher matcher = pattern.matcher(tocText);
+
+        // "# 목차" 이후의 내용만 파싱
+        int tocStart = tocText.indexOf("# 목차");
+        if (tocStart == -1) tocStart = tocText.indexOf("# Table of Contents");
+        if (tocStart == -1) tocStart = 0;
+        String tocSection = tocText.substring(tocStart);
+
+        // 목차 섹션 이후 다음 '#'이 나오기 전까지만 파싱
+        int nextSection = tocSection.indexOf("\n#", 2);
+        if (nextSection > 0) {
+            tocSection = tocSection.substring(0, nextSection);
+        }
+
+        matcher = pattern.matcher(tocSection);
+        while (matcher.find()) {
+            String item = matcher.group(1).strip();
+            if (!item.isBlank()) {
+                items.add(item);
+            }
+        }
+        return items;
+    }

    /**
     * LLM으로 내용 기반 제목 생성. 실패 시 텍스트 앞부분으로 폴백.
@@ -178,6 +312,24 @@ public class IngestPipelineService {
        }
    }

+    /** Background structuring for the manual frontend request; always resets status to READY. */
+    @Async
+    public void runStructuring(String knowledgeItemId, String text, String modelId) {
+        try {
+            String structured = structureContent(text, modelId, knowledgeItemId);
+            if (structured == null || structured.isBlank()) {
+                log.warn("Manual structuring produced no content for item {}", knowledgeItemId);
+            } else {
+                knowledgeRepository.updateStructuredContent(knowledgeItemId, structured);
+                log.info("Manual structuring complete for item {}", knowledgeItemId);
+            }
+        } catch (Exception e) {
+            log.error("Manual structuring failed for item {}", knowledgeItemId, e);
+        } finally {
+            knowledgeRepository.updateStatus(knowledgeItemId, "READY");
+        }
+    }
+
    @Async
    public void runPipeline(String knowledgeItemId, String modelId) {
        try {
@@ -243,7 +395,19 @@ public class IngestPipelineService {
                knowledgeRepository.updateTitle(knowledgeItemId, autoTitle);
            }

-            // Step 2: Chunk
+            // Step 2: Structure content (1000자 이상일 때만)
+            knowledgeRepository.updateStatus(knowledgeItemId, "STRUCTURING");
+            try {
+                String structured = structureContent(extractedText, modelId, knowledgeItemId);
+                if (structured != null && !structured.isBlank()) {
+                    knowledgeRepository.updateStructuredContent(knowledgeItemId, structured);
+                    log.info("Item {} structured: {} chars", knowledgeItemId, structured.length());
+                }
+            } catch (Exception e) {
+                log.warn("Structuring failed for item {}, continuing pipeline", knowledgeItemId, e);
+            }
+
+            // Step 3: Chunk
            knowledgeRepository.updateStatus(knowledgeItemId, "CHUNKING");
            List<String> chunks = chunkingService.chunk(extractedText);
            log.info("Item {} chunked into {} pieces", knowledgeItemId, chunks.size());
@@ -254,11 +418,11 @@ public class IngestPipelineService {
                chunkRepository.insertChunk(knowledgeItemId, i, chunkContent, tokenCount);
            }

-            // Step 3: Categorize
+            // Step 4: Categorize
            knowledgeRepository.updateStatus(knowledgeItemId, "CATEGORIZING");
            categorize(knowledgeItemId, (String) item.get("USER_ID"), extractedText, modelId);

-            // Step 4: Embedding
+            // Step 5: Embedding
            knowledgeRepository.updateStatus(knowledgeItemId, "EMBEDDING");
            embedChunks(knowledgeItemId, chunks);

--- a/sundol-backend/src/main/java/com/sundol/service/KnowledgeService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/KnowledgeService.java
@@ -76,6 +76,42 @@ public class KnowledgeService {
        }).subscribeOn(Schedulers.boundedElastic());
    }

+    /**
+     * Validates the item, marks it STRUCTURING, starts structuring and returns the re-read item.
+     * The text is raw_text, or the item's chunks joined with blank lines when raw_text is blank.
+     * The Mono fails with AppException: NOT_FOUND for an unknown item, BAD_REQUEST without text.
+     */
+    public Mono<Map<String, Object>> structureContent(String userId, String id, String modelId) {
+        return Mono.fromCallable(() -> {
+            Map<String, Object> item = knowledgeRepository.findById(userId, id);
+            if (item == null) {
+                throw new AppException(HttpStatus.NOT_FOUND, "Knowledge item not found");
+            }
+            Object rawTextObj = item.get("RAW_TEXT");
+            String text = rawTextObj != null ? rawTextObj.toString() : null;
+            if (text == null || text.isBlank()) {
+                // WEB items may have no raw_text, so rebuild the text from the stored chunks.
+                text = chunkRepository.findByKnowledgeItemId(id).stream()
+                        .map(c -> c.get("CONTENT").toString())
+                        .collect(java.util.stream.Collectors.joining("\n\n"));
+            }
+            if (text.isBlank()) {
+                throw new AppException(HttpStatus.BAD_REQUEST, "No content to structure");
+            }
+            // Mark as STRUCTURING first so that frontend polling shows the run as in progress.
+            knowledgeRepository.updateStatus(id, "STRUCTURING");
+            try {
+                // Runs asynchronously; the pipeline service writes intermediate results to the DB.
+                pipelineService.runStructuring(id, text, modelId);
+            } catch (RuntimeException e) {
+                // If the call itself fails (e.g. task rejected), do not leave the item in STRUCTURING.
+                knowledgeRepository.updateStatus(id, "READY");
+                throw e;
+            }
+            return knowledgeRepository.findById(userId, id);
+        }).subscribeOn(Schedulers.boundedElastic());
+    }
+
    public Mono<Void> delete(String userId, String id) {
        return Mono.fromRunnable(() -> knowledgeRepository.delete(userId, id))
                .subscribeOn(Schedulers.boundedElastic()).then();