From cbc5ba5663d26a26b5b392b16377662aa78b50aa Mon Sep 17 00:00:00 2001 From: joungmin Date: Wed, 10 Jun 2026 04:36:50 +0000 Subject: [PATCH] =?UTF-8?q?=EC=A0=95=EB=B6=80=EC=A7=80=EC=9B=90=EC=82=AC?= =?UTF-8?q?=EC=97=85=20=EA=B3=B5=EA=B3=A0=20=EC=88=98=EC=A7=91=20=EB=8D=B0?= =?UTF-8?q?=EB=AA=AC(gov-scraper)=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - government/ Node 데몬: Open API 우선 + HTML 보조 + 디스커버리 전략 - Strategy 패턴 소스 어댑터: KStartupApiSource(공공데이터 Open API), GenericHtmlSource(config 기반) - sundol 3단계 폴백 크롤러(cheerio→Jina→Playwright CDP) Node 재구현, sundol-chrome(9222) 재사용 - Oracle thick 모드(Instant Client + sso 지갑) 접속, gov_source/gov_opportunity 적재(중복제거) - K-Startup 29,017건 + 중기부(mss) 30건 적재 검증, PM2 gov-daemon 등록(60분 주기) - 기업마당(bizinfo)은 자체 crtfcKey 발급 대기 Co-Authored-By: Claude Opus 4.8 (1M context) --- ecosystem.config.cjs | 10 + government/.gitignore | 4 + government/README.md | 78 +++++ government/db/schema.sql | 54 ++++ government/docs/sources-catalog.md | 27 ++ government/package-lock.json | 349 +++++++++++++++++++++++ government/package.json | 19 ++ government/src/bootstrap.js | 19 ++ government/src/cli.js | 57 ++++ government/src/config.js | 43 +++ government/src/crawler/browser.js | 68 +++++ government/src/crawler/crawler.js | 117 ++++++++ government/src/daemon.js | 62 ++++ government/src/db.js | 60 ++++ government/src/logger.js | 10 + government/src/pipeline.js | 91 ++++++ government/src/sources/base.js | 62 ++++ government/src/sources/genericHtml.js | 126 ++++++++ government/src/sources/htmlSources.js | 28 ++ government/src/sources/kstartup.js | 92 ++++++ government/src/sources/registry.js | 32 +++ government/src/store/opportunityStore.js | 197 +++++++++++++ government/src/util.js | 34 +++ 23 files changed, 1639 insertions(+) create mode 100644 government/.gitignore create mode 100644 government/README.md create mode 100644 government/db/schema.sql create mode 100644 
government/docs/sources-catalog.md create mode 100644 government/package-lock.json create mode 100644 government/package.json create mode 100644 government/src/bootstrap.js create mode 100644 government/src/cli.js create mode 100644 government/src/config.js create mode 100644 government/src/crawler/browser.js create mode 100644 government/src/crawler/crawler.js create mode 100644 government/src/daemon.js create mode 100644 government/src/db.js create mode 100644 government/src/logger.js create mode 100644 government/src/pipeline.js create mode 100644 government/src/sources/base.js create mode 100644 government/src/sources/genericHtml.js create mode 100644 government/src/sources/htmlSources.js create mode 100644 government/src/sources/kstartup.js create mode 100644 government/src/sources/registry.js create mode 100644 government/src/store/opportunityStore.js create mode 100644 government/src/util.js diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs index c9629e4..59a40e9 100644 --- a/ecosystem.config.cjs +++ b/ecosystem.config.cjs @@ -31,5 +31,15 @@ module.exports = { NEXT_PUBLIC_GOOGLE_CLIENT_ID: "906390686133-vpqsisodkg6uqui469hg8dhupbejoa0d.apps.googleusercontent.com", }, }, + { + name: "gov-daemon", + script: "src/daemon.js", + interpreter: "/usr/local/bin/node", + cwd: "/home/opc/sundol/government", + env: { + // Oracle Instant Client(thick 모드) 의존 라이브러리 경로 + LD_LIBRARY_PATH: "/home/opc/oracle-ic/instantclient_23_26", + }, + }, ], }; diff --git a/government/.gitignore b/government/.gitignore new file mode 100644 index 0000000..b65837d --- /dev/null +++ b/government/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +*.log +# DB 접속 net 설정(지갑 경로/접속 디스크립터) — 환경별 재생성 +oracle-net/ diff --git a/government/README.md b/government/README.md new file mode 100644 index 0000000..876fa59 --- /dev/null +++ b/government/README.md @@ -0,0 +1,78 @@ +# 정부지원사업 수집 데몬 (gov-scraper) + +한국 정부지원사업 공고를 주기적으로 수집해 Oracle DB(`gov_opportunity`)에 적재하는 Node.js 데몬. 
+ +## 전략 + +**Open API 우선 + HTML 보조 + 디스커버리 확장.** 정부지원사업 공고는 소수 허브 포털에 +대부분 집계되므로, API가 있는 곳은 API로(안 깨짐), 없는 곳은 HTML로 긁고, 부족분은 +디스커버리로 소스를 넓힌다. + +## 아키텍처 + +``` +src/ +├── config.js 환경설정(루트 .env 로드) +├── bootstrap.js LD_LIBRARY_PATH(Instant Client) 보정 후 재실행 +├── db.js Oracle thick 모드 접속(sso 지갑 재사용) +├── crawler/ +│ ├── browser.js sundol-chrome(CDP 9222) 연결 — 기존 인프라 재사용 +│ └── crawler.js 3단계 폴백(cheerio → Jina → Playwright) [Facade] +├── sources/ [Strategy] 소스별 어댑터 +│ ├── base.js OpportunitySource 인터페이스 +│ ├── kstartup.js K-Startup Open API (data.go.kr 서비스키) +│ ├── genericHtml.js config 기반 범용 HTML 게시판 스크래퍼 +│ ├── htmlSources.js HTML 소스 config 목록(여기에 추가) +│ └── registry.js 가용 소스 집계(키 없는 소스 자동 제외) +├── store/ gov_source/gov_opportunity 적재(중복제거) +├── pipeline.js 목록 수집 → 적재 → 상세 본문 수집 +├── daemon.js 주기 폴링 데몬(PM2) +└── cli.js 수동 실행(test-db / test-crawl / run-once) +``` + +- **중복 제거**: `(source_code, external_id)` 유니크 키. external_id 는 API 고유키(pbanc_sn 등) + 또는 게시판 글번호. +- **상세 본문**: API 소스는 목록 단계에서 본문까지 한 번에 적재(단일 패스). HTML 소스는 + 목록 적재 후 detail_url 을 3단계 크롤러로 긁는 2-패스. + +## DB 접속 (중요) + +node-oracledb **thick 모드** + Oracle Instant Client 를 쓴다. 백엔드 JDBC 와 동일하게 +자동로그인 지갑(`cwallet.sso`)을 재사용하므로 **지갑 비밀번호가 필요 없다**. +(thin 모드는 sso 를 못 읽어 지갑 비밀번호가 필요한데, 그 비밀번호는 어디에도 저장돼 있지 않음) + +- Instant Client: `/home/opc/oracle-ic/instantclient_23_26` (`.env` 의 `ORACLE_IC_LIB_DIR`) +- net 설정: `government/oracle-net/` — 지갑의 `sqlnet.ora` 가 `WALLET_LOCATION` 을 + `?/network/admin` 로 가리켜 instant client 가 sso 를 못 여는 문제를 보정한 전용 설정. 
+ +## 실행 + +```bash +cd government +node src/cli.js test-db # DB 접속 확인 +node src/cli.js run-once kstartup # K-Startup 1회 수집 +node src/cli.js run-once mss # 중기부 게시판 1회 수집 +node src/cli.js run-once # 가용 소스 전체 1회 +node src/cli.js test-crawl # 크롤러 단독 테스트 + +# 데몬(PM2) +pm2 start /home/opc/sundol/ecosystem.config.cjs --only gov-daemon +pm2 logs gov-daemon +``` + +## 새 소스 추가 + +- **HTML 게시판**: `src/sources/htmlSources.js` 의 `HTML_SOURCE_CONFIGS` 에 항목 추가 + (listUrl, rowSelector, externalId 정규식, detailUrl 템플릿). 코드 로직 수정 불필요. +- **API**: `src/sources/` 에 `OpportunitySource` 상속 어댑터 작성 후 `registry.js` 등록. + +## 디스커버리 (소스 발굴) + +데몬 자체는 웹 검색을 못 하므로, 신규 소스 발굴은 Claude(WebSearch)가 수행해 +`htmlSources.js` 또는 `gov_source` 에 등록한다. 후보 목록은 `docs/sources-catalog.md` 참조. + +## 미완 / TODO + +- **기업마당(bizinfo)**: 자체 인증키(`crtfcKey`, bizinfo.go.kr 별도 신청) 필요. + `.env` 의 `BIZINFO_CRTFC_KEY` 발급 후 어댑터 추가 예정. (data.go.kr 키와 별개) +- 중소벤처24(smes), 지자체/부처 게시판 추가. diff --git a/government/db/schema.sql b/government/db/schema.sql new file mode 100644 index 0000000..264566b --- /dev/null +++ b/government/db/schema.sql @@ -0,0 +1,54 @@ +-- 정부지원사업 스크래퍼 스키마 +-- 기존 sundol 컨벤션 준수: snake_case 테이블, RAW(16) id(SYS_GUID()), TIMESTAMP(SYSTIMESTAMP) +-- 실행: SQLcl 에서 @government/db/schema.sql + +-- ============================================================ +-- gov_source : 공고 소스(사이트) 목록. Strategy 어댑터가 이 행을 읽어 동작한다. 
+-- ============================================================ +CREATE TABLE gov_source ( + id RAW(16) DEFAULT SYS_GUID() PRIMARY KEY, + code VARCHAR2(50) NOT NULL, -- 어댑터 식별자 (예: kstartup, bizinfo, smes) + name VARCHAR2(200) NOT NULL, -- 표시명 + base_url VARCHAR2(500), -- 기준 URL + type VARCHAR2(20) NOT NULL, -- API | HTML + config CLOB, -- 어댑터 설정(JSON): endpoint, params, selectors 등 + active NUMBER(1) DEFAULT 1 NOT NULL, -- 1=활성, 0=비활성 + last_crawled_at TIMESTAMP, + created_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL, + updated_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL, + CONSTRAINT gov_source_code_uq UNIQUE (code), + CONSTRAINT gov_source_type_ck CHECK (type IN ('API', 'HTML')), + CONSTRAINT gov_source_active_ck CHECK (active IN (0, 1)) +); + +-- ============================================================ +-- gov_opportunity : 수집된 공고. (source_code, external_id) 로 중복 제거. +-- external_id 는 항상 채운다. HTML 소스는 detail_url 해시로 채운다. +-- ============================================================ +CREATE TABLE gov_opportunity ( + id RAW(16) DEFAULT SYS_GUID() PRIMARY KEY, + source_id RAW(16) NOT NULL, + source_code VARCHAR2(50) NOT NULL, -- 비정규화(조회 편의) + external_id VARCHAR2(200) NOT NULL, -- 소스 고유 키(pbancSn 등) 또는 detail_url 해시 + title VARCHAR2(1000 CHAR) NOT NULL, + agency VARCHAR2(300 CHAR), -- 소관/주관기관 + category VARCHAR2(200 CHAR), -- 지원분야 + target VARCHAR2(1000 CHAR), -- 지원대상 + apply_start DATE, + apply_end DATE, + detail_url VARCHAR2(1000), + body_text CLOB, -- 상세 본문(스크랩) + raw_json CLOB, -- 원본 API/스크랩 데이터 + status VARCHAR2(20) DEFAULT 'LISTED' NOT NULL, -- LISTED | DETAILED | CLOSED | ERROR + list_collected_at TIMESTAMP, -- 목록 수집 시각 + detail_collected_at TIMESTAMP, -- 상세 수집 시각 + created_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL, + updated_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL, + CONSTRAINT gov_opp_source_fk FOREIGN KEY (source_id) REFERENCES gov_source (id), + CONSTRAINT gov_opp_dedup_uq UNIQUE (source_code, external_id), + CONSTRAINT gov_opp_status_ck CHECK 
(status IN ('LISTED', 'DETAILED', 'CLOSED', 'ERROR')) +); + +CREATE INDEX gov_opp_status_ix ON gov_opportunity (status); +CREATE INDEX gov_opp_apply_end_ix ON gov_opportunity (apply_end); +CREATE INDEX gov_opp_source_ix ON gov_opportunity (source_id); diff --git a/government/docs/sources-catalog.md b/government/docs/sources-catalog.md new file mode 100644 index 0000000..f7e4e40 --- /dev/null +++ b/government/docs/sources-catalog.md @@ -0,0 +1,27 @@ +# 정부지원사업 소스 카탈로그 (디스커버리 결과) + +Claude WebSearch 로 수집한 공고 소스 후보. 상태가 `구현`인 것만 데몬이 수집한다. + +| 코드 | 소스 | URL | 방식 | 키 | 상태 | +|---|---|---|---|---|---| +| kstartup | K-Startup 창업지원 공고 | k-startup.go.kr | Open API | data.go.kr 서비스키 | ✅ 구현·검증 | +| mss | 중소벤처기업부 사업공고 | mss.go.kr (cbIdx=310) | HTML 게시판 | 불필요 | ✅ 구현·검증 | +| bizinfo | 기업마당 지원사업정보 | bizinfo.go.kr | Open API(자체) | bizinfo `crtfcKey`(별도신청) | ⏳ 키 대기 | +| smes | 중소벤처24 사업공고 | smes.go.kr | HTML | 불필요 | 🔲 후보 | +| g2b | 나라장터(입찰/조달) | g2b.go.kr | Open API(data.go.kr) | 서비스키 | 🔲 후보 | +| 부처/지자체 | 각 부처·지자체 게시판 | 다수 | HTML(GenericHtml) | 불필요 | 🔲 디스커버리 확장 | + +## 핵심 메모 + +- **커버리지**: 기업마당 + K-Startup 두 API 가 정부지원사업 공고의 대부분을 집계. + 기업마당 키 확보가 다음 우선순위. +- **키 체계 주의**: 기업마당은 data.go.kr 가 아니라 bizinfo.go.kr 자체 인증키(`crtfcKey`)를 쓴다. + data.go.kr 서비스키와 별개. 엔드포인트: `https://www.bizinfo.go.kr/uss/rss/bizinfoApi.do?crtfcKey=...&dataType=json` +- **HTML 확장**: 부처/지자체 게시판은 대부분 정적 렌더링 표(table)라 `GenericHtmlSource` + config 로 코드 수정 없이 추가 가능(mss 사례 참조). 
+ +## 참고 링크 + +- 기업마당 API: https://www.bizinfo.go.kr/web/lay1/program/S1T175C174/apiList.do +- K-Startup API(data.go.kr): https://www.data.go.kr/data/15125364/openapi.do +- 중소벤처24: https://www.smes.go.kr/main/bizApply diff --git a/government/package-lock.json b/government/package-lock.json new file mode 100644 index 0000000..987755c --- /dev/null +++ b/government/package-lock.json @@ -0,0 +1,349 @@ +{ + "name": "gov-scraper", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "gov-scraper", + "version": "0.1.0", + "dependencies": { + "cheerio": "^1.0.0", + "dotenv": "^16.4.5", + "oracledb": "^6.5.1", + "playwright-core": "^1.49.0" + } + }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, + "node_modules/cheerio": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.2.0.tgz", + "integrity": "sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg==", + "license": "MIT", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.2", + "encoding-sniffer": "^0.2.1", + "htmlparser2": "^10.1.0", + "parse5": "^7.3.0", + "parse5-htmlparser2-tree-adapter": "^7.1.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^7.19.0", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=20.18.1" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "license": "BSD-2-Clause", + "dependencies": { 
+ "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-select": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz", + "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz", + "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + 
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/encoding-sniffer": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz", + "integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": 
"https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/htmlparser2": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.1.0.tgz", + "integrity": "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.2", + "entities": "^7.0.1" + } + }, + "node_modules/htmlparser2/node_modules/entities": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-7.0.1.tgz", + "integrity": "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, + "node_modules/oracledb": { + "version": "6.10.0", + "resolved": "https://registry.npmjs.org/oracledb/-/oracledb-6.10.0.tgz", + "integrity": 
"sha512-kGUumXmrEWbSpBuKJyb9Ip3rXcNgKK6grunI3/cLPzrRvboZ6ZoLi9JQ+z6M/RIG924tY8BLflihL4CKKQAYMA==", + "hasInstallScript": true, + "license": "(Apache-2.0 OR UPL-1.0)", + "engines": { + "node": ">=14.17" + } + }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "license": "MIT", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5/node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/playwright-core": { + "version": "1.60.0", + "resolved": 
"https://registry.npmjs.org/playwright-core/-/playwright-core-1.60.0.tgz", + "integrity": "sha512-9bW6zvX/m0lEbgTKJ6YppOKx8H3VOPBMOCFh2irXFOT4BbHgrx5hPjwJYLT40Lu+4qtD36qKc/Hn56StUW57IA==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, + "node_modules/undici": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.27.1.tgz", + "integrity": "sha512-UDdpiex+mzigiyrXrGbiUaF4HzTNhKbh2vRNFaTMzcqmLIPrZxaCtwo/1TMSuWoM1Xz3WiTo9KdgI3kRqYzJGg==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + } + } +} diff --git a/government/package.json b/government/package.json new file mode 100644 index 0000000..405ddcb --- /dev/null +++ b/government/package.json @@ -0,0 +1,19 @@ +{ + "name": "gov-scraper", + "version": "0.1.0", + "private": true, + "type": "module", + "description": "정부지원사업 공고 수집 데몬 (Open API 우선 + HTML 보조)", + 
"scripts": {
+    "daemon": "node src/daemon.js",
+    "run-once": "node src/cli.js run-once",
+    "test-db": "node src/cli.js test-db",
+    "test-crawl": "node src/cli.js test-crawl"
+  },
+  "dependencies": {
+    "cheerio": "^1.0.0",
+    "dotenv": "^16.4.5",
+    "oracledb": "^6.5.1",
+    "playwright-core": "^1.49.0"
+  }
+}
diff --git a/government/src/bootstrap.js b/government/src/bootstrap.js
new file mode 100644
index 0000000..c432fb2
--- /dev/null
+++ b/government/src/bootstrap.js
@@ -0,0 +1,19 @@
+// The Oracle Instant Client (thick mode) locates dependent libraries such as libnnz through LD_LIBRARY_PATH.
+// LD_LIBRARY_PATH is only read at process start, so when it is missing we re-exec once with the same Node flags and arguments.
+// Entry points (daemon.js, cli.js) must import this module first, at the very top.
+import { spawnSync } from 'node:child_process';
+
+const IC = process.env.ORACLE_IC_LIB_DIR || '/home/opc/oracle-ic/instantclient_23_26';
+const current = (process.env.LD_LIBRARY_PATH || '').split(':').filter(Boolean);
+
+if (!current.includes(IC)) { // the child already has IC on its path, so it does not re-exec again
+  const env = {
+    ...process.env,
+    LD_LIBRARY_PATH: [IC, ...current].join(':'),
+  };
+  const result = spawnSync(process.execPath, [...process.execArgv, ...process.argv.slice(1)], {
+    stdio: 'inherit',
+    env,
+  });
+  process.exit(result.status ?? 1); // status is null when the child could not be spawned or was killed by a signal
+}
diff --git a/government/src/cli.js b/government/src/cli.js
new file mode 100644
index 0000000..bcc7dfe
--- /dev/null
+++ b/government/src/cli.js
@@ -0,0 +1,57 @@
+// Manual-run CLI.
+//   node src/cli.js test-db                  check DB connectivity
+//   node src/cli.js test-crawl URL           standalone test of the 3-stage crawler
+//   node src/cli.js run-once [sourceCode]    collect once (all sources when the code is omitted)
+import './bootstrap.js'; // LD_LIBRARY_PATH fix-up (must come first)
+import { log } from './logger.js';
+import { withConnection, closePool } from './db.js';
+import { crawl } from './crawler/crawler.js';
+import { disconnectBrowser } from './crawler/browser.js';
+import { availableSources, sourceByCode } from './sources/registry.js';
+import { runAll } from './pipeline.js';
+
+async function cleanup() { // release the CDP connection, then the Oracle pool
+  await disconnectBrowser();
+  await closePool();
+}
+
+async function main() { // dispatch on argv: [command, optional argument]
+  const [cmd, arg] = process.argv.slice(2);
+  switch (cmd) {
+    case 'test-db': {
+      const r = await withConnection((c) =>
+        c.execute('SELECT COUNT(*) FROM gov_source')
+      );
+      log.info('DB OK, gov_source 행수 =', r.rows[0][0]);
+      break;
+    }
+    case 'test-crawl': {
+      if (!arg) throw new Error('사용법: test-crawl URL'); // name the missing argument in the usage text
+      const text = await crawl(arg);
+      log.info(`크롤 결과 ${text.length}자:\n${text.slice(0, 500)}`);
+      break;
+    }
+    case 'run-once': {
+      const sources = arg
+        ? [sourceByCode(arg)].filter(Boolean)
+        : availableSources();
+      if (sources.length === 0) {
+        throw new Error(arg ? `소스 없음/비활성: ${arg}` : '가용 소스 없음');
+      }
+      const results = await runAll(sources);
+      log.info('수집 결과:', JSON.stringify(results));
+      break;
+    }
+    default:
+      log.info('사용법: test-db | test-crawl URL | run-once [sourceCode]');
+  }
+}
+
+main()
+  .then(cleanup)
+  .then(() => process.exit(0))
+  .catch(async (e) => {
+    log.error('CLI 오류:', e.stack || e.message);
+    await cleanup();
+    process.exit(1);
+  });
diff --git a/government/src/config.js b/government/src/config.js
new file mode 100644
index 0000000..4f3b1bd
--- /dev/null
+++ b/government/src/config.js
@@ -0,0 +1,43 @@
+// Configuration loader. Reads the project-root .env and exposes it as daemon-wide settings.
+import dotenv from 'dotenv';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const ROOT = path.resolve(__dirname, '..', '..'); // /home/opc/sundol
+
+dotenv.config({ path: path.join(ROOT, '.env') });
+
+function required(name) { // returns the env value, or throws when it is unset or empty
+  const v = process.env[name];
+  if (v === undefined || v === null || v === '') {
+    throw new Error(`필수 환경변수 누락: ${name}`);
+  }
+  return v;
+}
+
+export const config = {
+  root: ROOT,
+  oracle: {
+    user: required('ORACLE_USERNAME'),
+    password: required('ORACLE_PASSWORD'),
+    connectString: required('ORACLE_TNS_NAME'),
+    walletPath: required('ORACLE_WALLET_PATH'),
+    // thick mode: Instant Client library dir + net-config dir used to read the sso wallet
+    icLibDir: process.env.ORACLE_IC_LIB_DIR || '/home/opc/oracle-ic/instantclient_23_26',
+    netConfigDir:
+      process.env.ORACLE_NET_CONFIG_DIR ||
+      path.join(ROOT, 'government', 'oracle-net'),
+  },
+  dataGoKr: {
+    apiKey: process.env.DATA_GO_KR_API_KEY || '',
+  },
+  bizinfo: {
+    crtfcKey: process.env.BIZINFO_CRTFC_KEY || '',
+  },
+  jina: {
+    apiKey: process.env.JINA_READER_API_KEY || '',
+  },
+  cdpUrl: process.env.GOV_CDP_URL || 'http://127.0.0.1:9222',
+  pollIntervalMinutes: Number(process.env.GOV_POLL_INTERVAL_MINUTES) || 60, // unset, 0 or non-numeric (NaN) → default 60
+};
diff --git a/government/src/crawler/browser.js b/government/src/crawler/browser.js
new file mode 100644
index 0000000..e4ec8b5
--- /dev/null
+++ b/government/src/crawler/browser.js
@@ -0,0 +1,68 @@
+// Singleton that connects to the existing sundol-chrome (PM2, CDP 9222) and opens new tabs.
+// It reuses the session the user logged into over VNC, which helps avoid bot detection.
+// (Same strategy as the backend PlaywrightBrowserService)
+import { chromium } from 'playwright-core';
+import { config } from '../config.js';
+import { log } from '../logger.js';
+
+let browser = null;
+
+async function ensureBrowser() { // returns the cached CDP connection, reconnecting when it has dropped
+  if (browser && browser.isConnected()) return browser;
+  if (browser) {
+    try {
+      await browser.close();
+    } catch {
+      // A failure while cleaning up a dropped connection is safe to ignore: we reconnect right away
+    }
+  }
+  log.info(`Chrome CDP 연결 시도: ${config.cdpUrl}`);
+  browser = await chromium.connectOverCDP(config.cdpUrl);
+  log.info(`CDP 연결 완료: contexts=${browser.contexts().length}`);
+  return browser;
+}
+
+function defaultContext(b) { // first existing context of the attached Chrome; throws when there is none
+  const contexts = b.contexts();
+  if (contexts.length === 0) {
+    throw new Error('Chrome 에 활성 컨텍스트가 없습니다.');
+  }
+  return contexts[0];
+}
+
+/**
+ * Opens a new tab and navigates to the URL. The caller must closePage(page) when done; on a load failure the tab is closed here and an Error (with `cause`) is thrown.
+ */
+export async function openPage(url, { timeoutMs = 30_000, waitUntil = 'networkidle' } = {}) {
+  const b = await ensureBrowser();
+  const ctx = defaultContext(b);
+  const page = await ctx.newPage();
+  try {
+    await page.goto(url, { timeout: timeoutMs, waitUntil });
+    return page;
+  } catch (e) {
+    await closePage(page);
+    throw new Error(`페이지 로드 실패 (${url}): ${e.message}`, { cause: e }); // keep the original error and stack
+  }
+}
+
+export async function closePage(page) { // best-effort tab close; failures are only logged
+  if (!page) return;
+  try {
+    await page.close();
+  } catch (e) {
+    log.warn('탭 닫기 실패:', e.message);
+  }
+}
+
+export async function disconnectBrowser() {
+  if (browser) {
+    try {
+      // Only drops the CDP connection (Chrome itself is not terminated)
+      await browser.close();
+    } catch (e) {
+      log.warn('CDP 연결 해제 실패:', e.message);
+    }
+    browser = null;
+  }
+}
diff --git a/government/src/crawler/crawler.js b/government/src/crawler/crawler.js
new file mode 100644
index 0000000..feb1dbf
--- /dev/null
+++ b/government/src/crawler/crawler.js
@@ -0,0 +1,117 @@
+// 3-stage fallback crawler (Node re-implementation of sundol WebCrawlerService)
+//   Stage 1: static fetch + cheerio body extraction
+//   Stage 2: Jina Reader (r.jina.ai)
+//   Stage 3: Playwright (sundol-chrome CDP): real rendering, then innerText
+// Facade: callers only use crawl(url).
+import * as cheerio from 'cheerio';
+import { config } from '../config.js';
+import { log } from '../logger.js';
+import { openPage, closePage } from './browser.js';
+
+const JINA_BASE = 'https://r.jina.ai/';
+const MIN_CONTENT_LENGTH = 100;
+const UA =
+  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
+const ERROR_PATTERNS = [
+  'access denied', '403 forbidden', "you don't have permission",
+  'error 403', 'error 401', 'unauthorized', 'captcha',
+  'please enable javascript', 'checking your browser',
+  'attention required', 'just a moment',
+  'technical difficulty', 'page not found', '404 not found',
+];
+const REMOVE_SELECTORS = 'nav, footer, header, script, style, .ad, #cookie-banner, .sidebar, .comments';
+const ARTICLE_SELECTORS = 'article, main, .post-content, .article-body, .entry-content';
+
+function isValidContent(text) { // false when shorter than MIN_CONTENT_LENGTH or the first 500 chars contain an error-page pattern
+  if (!text || text.length < MIN_CONTENT_LENGTH) return false;
+  const preview = text.slice(0, 500).toLowerCase();
+  for (const pattern of ERROR_PATTERNS) {
+    if (preview.includes(pattern)) {
+      log.warn(`에러 페이지 패턴 감지: '${pattern}'`);
+      return false;
+    }
+  }
+  return true;
+}
+
+async function crawlWithCheerio(url) { // stage 1: plain HTTP fetch, text extracted with cheerio
+  log.info(`정적 크롤링(cheerio): ${url}`);
+  const res = await fetch(url, {
+    headers: { 'User-Agent': UA },
+    redirect: 'follow',
+    signal: AbortSignal.timeout(15_000),
+  });
+  if (!res.ok) throw new Error(`HTTP ${res.status}`);
+  const html = Buffer.from(await res.arrayBuffer()); // raw bytes: res.text() always decodes as UTF-8 and garbles EUC-KR pages
+  const $ = cheerio.loadBuffer(html, { encoding: { defaultEncoding: 'utf8' } }); // sniffs BOM / meta charset; UTF-8 when undeclared
+  $(REMOVE_SELECTORS).remove();
+  const article = $(ARTICLE_SELECTORS).first();
+  const text = (article.length ? article : $('body')).text().replace(/\s+\n/g, '\n').trim();
+  log.info(`cheerio 추출: ${text.length} chars`);
+  return text;
+}
+
+async function crawlWithJina(url) { // stage 2: fetch the page as plain text through Jina Reader
+  log.info(`Jina Reader 크롤링: ${url}`);
+  const headers = { Accept: 'text/plain' };
+  if (config.jina.apiKey) headers.Authorization = `Bearer ${config.jina.apiKey}`;
+  const res = await fetch(JINA_BASE + url, {
+    headers,
+    signal: AbortSignal.timeout(30_000),
+  });
+  if (!res.ok) throw new Error(`Jina HTTP ${res.status}`);
+  const text = await res.text();
+  if (!text || !text.trim()) throw new Error('Jina Reader 빈 응답');
+  log.info(`Jina 추출: ${text.length} chars`);
+  return text;
+}
+
+async function crawlWithPlaywright(url) { // stage 3: render in the attached Chrome and read innerText
+  log.info(`Playwright 크롤링: ${url}`);
+  const page = await openPage(url);
+  try {
+    const text = await page.evaluate(
+      ({ removeSel, articleSel }) => {
+        removeSel.split(',').forEach((sel) =>
+          document.querySelectorAll(sel.trim()).forEach((el) => el.remove())
+        );
+        const article = document.querySelector(articleSel);
+        return (article || document.body).innerText;
+      },
+      { removeSel: REMOVE_SELECTORS, articleSel: ARTICLE_SELECTORS }
+    );
+    if (!text || !text.trim()) throw new Error('Playwright 빈 본문');
+    log.info(`Playwright 추출: ${text.length} chars`);
+    return text;
+  } finally {
+    await closePage(page); // always release the tab, even when extraction throws
+  }
+}
+
+/**
+ * Collects the body text through the 3-stage fallback. Throws when every stage fails.
+ */
+export async function crawl(url) {
+  // Stage 1
+  try {
+    const text = await crawlWithCheerio(url);
+    if (isValidContent(text)) return text;
+    log.warn(`cheerio 무효 콘텐츠(${text?.length || 0}자) → Jina 폴백`);
+  } catch (e) {
+    log.warn(`cheerio 실패(${url}): ${e.message} → Jina 폴백`);
+  }
+  // Stage 2
+  try {
+    const text = await crawlWithJina(url);
+    if (isValidContent(text)) return text;
+    log.warn(`Jina 무효 콘텐츠(${text?.length || 0}자) → Playwright 폴백`);
+  } catch (e) {
+    log.warn(`Jina 실패(${url}): ${e.message} → Playwright 폴백`);
+  }
+  // Stage 3
+  const text = await crawlWithPlaywright(url);
+  if (!isValidContent(text)) {
+    throw new Error(`모든 크롤링 방법 실패: ${url}`);
+  }
+  return text;
+}
diff --git a/government/src/daemon.js b/government/src/daemon.js
new file mode 100644
index 0000000..2ea495a
--- /dev/null
+++ b/government/src/daemon.js
@@ -0,0 +1,62 @@
+// Government-support-program collection daemon. Periodically runs one collection pass over all available sources.
+// Kept running by PM2: pm2 start ecosystem.config.cjs --only gov-daemon
+import './bootstrap.js'; // LD_LIBRARY_PATH fix-up (must come first)
+import { config } from './config.js';
+import { log } from './logger.js';
+import { closePool } from './db.js';
+import { disconnectBrowser } from './crawler/browser.js';
+import { availableSources } from './sources/registry.js';
+import { runAll } from './pipeline.js';
+
+let stopping = false;
+
+function sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+async function cycle() { // one collection pass over every available source
+  const sources = availableSources();
+  if (sources.length === 0) {
+    log.warn('가용 소스가 없습니다. (서비스키/설정 확인)');
+    return;
+  }
+  log.info(`수집 사이클 시작: 소스 ${sources.length}개 [${sources.map((s) => s.code).join(', ')}]`);
+  const results = await runAll(sources);
+  log.info('수집 사이클 종료:', JSON.stringify(results));
+}
+
+async function shutdown(signal) { // idempotent: releases browser and pool, then exits the process
+  if (stopping) return;
+  stopping = true;
+  log.info(`${signal} 수신 — 데몬 종료 중`);
+  try {
+    await disconnectBrowser();
+    await closePool();
+  } catch (e) {
+    log.warn('종료 정리 중 오류:', e.message);
+  }
+  process.exit(signal === 'FATAL' ? 1 : 0); // 'FATAL' comes from main().catch: exit non-zero so the crash is not reported as a clean stop
+}
+
+process.on('SIGINT', () => shutdown('SIGINT'));
+process.on('SIGTERM', () => shutdown('SIGTERM'));
+
+async function main() {
+  const intervalMs = Math.max(1, config.pollIntervalMinutes || 60) * 60_000; // NaN/0 → 60 min; a NaN delay would make setTimeout fire almost immediately
+  log.info(`gov-daemon 시작. 폴링 주기 ${config.pollIntervalMinutes}분.`);
+  while (!stopping) {
+    try {
+      await cycle();
+    } catch (e) {
+      log.error('수집 사이클 오류:', e.stack || e.message); // a failed cycle must not stop the daemon
+    }
+    if (stopping) break;
+    log.info(`다음 사이클까지 ${config.pollIntervalMinutes}분 대기`);
+    await sleep(intervalMs);
+  }
+}
+
+main().catch(async (e) => {
+  log.error('데몬 치명적 오류:', e.stack || e.message);
+  await shutdown('FATAL');
+});
diff --git a/government/src/db.js b/government/src/db.js
new file mode 100644
index 0000000..43354ca
--- /dev/null
+++ b/government/src/db.js
@@ -0,0 +1,60 @@
+// Oracle Autonomous DB connection (node-oracledb thick mode).
+// Uses Instant Client + the sso wallet (cwallet.sso), so no wallet password is needed.
+// (Reuses the auto-login wallet, the same way the backend JDBC connection does.)
+import oracledb from 'oracledb';
+import { config } from './config.js';
+import { log } from './logger.js';
+
+// Driver-wide settings: fetch CLOB columns as plain strings, and never commit
+// implicitly (autoCommit is off, so writers must call conn.commit() themselves).
+oracledb.fetchAsString = [oracledb.CLOB];
+oracledb.autoCommit = false;
+
+// Module-level singletons: the shared connection pool and a flag that guards
+// the one-time Oracle client initialization.
+let pool = null;
+let clientInitialized = false;
+
+/**
+ * Initializes the Oracle client libraries once per process; later calls are no-ops.
+ * The library directory and network-config directory are read from config.oracle.
+ */
+function initClient() {
+  if (clientInitialized) return;
+  oracledb.initOracleClient({
+    libDir: config.oracle.icLibDir,
+    configDir: config.oracle.netConfigDir, // tnsnames.ora and a sqlnet.ora that adjusts WALLET_LOCATION
+  });
+  clientInitialized = true;
+}
+
+/**
+ * Creates the connection pool on first call and returns it; subsequent calls
+ * return the already-created pool.
+ * @returns {Promise<object>} the shared pool created by oracledb.createPool
+ */
+export async function initPool() {
+  if (pool) return pool;
+  initClient();
+  pool = await oracledb.createPool({
+    user: config.oracle.user,
+    password: config.oracle.password,
+    connectString: config.oracle.connectString,
+    poolMin: 1,
+    poolMax: 4,
+    poolIncrement: 1,
+  });
+  log.info('Oracle 풀 생성 완료');
+  return pool;
+}
+
+/**
+ * Borrows a connection from the pool (creating the pool if it does not exist
+ * yet), runs `fn` with it, and always closes the connection afterwards.
+ * @param {Function} fn - async callback that receives the connection
+ * @returns {Promise<*>} whatever `fn` resolves to; errors thrown by `fn` propagate
+ */
+export async function withConnection(fn) {
+  if (!pool) await initPool();
+  const conn = await pool.getConnection();
+  try {
+    return await fn(conn);
+  } finally {
+    // A failure while closing the connection is only logged, so it cannot
+    // replace the result (or the error) produced by `fn`.
+    try {
+      await conn.close();
+    } catch (e) {
+      log.warn('연결 반환 실패:', e.message);
+    }
+  }
+}
+
+/**
+ * Closes the pool if one exists and clears the module-level reference, so a
+ * later initPool() call creates a fresh pool. Does nothing when no pool exists.
+ */
+export async function closePool() {
+  if (pool) {
+    // NOTE(review): 10 is presumably the drain time in seconds — confirm against the node-oracledb docs.
+    await pool.close(10);
+    pool = null;
+    log.info('Oracle 풀 종료');
+  }
+}
+
+// Re-exported so callers can reach driver constants through this module.
+export { oracledb };
diff --git a/government/src/logger.js b/government/src/logger.js
new file mode 100644
index 0000000..d0f699f
--- /dev/null
+++ b/government/src/logger.js
@@ -0,0 +1,10 @@
+// Minimal timestamped logger. Output goes straight through to the PM2 logs.
+function ts() {
+  return new Date().toISOString();
+}
+
+// Each method prefixes an ISO-8601 timestamp and a level tag, then forwards
+// all arguments to the matching console method.
+export const log = {
+  info: (...args) => console.log(`[${ts()}] [INFO]`, ...args),
+  warn: (...args) => console.warn(`[${ts()}] [WARN]`, ...args),
+  error: (...args) => console.error(`[${ts()}] [ERROR]`, ...args),
+};
diff --git a/government/src/pipeline.js b/government/src/pipeline.js
new file mode 100644
index 0000000..ee72fd9
--- /dev/null
+++ b/government/src/pipeline.js
@@ -0,0 +1,91 @@
+// Collection pipeline: for each source, collect the list → store it → collect the detail bodies.
+import { + ensureSource, + upsertOpportunities, + findPendingDetail, + saveDetail, + markDetailError, + markSourceCrawled, +} from './store/opportunityStore.js'; +import { log } from './logger.js'; + +const DETAIL_BATCH = Number(process.env.GOV_DETAIL_BATCH || 200); // 한 사이클에 상세 수집할 최대 건수(소스당) +const DETAIL_DELAY_MS = 300; // 상세 수집 간 간격(서버 부담 완화) + +function sleep(ms) { + return new Promise((r) => setTimeout(r, ms)); +} + +/** + * 단일 소스 1회 수집. + */ +export async function runSource(source) { + const startedAt = Date.now(); + log.info(`==== 소스 수집 시작: ${source.code} (${source.name}) ====`); + + // 1) 소스 등록/갱신 + const { id: sourceId, active } = await ensureSource(source.meta()); + if (!active) { + log.warn(`소스 비활성 상태(DB active=0): ${source.code} — 건너뜀`); + return { source: source.code, skipped: true }; + } + + // 2) 목록 수집 → 적재 + const items = await source.list(); + log.info(`${source.code}: 목록 ${items.length}건 수집`); + const upsert = await upsertOpportunities(sourceId, source.code, items); + log.info( + `${source.code}: 적재 처리=${upsert.processed} 신규=${upsert.inserted} 갱신=${upsert.updated}` + ); + + // 3) 상세 본문 수집 (LISTED 상태만) + const pending = await findPendingDetail(source.code, DETAIL_BATCH); + log.info(`${source.code}: 상세 수집 대상 ${pending.length}건`); + let detailOk = 0; + let detailErr = 0; + for (const row of pending) { + try { + const body = await source.fetchDetail(row); + if (!body || !body.trim()) { + throw new Error('빈 본문'); + } + await saveDetail(row.id, body); + detailOk += 1; + } catch (e) { + log.warn(`${source.code}/${row.externalId} 상세 실패: ${e.message}`); + await markDetailError(row.id); + detailErr += 1; + } + if (DETAIL_DELAY_MS > 0) await sleep(DETAIL_DELAY_MS); + } + + await markSourceCrawled(sourceId); + const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1); + log.info( + `==== 소스 완료: ${source.code} | 신규 ${upsert.inserted} 갱신 ${upsert.updated} | 상세 OK ${detailOk} 실패 ${detailErr} | ${elapsed}s ====` + ); + return { + source: source.code, + 
listed: items.length, + inserted: upsert.inserted, + updated: upsert.updated, + detailOk, + detailErr, + }; +} + +/** + * 모든 가용 소스 1회 수집. + */ +export async function runAll(sources) { + const results = []; + for (const source of sources) { + try { + results.push(await runSource(source)); + } catch (e) { + log.error(`소스 ${source.code} 수집 중 오류: ${e.message}`); + results.push({ source: source.code, error: e.message }); + } + } + return results; +} diff --git a/government/src/sources/base.js b/government/src/sources/base.js new file mode 100644 index 0000000..e2d3ac5 --- /dev/null +++ b/government/src/sources/base.js @@ -0,0 +1,62 @@ +// OpportunitySource — Strategy 인터페이스. +// 소스(사이트)별 어댑터는 이 클래스를 상속해 list()/fetchDetail() 을 구현한다. +import { crawl } from '../crawler/crawler.js'; + +/** + * 공고 목록 항목 형태: + * { + * externalId: string, // 소스 고유 키 (필수, dedup) + * title: string, // 제목 (필수) + * agency?: string, // 소관/주관기관 + * category?: string, // 지원분야 + * target?: string, // 지원대상 + * applyStart?: Date, // 접수 시작 + * applyEnd?: Date, // 접수 마감 + * detailUrl?: string, // 상세 페이지 URL + * raw?: object, // 원본 데이터(JSON 저장) + * } + */ +export class OpportunitySource { + /** @param {{code:string,name:string,baseUrl?:string,type:'API'|'HTML',config?:object}} meta */ + constructor(meta) { + if (!meta.code || !meta.name || !meta.type) { + throw new Error('OpportunitySource meta 에 code/name/type 필수'); + } + this.code = meta.code; + this.name = meta.name; + this.baseUrl = meta.baseUrl || null; + this.type = meta.type; + this.config = meta.config || {}; + } + + meta() { + return { + code: this.code, + name: this.name, + baseUrl: this.baseUrl, + type: this.type, + config: this.config, + }; + } + + /** + * 공고 목록을 수집한다. 하위 클래스에서 반드시 구현. + * @returns {Promise} + */ + async list() { + throw new Error(`${this.code}: list() 미구현`); + } + + /** + * 상세 본문을 수집한다. 기본 구현은 detailUrl 을 3단계 폴백 크롤러로 긁는다. + * API 처럼 본문이 이미 raw 에 있는 소스는 이 메서드를 오버라이드한다. 
+ * @param {{id:string, externalId:string, detailUrl:string, raw:object|null}} row + * @returns {Promise} 본문 텍스트 + */ + async fetchDetail(row) { + if (!row.detailUrl) { + throw new Error(`${this.code}/${row.externalId}: detailUrl 없음 — 상세 수집 불가`); + } + return crawl(row.detailUrl); + } +} diff --git a/government/src/sources/genericHtml.js b/government/src/sources/genericHtml.js new file mode 100644 index 0000000..44cfba0 --- /dev/null +++ b/government/src/sources/genericHtml.js @@ -0,0 +1,126 @@ +// GenericHtmlSource — 표(table) 기반 게시판형 공고 목록을 config 로 수집하는 범용 HTML 어댑터. +// 새 HTML 사이트는 코드 수정 없이 config 만 바꿔 추가할 수 있다(Strategy + 설정 주입). +// +// config 예시: +// { +// listUrl: 'https://.../List.do?cbIdx=310', +// pageParam: 'pageIndex', // 페이지 쿼리 파라미터 (없으면 단일 페이지) +// maxPages: 5, +// rowSelector: 'table tbody tr', +// title: { selector: 'td.subject a', attr: 'title' }, // attr 생략 시 text() +// externalId: { from: 'onclick', regex: "doBbsFView\\('\\d+','(\\d+)'" }, +// detailUrl: { template: 'https://.../View.do?cbIdx=310&bcIdx={id}&parentSeq={id}' }, +// agency: '중소벤처기업부', // 정적 소관기관(선택) +// } +import * as cheerio from 'cheerio'; +import { OpportunitySource } from './base.js'; +import { log } from '../logger.js'; +import { decodeEntities, nonEmpty } from '../util.js'; + +const UA = + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'; + +export class GenericHtmlSource extends OpportunitySource { + constructor(meta) { + super({ ...meta, type: 'HTML' }); + const c = this.config; + if (!c.listUrl || !c.rowSelector || !c.externalId || !c.detailUrl) { + throw new Error( + `${this.code}: config 에 listUrl/rowSelector/externalId/detailUrl 필수` + ); + } + } + + #pageUrl(page) { + const c = this.config; + if (!c.pageParam) return c.listUrl; + const url = new URL(c.listUrl); + url.searchParams.set(c.pageParam, String(page)); + return url.toString(); + } + + async #fetchHtml(url) { + const res = await fetch(url, { + 
headers: { 'User-Agent': UA }, + redirect: 'follow', + signal: AbortSignal.timeout(20_000), + }); + if (!res.ok) throw new Error(`HTTP ${res.status} (${url})`); + return res.text(); + } + + #extractField($, row, spec) { + if (!spec) return null; + let el = spec.selector ? row.find(spec.selector).first() : row; + if (el.length === 0) return null; + let val; + if (spec.attr) val = el.attr(spec.attr); + else val = el.text(); + return decodeEntities(nonEmpty(val)); + } + + #extractByRegex(text, pattern) { + if (!text || !pattern) return null; + const m = new RegExp(pattern).exec(text); + return m ? m[1] : null; + } + + #mapRow($, el) { + const c = this.config; + const row = $(el); + + // externalId: onclick / href / 선택자 텍스트에서 정규식 추출 + let idSource; + if (c.externalId.from === 'onclick') idSource = row.attr('onclick') || row.find('[onclick]').first().attr('onclick'); + else if (c.externalId.from === 'href') idSource = row.find('a').first().attr('href'); + else idSource = this.#extractField($, row, c.externalId); + const externalId = c.externalId.regex + ? this.#extractByRegex(idSource, c.externalId.regex) + : nonEmpty(idSource); + if (!externalId) return null; // 헤더행 등은 스킵 + + const title = this.#extractField($, row, c.title); + if (!title) return null; + + const detailUrl = c.detailUrl.template + ? 
c.detailUrl.template.replace(/\{id\}/g, externalId) + : this.#extractField($, row, c.detailUrl); + + return { + externalId, + title, + agency: c.agency || this.#extractField($, row, c.agencyField) || null, + category: this.#extractField($, row, c.categoryField), + target: null, + applyStart: null, + applyEnd: null, + detailUrl, + raw: { onclick: row.attr('onclick') || null, title }, + }; + } + + async list() { + const c = this.config; + const maxPages = c.maxPages || 1; + const out = []; + const seen = new Set(); + for (let page = 1; page <= maxPages; page += 1) { + const url = this.#pageUrl(page); + const html = await this.#fetchHtml(url); + const $ = cheerio.load(html); + const rows = $(c.rowSelector); + let pageCount = 0; + rows.each((_, el) => { + const item = this.#mapRow($, el); + if (item && !seen.has(item.externalId)) { + seen.add(item.externalId); + out.push(item); + pageCount += 1; + } + }); + log.info(`${this.code} page ${page}: ${pageCount}건`); + if (pageCount === 0) break; // 더 이상 행이 없으면 종료 + } + return out; + } +} diff --git a/government/src/sources/htmlSources.js b/government/src/sources/htmlSources.js new file mode 100644 index 0000000..4c3bf65 --- /dev/null +++ b/government/src/sources/htmlSources.js @@ -0,0 +1,28 @@ +// config 로 정의되는 HTML 게시판 소스 목록. +// 새 사이트는 여기 항목을 추가하면 된다(코드 로직 수정 불필요). 
+import { GenericHtmlSource } from './genericHtml.js'; + +export const HTML_SOURCE_CONFIGS = [ + { + code: 'mss', + name: '중소벤처기업부 사업공고', + baseUrl: 'https://www.mss.go.kr', + config: { + listUrl: 'https://www.mss.go.kr/site/smba/ex/bbs/List.do?cbIdx=310', + pageParam: 'pageIndex', + maxPages: 3, + rowSelector: 'table tbody tr', + title: { selector: 'td.subject a', attr: 'title' }, + externalId: { from: 'onclick', regex: "doBbsFView\\('\\d+','(\\d+)'" }, + detailUrl: { + template: + 'https://www.mss.go.kr/site/smba/ex/bbs/View.do?cbIdx=310&bcIdx={id}&parentSeq={id}', + }, + agency: '중소벤처기업부', + }, + }, +]; + +export function buildHtmlSources() { + return HTML_SOURCE_CONFIGS.map((cfg) => new GenericHtmlSource(cfg)); +} diff --git a/government/src/sources/kstartup.js b/government/src/sources/kstartup.js new file mode 100644 index 0000000..b47c179 --- /dev/null +++ b/government/src/sources/kstartup.js @@ -0,0 +1,92 @@ +// K-Startup 창업지원 사업공고 Open API 어댑터 (data.go.kr 서비스키 사용). +// 엔드포인트: getAnnouncementInformation (페이지네이션) +import { OpportunitySource } from './base.js'; +import { config } from '../config.js'; +import { log } from '../logger.js'; +import { decodeEntities, parseYmd, nonEmpty } from '../util.js'; + +const ENDPOINT = + 'https://nidapi.k-startup.go.kr/api/kisedKstartupService/v1/getAnnouncementInformation'; +const PER_PAGE = 100; +const MAX_PAGES = Number(process.env.GOV_KSTARTUP_MAX_PAGES || 400); // 안전 상한(약 4만건) + +export class KStartupApiSource extends OpportunitySource { + constructor() { + super({ + code: 'kstartup', + name: 'K-Startup 창업지원 공고', + baseUrl: 'https://www.k-startup.go.kr', + type: 'API', + config: { endpoint: ENDPOINT, perPage: PER_PAGE }, + }); + } + + static isAvailable() { + return Boolean(config.dataGoKr.apiKey); + } + + async #fetchPage(page) { + const url = new URL(ENDPOINT); + url.searchParams.set('serviceKey', config.dataGoKr.apiKey); + url.searchParams.set('page', String(page)); + url.searchParams.set('perPage', 
String(PER_PAGE)); + url.searchParams.set('returnType', 'json'); + const res = await fetch(url, { signal: AbortSignal.timeout(30_000) }); + if (!res.ok) { + throw new Error(`K-Startup API HTTP ${res.status}: ${(await res.text()).slice(0, 200)}`); + } + const json = await res.json(); + if (!Array.isArray(json.data)) { + throw new Error(`K-Startup API 응답 형식 오류: ${JSON.stringify(json).slice(0, 200)}`); + } + return json; + } + + // API 가 제공하는 필드들로 본문을 조립한다 (별도 상세 크롤링 불필요). + #buildBody(item) { + const parts = []; + const content = decodeEntities(nonEmpty(item.pbanc_ctnt)); + if (content) parts.push(content); + const target = decodeEntities(nonEmpty(item.aply_trgt_ctnt)); + if (target) parts.push(`[지원대상]\n${target}`); + const exclude = decodeEntities(nonEmpty(item.aply_excl_trgt_ctnt)); + if (exclude) parts.push(`[제외대상]\n${exclude}`); + const online = nonEmpty(item.aply_mthd_onli_rcpt_istc); + if (online) parts.push(`[온라인 접수]\n${online}`); + const guide = nonEmpty(item.biz_gdnc_url); + if (guide) parts.push(`[안내 URL]\n${guide}`); + return parts.join('\n\n'); + } + + #map(item) { + const externalId = item.pbanc_sn != null ? 
String(item.pbanc_sn) : null; + const title = decodeEntities(item.biz_pbanc_nm); + if (!externalId || !title) { + throw new Error(`K-Startup 항목 필수필드 누락: ${JSON.stringify(item).slice(0, 200)}`); + } + return { + externalId, + title, + agency: decodeEntities(nonEmpty(item.pbanc_ntrp_nm) || nonEmpty(item.sprv_inst)), + category: decodeEntities(nonEmpty(item.supt_biz_clsfc)), + target: decodeEntities(nonEmpty(item.aply_trgt_ctnt) || nonEmpty(item.aply_trgt)), + applyStart: parseYmd(item.pbanc_rcpt_bgng_dt), + applyEnd: parseYmd(item.pbanc_rcpt_end_dt), + detailUrl: nonEmpty(item.detl_pg_url), + body: this.#buildBody(item), // 목록 단계에서 본문까지 적재 + raw: item, + }; + } + + async list() { + const out = []; + for (let page = 1; page <= MAX_PAGES; page += 1) { + const json = await this.#fetchPage(page); + const rows = json.data; + log.info(`K-Startup page ${page}: ${rows.length}건 (totalCount=${json.totalCount})`); + for (const item of rows) out.push(this.#map(item)); + if (rows.length < PER_PAGE) break; // 마지막 페이지 + } + return out; + } +} diff --git a/government/src/sources/registry.js b/government/src/sources/registry.js new file mode 100644 index 0000000..76305fd --- /dev/null +++ b/government/src/sources/registry.js @@ -0,0 +1,32 @@ +// 사용 가능한 소스 어댑터 레지스트리. +// 키(서비스키 등)가 없는 소스는 자동 제외한다. +import { KStartupApiSource } from './kstartup.js'; +import { buildHtmlSources } from './htmlSources.js'; +import { log } from '../logger.js'; + +// 키/설정 가용성 검사가 있는 API 소스 클래스들 +const API_SOURCE_CLASSES = [KStartupApiSource]; + +/** + * 현재 환경에서 사용 가능한 소스 인스턴스 목록. + */ +export function availableSources() { + const out = []; + for (const Cls of API_SOURCE_CLASSES) { + if (typeof Cls.isAvailable === 'function' && !Cls.isAvailable()) { + log.warn(`소스 비활성(키/설정 없음): ${Cls.name}`); + continue; + } + out.push(new Cls()); + } + // config 기반 HTML 소스(항상 가용) + out.push(...buildHtmlSources()); + return out; +} + +/** + * 특정 code 의 소스 하나만 가져온다(수동 실행용). 없으면 null. 
+ */ +export function sourceByCode(code) { + return availableSources().find((s) => s.code === code) || null; +} diff --git a/government/src/store/opportunityStore.js b/government/src/store/opportunityStore.js new file mode 100644 index 0000000..0d5cfeb --- /dev/null +++ b/government/src/store/opportunityStore.js @@ -0,0 +1,197 @@ +// gov_source / gov_opportunity 적재 로직. 중복 제거는 (source_code, external_id) 유니크 키로 한다. +import { withConnection, oracledb } from '../db.js'; +import { log } from '../logger.js'; + +function clobBind(val) { + return { dir: oracledb.BIND_IN, type: oracledb.DB_TYPE_CLOB, val: val ?? null }; +} + +/** + * 소스를 upsert 하고 RAWTOHEX(id) 를 반환한다. + */ +export async function ensureSource({ code, name, baseUrl, type, config }) { + return withConnection(async (conn) => { + await conn.execute( + `MERGE INTO gov_source t + USING (SELECT :code AS code FROM dual) s + ON (t.code = s.code) + WHEN MATCHED THEN UPDATE SET + name = :name, base_url = :baseUrl, type = :type, + config = :config, updated_at = SYSTIMESTAMP + WHEN NOT MATCHED THEN INSERT (id, code, name, base_url, type, config, active, created_at, updated_at) + VALUES (SYS_GUID(), :code, :name, :baseUrl, :type, :config, 1, SYSTIMESTAMP, SYSTIMESTAMP)`, + { + code, + name, + baseUrl: baseUrl ?? null, + type, + config: clobBind(config ? JSON.stringify(config) : null), + } + ); + await conn.commit(); + const r = await conn.execute( + `SELECT RAWTOHEX(id) AS id, active FROM gov_source WHERE code = :code`, + { code } + ); + return { id: r.rows[0][0], active: r.rows[0][1] === 1 }; + }); +} + +/** + * 활성 소스 목록을 반환한다. 
+ */ +export async function listActiveSources() { + return withConnection(async (conn) => { + const r = await conn.execute( + `SELECT RAWTOHEX(id) AS id, code, name, base_url, type, config + FROM gov_source WHERE active = 1 ORDER BY code`, + {}, + { outFormat: oracledb.OUT_FORMAT_OBJECT } + ); + return r.rows.map((row) => ({ + id: row.ID, + code: row.CODE, + name: row.NAME, + baseUrl: row.BASE_URL, + type: row.TYPE, + config: row.CONFIG ? JSON.parse(row.CONFIG) : {}, + })); + }); +} + +/** + * 목록 단계 공고들을 dedup-merge 한다. 기존 행의 본문/상세 상태는 보존한다. + * @returns {{inserted:number, updated:number}} + */ +export async function upsertOpportunities(sourceIdHex, sourceCode, items) { + if (!items || items.length === 0) return { inserted: 0, updated: 0 }; + return withConnection(async (conn) => { + let inserted = 0; + let updated = 0; + // 신규/갱신 판별을 위해 기존 external_id 를 한 번에 로드(행당 SELECT 제거). + const existing = new Set(); + { + const r = await conn.execute( + `SELECT external_id FROM gov_opportunity WHERE source_code = :sc`, + { sc: sourceCode } + ); + for (const row of r.rows) existing.add(String(row[0])); + } + for (const it of items) { + if (!it.externalId || !it.title) { + throw new Error( + `필수 필드 누락 (externalId/title): ${JSON.stringify(it).slice(0, 200)}` + ); + } + const isNew = !existing.has(String(it.externalId)); + const hasBody = it.body && it.body.trim() ? 1 : 0; + // body 가 있으면(API 처럼) 목록 단계에서 바로 본문 저장 → 상태 DETAILED. + // 기존 행 갱신 시 body 가 없으면 기존 본문/상태를 보존한다. 
+ await conn.execute( + `MERGE INTO gov_opportunity t + USING (SELECT :sourceCode AS source_code, :externalId AS external_id FROM dual) s + ON (t.source_code = s.source_code AND t.external_id = s.external_id) + WHEN MATCHED THEN UPDATE SET + title = :title, agency = :agency, category = :category, target = :target, + apply_start = :applyStart, apply_end = :applyEnd, detail_url = :detailUrl, + raw_json = :rawJson, + body_text = CASE WHEN :hasBody = 1 THEN :body ELSE body_text END, + status = CASE WHEN :hasBody = 1 THEN 'DETAILED' ELSE status END, + detail_collected_at = CASE WHEN :hasBody = 1 THEN SYSTIMESTAMP ELSE detail_collected_at END, + updated_at = SYSTIMESTAMP + WHEN NOT MATCHED THEN INSERT + (id, source_id, source_code, external_id, title, agency, category, target, + apply_start, apply_end, detail_url, raw_json, body_text, status, + list_collected_at, detail_collected_at, created_at, updated_at) + VALUES (SYS_GUID(), HEXTORAW(:sourceId), :sourceCode, :externalId, :title, :agency, + :category, :target, :applyStart, :applyEnd, :detailUrl, :rawJson, :body, + CASE WHEN :hasBody = 1 THEN 'DETAILED' ELSE 'LISTED' END, + SYSTIMESTAMP, CASE WHEN :hasBody = 1 THEN SYSTIMESTAMP ELSE NULL END, + SYSTIMESTAMP, SYSTIMESTAMP)`, + { + sourceId: sourceIdHex, + sourceCode, + externalId: String(it.externalId), + title: it.title.slice(0, 1000), + agency: it.agency ? it.agency.slice(0, 300) : null, + category: it.category ? it.category.slice(0, 200) : null, + target: it.target ? it.target.slice(0, 1000) : null, + applyStart: it.applyStart ?? null, + applyEnd: it.applyEnd ?? null, + detailUrl: it.detailUrl ? it.detailUrl.slice(0, 1000) : null, + rawJson: clobBind(it.raw ? JSON.stringify(it.raw) : null), + body: clobBind(hasBody ? it.body : null), + hasBody, + } + ); + if (isNew) inserted += 1; + else updated += 1; + } + await conn.commit(); + return { processed: items.length, inserted, updated }; + }); +} + +/** + * 상세 본문 미수집(LISTED) 공고를 가져온다. 
+ */ +export async function findPendingDetail(sourceCode, limit) { + return withConnection(async (conn) => { + const r = await conn.execute( + `SELECT RAWTOHEX(id) AS id, external_id, detail_url + FROM gov_opportunity + WHERE source_code = :sourceCode AND status = 'LISTED' AND detail_url IS NOT NULL + ORDER BY created_at + FETCH FIRST :lim ROWS ONLY`, + { sourceCode, lim: limit }, + { outFormat: oracledb.OUT_FORMAT_OBJECT } + ); + return r.rows.map((row) => ({ + id: row.ID, + externalId: row.EXTERNAL_ID, + detailUrl: row.DETAIL_URL, + })); + }); +} + +/** + * 상세 본문을 저장하고 상태를 DETAILED 로 갱신한다. + */ +export async function saveDetail(idHex, bodyText) { + return withConnection(async (conn) => { + await conn.execute( + `UPDATE gov_opportunity + SET body_text = :body, status = 'DETAILED', + detail_collected_at = SYSTIMESTAMP, updated_at = SYSTIMESTAMP + WHERE id = HEXTORAW(:id)`, + { body: clobBind(bodyText), id: idHex } + ); + await conn.commit(); + }); +} + +/** + * 상세 수집 실패 표시. + */ +export async function markDetailError(idHex) { + return withConnection(async (conn) => { + await conn.execute( + `UPDATE gov_opportunity + SET status = 'ERROR', updated_at = SYSTIMESTAMP + WHERE id = HEXTORAW(:id)`, + { id: idHex } + ); + await conn.commit(); + }); +} + +export async function markSourceCrawled(sourceIdHex) { + return withConnection(async (conn) => { + await conn.execute( + `UPDATE gov_source SET last_crawled_at = SYSTIMESTAMP, updated_at = SYSTIMESTAMP + WHERE id = HEXTORAW(:id)`, + { id: sourceIdHex } + ); + await conn.commit(); + }); +} diff --git a/government/src/util.js b/government/src/util.js new file mode 100644 index 0000000..5f4ec04 --- /dev/null +++ b/government/src/util.js @@ -0,0 +1,34 @@ +// 공용 유틸: HTML 엔티티 디코드, YYYYMMDD 날짜 파싱. 
// Named character references this module understands, keyed by the bare entity
// name (without the leading ampersand and trailing semicolon). Keying by name
// keeps this table intact even if the file passes through a tool that decodes
// HTML entities in text. "nbsp" deliberately maps to a regular space.
const NAMED_ENTITIES = Object.freeze({
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: ' ',
});

// One alternation for every supported reference form:
//   group 1: decimal numeric reference, group 2: hex numeric reference,
//   group 3: one of the named references above.
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g;

const MAX_CODE_POINT = 0x10ffff;

/**
 * Decodes the supported HTML character references in `s` and trims the result.
 *
 * Decoding is done in a single pass, so text that was escaped twice is only
 * unescaped once (an escaped ampersand followed by "#39;" yields the literal
 * numeric reference, not an apostrophe). Numeric references may be decimal or
 * hexadecimal and may address any Unicode code point; references beyond
 * U+10FFFF are left untouched instead of throwing.
 *
 * @param {*} s - value to decode; non-strings are converted with String()
 * @returns {string|null} decoded, trimmed text, or null when `s` is null/undefined
 */
export function decodeEntities(s) {
  if (s == null) return null;
  return String(s)
    .replace(ENTITY_PATTERN, (match, decimal, hex, name) => {
      if (name !== undefined) return NAMED_ENTITIES[name];
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      // String.fromCodePoint throws a RangeError above U+10FFFF; keep the raw text instead.
      return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
    })
    .trim();
}

/**
 * Parses 'YYYYMMDD' or 'YYYY-MM-DD' (any non-digit separators are ignored)
 * into a Date at UTC midnight.
 *
 * @param {*} s - date text or number
 * @returns {Date|null} the parsed date, or null when the input is null/undefined,
 *   does not contain exactly eight digits, or is not a real calendar date
 */
export function parseYmd(s) {
  if (s == null) return null;
  const digits = String(s).replace(/[^0-9]/g, '');
  if (digits.length !== 8) return null;
  const y = Number(digits.slice(0, 4));
  const m = Number(digits.slice(4, 6));
  const d = Number(digits.slice(6, 8));
  if (m < 1 || m > 12 || d < 1 || d > 31) return null;
  const date = new Date(Date.UTC(y, m - 1, d));
  // Date.UTC silently rolls impossible days over (Feb 30 becomes Mar 1 or 2)
  // and remaps years 0-99 to 1900-1999; reject anything that does not round-trip.
  if (
    date.getUTCFullYear() !== y ||
    date.getUTCMonth() !== m - 1 ||
    date.getUTCDate() !== d
  ) {
    return null;
  }
  return date;
}

/**
 * Normalizes blank values to null.
 *
 * @param {*} s - any value; non-strings are converted with String()
 * @returns {string|null} the trimmed text, or null for null/undefined or
 *   whitespace-only input
 */
export function nonEmpty(s) {
  if (s == null) return null;
  const t = String(s).trim();
  return t === '' ? null : t;
}