gov-scraper: 중소벤처24(smes) 사업공고 소스 추가

- GenericHtmlSource 확장: 신청기간(period) 날짜 파싱, listOnly(목록 전용) 모드
- smes(중소벤처24 bizApply) config 추가 — href의 PBLN 공고ID 추출, 제목/분야/주관기관/신청기간 적재
- smes 상세는 팝업 전용(JS 다이얼로그)이라 직접 크롤 불가 → 목록 전용으로 적재(18건 검증)
- util: parseFlexibleDate(YY-MM-DD/YYYYMMDD 대응)
- pipeline: skipDetail 소스는 상세 단계 건너뜀

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-10 05:51:46 +00:00
parent cbc5ba5663
commit f2a8f30867
7 changed files with 99 additions and 13 deletions

View File

@@ -27,6 +27,8 @@ export class OpportunitySource {
this.baseUrl = meta.baseUrl || null;
this.type = meta.type;
this.config = meta.config || {};
// true 면 파이프라인이 상세 본문 수집 단계를 건너뛴다(목록 전용 소스).
this.skipDetail = false;
}
meta() {

View File

@@ -15,7 +15,7 @@
import * as cheerio from 'cheerio';
import { OpportunitySource } from './base.js';
import { log } from '../logger.js';
import { decodeEntities, nonEmpty } from '../util.js';
import { decodeEntities, nonEmpty, parseFlexibleDate } from '../util.js';
const UA =
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
@@ -24,11 +24,11 @@ export class GenericHtmlSource extends OpportunitySource {
/**
 * Builds an HTML-backed source: forces the source type to 'HTML' and
 * validates the required config keys before use.
 *
 * @param {object} meta - Source metadata; spread into the base constructor with `type: 'HTML'`.
 * @throws {Error} When a required config key is missing.
 */
constructor(meta) {
super({ ...meta, type: 'HTML' });
const c = this.config;
if (!c.listUrl || !c.rowSelector || !c.externalId || !c.detailUrl) {
throw new Error(
`${this.code}: config 에 listUrl/rowSelector/externalId/detailUrl 필수`
);
if (!c.listUrl || !c.rowSelector || !c.externalId) {
throw new Error(`${this.code}: config 에 listUrl/rowSelector/externalId 필수`);
}
// listOnly: load the list rows only and skip detail-body collection
// (for sites whose detail view is a popup/JS and cannot be crawled directly).
// Only a literal `true` enables it; any other value leaves skipDetail false.
this.skipDetail = c.listOnly === true;
}
#pageUrl(page) {
@@ -82,9 +82,25 @@ export class GenericHtmlSource extends OpportunitySource {
const title = this.#extractField($, row, c.title);
if (!title) return null;
const detailUrl = c.detailUrl.template
? c.detailUrl.template.replace(/\{id\}/g, externalId)
: this.#extractField($, row, c.detailUrl);
let detailUrl = null;
if (c.detailUrl) {
detailUrl = c.detailUrl.template
? c.detailUrl.template.replace(/\{id\}/g, externalId)
: this.#extractField($, row, c.detailUrl);
}
// 신청기간 "26-06-10 ~ 26-06-24" → applyStart/applyEnd
let applyStart = null;
let applyEnd = null;
if (c.period) {
const periodText = this.#extractField($, row, c.period);
if (periodText) {
const sep = c.period.sep || '~';
const segs = periodText.split(sep).map((x) => x.trim());
applyStart = parseFlexibleDate(segs[0]);
applyEnd = parseFlexibleDate(segs[1] || segs[0]);
}
}
return {
externalId,
@@ -92,8 +108,8 @@ export class GenericHtmlSource extends OpportunitySource {
agency: c.agency || this.#extractField($, row, c.agencyField) || null,
category: this.#extractField($, row, c.categoryField),
target: null,
applyStart: null,
applyEnd: null,
applyStart,
applyEnd,
detailUrl,
raw: { onclick: row.attr('onclick') || null, title },
};

View File

@@ -21,6 +21,28 @@ export const HTML_SOURCE_CONFIGS = [
agency: '중소벤처기업부',
},
},
// Source config for the smes.go.kr "bizApply" announcement list (code: 'smes').
{
code: 'smes',
name: '중소벤처24 사업공고',
baseUrl: 'https://www.smes.go.kr',
config: {
listUrl: 'https://www.smes.go.kr/main/bizApply',
maxPages: 1, // Latest listing only (paging is ajax-driven, so just page 1). The daemon picks up new postings on its periodic runs.
rowSelector: 'table tbody tr',
// Row anchor looks like: javascript:fn_include_popOpen2('seq','idx','cd','PBLN_...','agency','status')
// The regex captures the PBLN_<digits> announcement id from that href.
externalId: { from: 'href', regex: '(PBLN_\\d+)' },
title: { selector: 'td:nth-child(2) a' },
categoryField: { selector: 'td:nth-child(5)' },
agencyField: { selector: 'td:nth-child(6)' },
// Application period cell; the text is split on `sep` into start/end dates.
period: { selector: 'td:nth-child(3)', sep: '~' },
// The detail view is a popup/JS dialog and cannot be crawled directly, so this
// source is list-only. The URL below is stored for reference only.
detailUrl: {
template:
'https://www.smes.go.kr/sii/siia/selectSIIA200Detail.do?pblancId={id}',
},
// Must be exactly `true` to make the source skip the detail stage.
listOnly: true,
},
},
];
export function buildHtmlSources() {