From db1e65bee54c3a88722d61ecf57379ff7ce34ff2 Mon Sep 17 00:00:00 2001 From: revfactory Date: Thu, 28 May 2026 20:37:36 +0900 Subject: [PATCH 1/5] =?UTF-8?q?docs:=20PDF=20=EC=9E=84=ED=8F=AC=ED=8A=B8(P?= =?UTF-8?q?hase=202)=20=EC=8A=A4=ED=8E=99=20=EC=B6=94=EA=B0=80=20=EB=B0=8F?= =?UTF-8?q?=20pdfjs-dist=20=EC=9D=98=EC=A1=B4=EC=84=B1=20=EB=8F=84?= =?UTF-8?q?=EC=9E=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue #5 (B 분기: 텍스트 + 기본 구조화) 작업의 사전 정비. - MDVIEW_SPEC.md: future_considerations 에 "PDF 임포트 — 텍스트 + 기본 구조화(제목/문단/목록)만 (Phase 2)" 항목을 DOCX 임포트 옆에 추가. - package.json: pdfjs-dist@^5.7 (Mozilla, Apache-2.0) 추가. 동적 import + Web Worker 패턴으로 사용해 메인 번들 영향 최소화. Refs #5 --- MDVIEW_SPEC.md | 1 + package-lock.json | 268 +++++++++++++++++++++++++++++++++++++++++++++- package.json | 1 + 3 files changed, 268 insertions(+), 2 deletions(-) diff --git a/MDVIEW_SPEC.md b/MDVIEW_SPEC.md index a1f6a33..daacbca 100644 --- a/MDVIEW_SPEC.md +++ b/MDVIEW_SPEC.md @@ -50,6 +50,7 @@ CRITICAL: 프론트엔드는 React + Vite 기반 SPA로 구축합니다. 
에디 - AI 글쓰기 보조 (Phase 3) - 플러그인/확장 시스템 (Phase 3) - DOCX 임포트/내보내기 (Phase 2) + - PDF 임포트 — 텍스트 + 기본 구조화(제목/문단/목록)만 (Phase 2) - 프레젠테이션 모드 - Marp 기반 (Phase 3) diff --git a/package-lock.json b/package-lock.json index 09dc8ce..58864ca 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,13 @@ { - "name": "mdview-temp", + "name": "mdview", "version": "0.1.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "mdview-temp", + "name": "mdview", "version": "0.1.0", + "license": "Apache-2.0", "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", @@ -49,6 +50,7 @@ "nanoid": "^5.1.6", "next": "16.1.6", "pako": "^2.1.0", + "pdfjs-dist": "^5.7.284", "react": "19.2.3", "react-dom": "19.2.3", "sonner": "^2.0.7", @@ -1154,6 +1156,256 @@ "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", "license": "BSD-2-Clause" }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.100.tgz", + "integrity": "sha512-xglYA6q3XO5P3BNJYxVZ1IV7DLVjp1Py6nwag88YntrS+3vKHyYcMqXVS4ZztJmwz2uGvz1FWhI/4LgbR5uQDA==", + "license": "MIT", + "optional": true, + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.100", + "@napi-rs/canvas-darwin-arm64": "0.1.100", + "@napi-rs/canvas-darwin-x64": "0.1.100", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.100", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.100", + "@napi-rs/canvas-linux-arm64-musl": "0.1.100", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.100", + "@napi-rs/canvas-linux-x64-gnu": "0.1.100", + "@napi-rs/canvas-linux-x64-musl": "0.1.100", + "@napi-rs/canvas-win32-arm64-msvc": "0.1.100", + "@napi-rs/canvas-win32-x64-msvc": "0.1.100" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + 
"version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.100.tgz", + "integrity": "sha512-hjhCKhntPv9+t4ckHymdx0phYNcVW+GKQR6Lzw2zE+pOVjOplSmtx9nNNknTjbEDLcuLZqA1y8ufKg1XfgftzQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.100.tgz", + "integrity": "sha512-2PcswRaC7Ly645DGt88///zuFDhJxJYdKAs1uU3mfk1atYkXufgcgLfBpk6Tm12nCQBaNt1wpybuPZ4qOhTo8A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.100.tgz", + "integrity": "sha512-ePNZtj7pNIva/siZMg+HmbeozkIjqUIYdoymH8HaA3qK7LfzFN4WMBM8G6HQ9ZC+H3+Dnn5pqtiXpgLykaPOhw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.100.tgz", + "integrity": "sha512-d5cDB48oWFGU8/XPhUOFAlySgb/VAu7D+s8fi55K1Pcfg8aPplHWqMgibhVLU8ky7Pyg/fuiVLz4Nf3JrSTuUA==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": 
"https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.100.tgz", + "integrity": "sha512-rDxgxRu69RvDlX/bh9o22DxLsGr8EqsNgotL9+RwQE1S0b0cqeatqsw6aW45mukm0B42DIAaAacKaYQ8cqS1nw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.100.tgz", + "integrity": "sha512-K3mDW66N+xT2/V439u1alFANiBUjdEx2gLiNYnCmUsva5jZMxWTjafBYwTzYK+EMFMHrUoabuU+T1BIP5CgbYQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.100.tgz", + "integrity": "sha512-mooqUBTIsccZpnoQC4NgrC1v6C1vof39etLNMnBwCY+p0gajWJvAHLGQ6g/gGyS5YrpDW+GefSN4+Cvcr08UWw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.100.tgz", + "integrity": "sha512-1eCvkDCazm7FFhsT7DfGOdSaHgZVK3bt/dSBl5EWHOWmnz+I7j8tPseJqqD81NF+MH21jKUK4wQSDjN0mdhnTg==", + "cpu": [ + "x64" + ], + "license": "MIT", + 
"optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.100.tgz", + "integrity": "sha512-20arT6lnI19S68qNlii73TSEDbECNgzMz2EpldC1V3mZFuRkeujXkcebRk0LRJe9SEUAooYiLokfMViY8IX7yA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-arm64-msvc": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.100.tgz", + "integrity": "sha512-DZFFT1wIAg37LJw37yhMRFfjATd3vTQzjZ1Yki8u2vhO6Hi5VE6BVaGQ1aaDu7xb4iMErz+9EOwjpS7xcxFeBw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.100.tgz", + "integrity": "sha512-MyT1j3mHC2+Lu4pBi9mKyMJhtP6U7k7EldY7sj/uS5gJA65gTXt8MefJQXLJo5d/vZbuWmfxzkEUNc/urV3pHA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -6898,6 +7150,18 @@ "dev": true, "license": "MIT" }, + "node_modules/pdfjs-dist": { + 
"version": "5.7.284", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.7.284.tgz", + "integrity": "sha512-h4EdYQczmGhbOlqc3PPZwxevn7ApdWPbovAuWXOB/DjIyigSnwfy2oze7c6mRcSr9XgLp3eN3EeL4DyySTPMFw==", + "license": "Apache-2.0", + "engines": { + "node": ">=22.13.0 || >=24" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.100" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", diff --git a/package.json b/package.json index 31eb889..12c3127 100644 --- a/package.json +++ b/package.json @@ -75,6 +75,7 @@ "nanoid": "^5.1.6", "next": "16.1.6", "pako": "^2.1.0", + "pdfjs-dist": "^5.7.284", "react": "19.2.3", "react-dom": "19.2.3", "sonner": "^2.0.7", From 1d287bca0814653f73d94701e7fb6c037159d5da Mon Sep 17 00:00:00 2001 From: revfactory Date: Thu, 28 May 2026 20:38:50 +0900 Subject: [PATCH 2/5] =?UTF-8?q?feat:=20PDF=20=EC=9E=84=ED=8F=AC=ED=8A=B8?= =?UTF-8?q?=20=EC=BD=94=EC=96=B4=20(types=20+=20=EB=B3=80=ED=99=98=20?= =?UTF-8?q?=EB=9D=BC=EC=9D=B4=EB=B8=8C=EB=9F=AC=EB=A6=AC=20+=20Web=20Worke?= =?UTF-8?q?r)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue #5 B 분기 — 텍스트와 기본 구조화만 추출하는 1차 구현. 추가 파일: - src/types/pdf.ts: PdfBlock / PdfImportOptions / PdfParseResult / PdfParsedPage 등 워커-UI 간 메시지·결과 타입 정의. - src/lib/pdf-converter.ts: pdfjs 페이지 라인 → 마크다운 블록 변환 휴리스틱. - 제목: 페이지 평균 폰트 1.2배 이상인 줄을 H1~H3 으로 매핑. - 목록: •/·/-/*/1./1)/① 등 마커 인식 → 순서·비순서 리스트. - 문단: 동일 단락 라인은 공백으로 이어 붙여 단일 블록화. - 페이지 사이: 빈 줄만 삽입 (페이지 번호·머리말·꼬리말 제거, 줄끝 하이픈 복원은 후속 PR — TODO 주석으로 명시). - src/workers/pdf-parser.worker.ts: pdfjs-dist 를 동적 import 한 뒤 중첩 워커 없이(fake-worker 모드) 페이지별 텍스트와 폰트 크기를 추출. 진행률 메시지 + %PDF 매직 검증 + 스캔본 추정 경고 포함. 
Refs #5 --- src/lib/pdf-converter.ts | 238 +++++++++++++++++++++++++ src/types/pdf.ts | 96 ++++++++++ src/workers/pdf-parser.worker.ts | 295 +++++++++++++++++++++++++++++++ 3 files changed, 629 insertions(+) create mode 100644 src/lib/pdf-converter.ts create mode 100644 src/types/pdf.ts create mode 100644 src/workers/pdf-parser.worker.ts diff --git a/src/lib/pdf-converter.ts b/src/lib/pdf-converter.ts new file mode 100644 index 0000000..7ee3b90 --- /dev/null +++ b/src/lib/pdf-converter.ts @@ -0,0 +1,238 @@ +/** + * PDF → Markdown converter (Branch B: 텍스트 + 기본 구조화만). + * + * Heuristics: + * - 제목 추정: 페이지 평균 글자 크기 대비 1.2배 이상인 줄을 `#` ~ `###` 으로 매핑 + * (최대 3단계). 가장 큰 비율 → H1, 그 다음 → H2, 그 외 → H3. + * - 목록 추정: 줄 앞의 `•`, `·`, `-`, `*`, `1.`, `1)`, `①` 등 마커를 인식해 + * 마크다운 리스트로 변환. + * - 문단: 그 외 일반 텍스트는 단락으로 묶고, 페이지 사이에 빈 줄 삽입. + * + * 본 PR 범위에 포함되지 않은 항목: + * - 페이지 번호 / 머리말 / 꼬리말 자동 제거 (TODO) + * - 줄 끝 하이픈 (word-break) 복원 (TODO) + * - 다단(컬럼) 레이아웃 인식, 표 추출, OCR + * + * 위 TODO는 후속 PR에서 다룬다. + */ + +import type { PdfBlock, PdfPageLine, PdfParsedPage } from '@/types/pdf'; + +// Backwards-compatible local aliases (kept short for readability below). +type PageLine = PdfPageLine; +type ParsedPage = PdfParsedPage; + +// ---------- List-marker detection ---------- + +const UNORDERED_MARKERS = ['•', '·', '◦', '▪', '▫', '■', '●', '○', '※', '–', '—']; + +/** Regex for ordered list markers at the start of a line. */ +const ORDERED_RE = /^(\(?\d{1,3}[.)]|[①-⑳㉑-㉟㊱-㊿]|[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]+[.)]?|[IVX]+\.)\s+/; + +interface ListDetection { + matched: boolean; + ordered: boolean; + /** Text with the leading marker stripped. 
*/ + body: string; +} + +function detectListMarker(rawText: string): ListDetection { + const text = rawText.trimStart(); + + // Unordered: leading bullet glyph + for (const m of UNORDERED_MARKERS) { + if (text.startsWith(m)) { + const body = text.slice(m.length).trim(); + if (body.length > 0) { + return { matched: true, ordered: false, body }; + } + } + } + + // Hyphen / asterisk bullets (require a following space) + if (/^[-*]\s+/.test(text)) { + return { + matched: true, + ordered: false, + body: text.replace(/^[-*]\s+/, '').trim(), + }; + } + + // Ordered + const m = text.match(ORDERED_RE); + if (m) { + return { + matched: true, + ordered: true, + body: text.slice(m[0].length).trim(), + }; + } + + return { matched: false, ordered: false, body: text }; +} + +// ---------- Heading detection ---------- + +interface HeadingThresholds { + pageAverage: number; + /** Distinct "large" sizes in descending order — first three become H1/H2/H3. */ + largeSizes: number[]; +} + +function computeHeadingThresholds(lines: PageLine[]): HeadingThresholds { + if (lines.length === 0) { + return { pageAverage: 0, largeSizes: [] }; + } + + const totalChars = lines.reduce((sum, l) => sum + Math.max(l.text.length, 1), 0); + const weightedSum = lines.reduce( + (sum, l) => sum + l.fontSize * Math.max(l.text.length, 1), + 0 + ); + const pageAverage = totalChars > 0 ? 
weightedSum / totalChars : 0; + + // Collect candidate heading sizes — round to 1 decimal to group "near-equal" sizes + const cutoff = pageAverage * 1.2; + const sizeSet = new Set(); + for (const line of lines) { + if (line.fontSize >= cutoff) { + sizeSet.add(Math.round(line.fontSize * 10) / 10); + } + } + + const largeSizes = Array.from(sizeSet).sort((a, b) => b - a).slice(0, 3); + return { pageAverage, largeSizes }; +} + +function headingLevelFor(fontSize: number, thresholds: HeadingThresholds): 1 | 2 | 3 | null { + if (thresholds.pageAverage <= 0) return null; + if (fontSize < thresholds.pageAverage * 1.2) return null; + + const rounded = Math.round(fontSize * 10) / 10; + const idx = thresholds.largeSizes.indexOf(rounded); + if (idx === 0) return 1; + if (idx === 1) return 2; + if (idx === 2) return 3; + // Larger than cutoff but not in top-3 distinct sizes → H3 + return 3; +} + +// ---------- Page → Blocks ---------- + +export function convertPagesToBlocks(pages: ParsedPage[]): PdfBlock[] { + const blocks: PdfBlock[] = []; + + for (const page of pages) { + if (page.lines.length === 0) continue; + + const thresholds = computeHeadingThresholds(page.lines); + + // Buffer for consecutive paragraph lines so we can merge them into a single block + let paragraphBuffer: string[] = []; + + const flushParagraph = () => { + if (paragraphBuffer.length === 0) return; + const text = paragraphBuffer.join(' ').replace(/\s+/g, ' ').trim(); + if (text.length > 0) { + blocks.push({ type: 'paragraph', text, page: page.pageNumber }); + } + paragraphBuffer = []; + }; + + for (const line of page.lines) { + const list = detectListMarker(line.text); + if (list.matched) { + flushParagraph(); + blocks.push({ + type: 'list-item', + text: list.body, + ordered: list.ordered, + page: page.pageNumber, + }); + continue; + } + + const headingLevel = headingLevelFor(line.fontSize, thresholds); + if (headingLevel !== null) { + flushParagraph(); + blocks.push({ + type: 'heading', + text: 
line.text.trim(), + headingLevel, + page: page.pageNumber, + }); + continue; + } + + // Regular text line — accumulate into the current paragraph buffer + paragraphBuffer.push(line.text); + } + + flushParagraph(); + } + + return blocks; +} + +// ---------- Blocks → Markdown ---------- + +/** + * Escape characters with special meaning in markdown. Keep this minimal so that + * common punctuation stays readable. + */ +function escapeMarkdownInline(text: string): string { + return text + // Escape leading characters that would be parsed as block syntax + .replace(/^(\s*)([#>|])/, '$1\\$2') + // Escape backticks to avoid accidental inline code spans + .replace(/`/g, '\\`'); +} + +export function blocksToMarkdown(blocks: PdfBlock[]): string { + const lines: string[] = []; + let lastPage = -1; + let orderedCounter = 0; + let lastBlockWasOrderedList = false; + + for (const block of blocks) { + // Insert a blank line between pages (page break = blank line only — page + // number / header / footer normalization is out of scope, see file header). + if (lastPage !== -1 && block.page !== lastPage) { + if (lines[lines.length - 1] !== '') lines.push(''); + orderedCounter = 0; + lastBlockWasOrderedList = false; + } + + if (block.type === 'heading') { + if (lines.length > 0 && lines[lines.length - 1] !== '') lines.push(''); + const hashes = '#'.repeat(block.headingLevel ?? 3); + lines.push(`${hashes} ${escapeMarkdownInline(block.text)}`); + lines.push(''); + orderedCounter = 0; + lastBlockWasOrderedList = false; + } else if (block.type === 'list-item') { + if (block.ordered) { + if (!lastBlockWasOrderedList) orderedCounter = 0; + orderedCounter += 1; + lines.push(`${orderedCounter}. 
${escapeMarkdownInline(block.text)}`); + lastBlockWasOrderedList = true; + } else { + lines.push(`- ${escapeMarkdownInline(block.text)}`); + lastBlockWasOrderedList = false; + orderedCounter = 0; + } + } else { + // paragraph + if (lines.length > 0 && lines[lines.length - 1] !== '') lines.push(''); + lines.push(escapeMarkdownInline(block.text)); + lines.push(''); + orderedCounter = 0; + lastBlockWasOrderedList = false; + } + + lastPage = block.page; + } + + // Collapse 3+ trailing blank lines down to a single trailing newline + return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim(); +} diff --git a/src/types/pdf.ts b/src/types/pdf.ts new file mode 100644 index 0000000..5de50e8 --- /dev/null +++ b/src/types/pdf.ts @@ -0,0 +1,96 @@ +/** + * PDF import types. + * + * Phase 2 scope (Branch B): 텍스트 추출 + 기본 구조화(제목/문단/목록)만. + * 표·이미지·OCR은 본 PR 범위에 포함되지 않으며 후속 작업에서 다룬다. + */ + +// ---------- Intermediate per-page line representation ---------- + +/** Single visual line extracted from a PDF page. */ +export interface PdfPageLine { + text: string; + /** Largest font size encountered on the line (px). */ + fontSize: number; +} + +/** Parsed page produced by the worker before heuristic conversion. */ +export interface PdfParsedPage { + pageNumber: number; + lines: PdfPageLine[]; +} + +// ---------- Block types extracted from a PDF ---------- + +export type PdfBlockType = 'heading' | 'paragraph' | 'list-item'; + +export interface PdfBlock { + type: PdfBlockType; + /** Plain text content of the block (HTML/markdown not pre-formatted). */ + text: string; + /** Heading level 1..3 when type === 'heading'. */ + headingLevel?: 1 | 2 | 3; + /** Ordered list marker presence — '1.' / '①' etc. */ + ordered?: boolean; + /** Source page (1-indexed). */ + page: number; +} + +// ---------- Worker contract ---------- + +export interface PdfImportOptions { + /** + * Maximum number of pages to parse. 0 = no limit. + * Used to short-circuit very large PDFs. 
+ */ + maxPages?: number; +} + +export interface PdfParseMessage { + type: 'parse'; + /** Raw PDF file bytes. */ + file: ArrayBuffer; + options?: PdfImportOptions; +} + +export interface PdfProgressResponse { + type: 'progress'; + percent: number; + message: string; +} + +export interface PdfCompleteResponse { + type: 'complete'; + result: PdfParseResult; +} + +export interface PdfErrorResponse { + type: 'error'; + error: string; +} + +export type PdfWorkerResponse = + | PdfProgressResponse + | PdfCompleteResponse + | PdfErrorResponse; + +// ---------- Result returned to the UI ---------- + +export interface PdfParseResult { + /** Final markdown text assembled from blocks. */ + markdown: string; + /** Raw blocks for callers that want to post-process. */ + blocks: PdfBlock[]; + /** Soft warnings (e.g. scanned page detected, page skipped). */ + warnings: string[]; + metadata: PdfDocumentMetadata; +} + +export interface PdfDocumentMetadata { + pageCount: number; + parsedPageCount: number; + title?: string; + author?: string; + /** True when at least half of the parsed pages returned almost no text — likely scanned/image PDF. */ + likelyScanned: boolean; +} diff --git a/src/workers/pdf-parser.worker.ts b/src/workers/pdf-parser.worker.ts new file mode 100644 index 0000000..c27bd13 --- /dev/null +++ b/src/workers/pdf-parser.worker.ts @@ -0,0 +1,295 @@ +/// + +/** + * PDF parser Web Worker. + * + * Loads pdfjs-dist dynamically and extracts text + per-line font size from each + * page so the converter can apply heading/list heuristics. Runs pdfjs-dist in + * "fake worker" mode (no nested worker) since we're already inside a worker + * context — this avoids needing a separate worker file and keeps the bundle + * graph predictable. + * + * Branch B scope (issue #5): 텍스트 + 기본 구조화만. 표/이미지/OCR 제외. 
+ */ + +import type { + PdfBlock, + PdfImportOptions, + PdfPageLine, + PdfParsedPage, + PdfParseMessage, + PdfParseResult, + PdfWorkerResponse, +} from '@/types/pdf'; +import { convertPagesToBlocks, blocksToMarkdown } from '@/lib/pdf-converter'; + +declare const self: DedicatedWorkerGlobalScope; + +function postProgress(percent: number, message: string) { + self.postMessage({ type: 'progress', percent, message } as PdfWorkerResponse); +} + +function postComplete(result: PdfParseResult) { + self.postMessage({ type: 'complete', result } as PdfWorkerResponse); +} + +function postError(error: string) { + self.postMessage({ type: 'error', error } as PdfWorkerResponse); +} + +// ---------- pdfjs-dist text-item shape (subset we use) ---------- + +interface PdfTextItem { + str: string; + transform: number[]; // [a, b, c, d, e, f] — height ≈ |d| + height?: number; + width?: number; + hasEOL?: boolean; +} + +type PageLine = PdfPageLine; +type ParsedPage = PdfParsedPage; + +// ---------- Per-page extraction ---------- + +/** + * Convert raw pdfjs text items into "lines" by grouping items whose vertical + * position is similar. Returns the lines, each with the largest font size found on it. 
+ */ +function groupItemsIntoLines(items: PdfTextItem[]): PageLine[] { + if (items.length === 0) return []; + + interface LineBucket { + y: number; + items: PdfTextItem[]; + } + + const buckets: LineBucket[] = []; + const Y_TOLERANCE = 2; // px tolerance when grouping into the same line + + for (const item of items) { + if (!item || typeof item.str !== 'string') continue; + const tr = item.transform; + if (!Array.isArray(tr) || tr.length < 6) continue; + const y = tr[5]; + + let bucket = buckets.find((b) => Math.abs(b.y - y) <= Y_TOLERANCE); + if (!bucket) { + bucket = { y, items: [] }; + buckets.push(bucket); + } + bucket.items.push(item); + } + + // Sort buckets top-to-bottom (higher y first in PDF coordinate space) + buckets.sort((a, b) => b.y - a.y); + + const lines: PageLine[] = []; + for (const bucket of buckets) { + // Sort items within the bucket by x position + bucket.items.sort((a, b) => a.transform[4] - b.transform[4]); + + let lineText = ''; + let lastX = -Infinity; + let lastWidth = 0; + let maxFontSize = 0; + + for (const item of bucket.items) { + const tr = item.transform; + const x = tr[4]; + const fontSize = Math.abs(tr[3]) || item.height || 0; + if (fontSize > maxFontSize) maxFontSize = fontSize; + + // Insert a space if there's a visual gap between items + if (lineText.length > 0) { + const expectedX = lastX + lastWidth; + const gap = x - expectedX; + if (gap > fontSize * 0.25 && !lineText.endsWith(' ') && !item.str.startsWith(' ')) { + lineText += ' '; + } + } + + lineText += item.str; + lastX = x; + lastWidth = item.width ?? 
0; + } + + // Collapse runs of whitespace + lineText = lineText.replace(/[ \t]+/g, ' ').trim(); + + if (lineText.length > 0) { + lines.push({ text: lineText, fontSize: maxFontSize }); + } + } + + return lines; +} + +// ---------- Main parsing pipeline ---------- + +async function parsePdf( + buffer: ArrayBuffer, + options?: PdfImportOptions +): Promise { + postProgress(5, 'PDF 라이브러리 로드 중...'); + + // Dynamic import keeps pdfjs-dist out of the main bundle + const pdfjs = await import('pdfjs-dist'); + + // We're already in a Web Worker — disable pdfjs's nested worker spawning. + // Without this, pdfjs would try to create another Worker which is unreliable + // inside a worker context across browsers. + try { + pdfjs.GlobalWorkerOptions.workerSrc = ''; + } catch { + // Ignored — falls back to fake worker mode + } + + postProgress(10, 'PDF 파일 분석 중...'); + + const loadingTask = pdfjs.getDocument({ + data: new Uint8Array(buffer), + // Disable optional fancy features we don't need for text extraction + disableFontFace: true, + useSystemFonts: false, + }); + + const pdf = await loadingTask.promise; + const totalPages = pdf.numPages; + const maxPages = + options?.maxPages && options.maxPages > 0 + ? Math.min(options.maxPages, totalPages) + : totalPages; + + postProgress( + 15, + `총 ${totalPages}페이지${ + maxPages < totalPages ? ` (처음 ${maxPages}페이지만 처리)` : '' + }` + ); + + // Extract metadata (best-effort) + let title: string | undefined; + let author: string | undefined; + try { + const meta = await pdf.getMetadata(); + const info = (meta?.info ?? 
{}) as Record; + if (typeof info.Title === 'string' && info.Title.trim()) { + title = info.Title.trim(); + } + if (typeof info.Author === 'string' && info.Author.trim()) { + author = info.Author.trim(); + } + } catch { + // metadata is optional — ignore + } + + const warnings: string[] = []; + const pages: ParsedPage[] = []; + let likelyScannedCount = 0; + + for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) { + // Progress range 15 → 90 + const pct = 15 + Math.round((pageNumber / maxPages) * 75); + postProgress(pct, `페이지 ${pageNumber}/${maxPages} 처리 중...`); + + try { + const page = await pdf.getPage(pageNumber); + const textContent = await page.getTextContent({ + includeMarkedContent: false, + }); + const items = (textContent.items as PdfTextItem[]).filter( + (it) => it && typeof (it as PdfTextItem).str === 'string' + ); + + const lines = groupItemsIntoLines(items); + const totalChars = lines.reduce((sum, l) => sum + l.text.length, 0); + if (totalChars < 10) { + likelyScannedCount++; + } + + pages.push({ pageNumber, lines }); + + // Release page resources eagerly + page.cleanup(); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + warnings.push(`페이지 ${pageNumber} 처리 실패: ${msg}`); + pages.push({ pageNumber, lines: [] }); + } + } + + postProgress(92, '마크다운 변환 중...'); + + const likelyScanned = + likelyScannedCount > 0 && likelyScannedCount >= Math.ceil(maxPages * 0.5); + if (likelyScanned) { + warnings.push( + '대부분의 페이지에서 텍스트를 거의 추출하지 못했습니다. 스캔본 PDF일 수 있으며, OCR은 지원하지 않습니다.' 
+ ); + } else if (likelyScannedCount > 0) { + warnings.push( + `${likelyScannedCount}개 페이지에서 텍스트를 거의 추출하지 못했습니다 (이미지/스캔본 가능성).` + ); + } + + const blocks: PdfBlock[] = convertPagesToBlocks(pages); + const markdown = blocksToMarkdown(blocks); + + postProgress(100, '변환 완료'); + + // Release pdf document + try { + await pdf.cleanup(); + await pdf.destroy(); + } catch { + // ignore + } + + return { + markdown, + blocks, + warnings, + metadata: { + pageCount: totalPages, + parsedPageCount: maxPages, + title, + author, + likelyScanned, + }, + }; +} + +// ---------- Message handler ---------- + +self.onmessage = async (e: MessageEvent) => { + const msg = e.data; + + if (!msg || msg.type !== 'parse') { + postError(`알 수 없는 메시지 타입: ${(msg as { type?: string } | undefined)?.type ?? ''}`); + return; + } + + try { + // Quick sanity check on the buffer prefix — "%PDF" + const view = new Uint8Array(msg.file); + if ( + view.length < 5 || + view[0] !== 0x25 || // % + view[1] !== 0x50 || // P + view[2] !== 0x44 || // D + view[3] !== 0x46 // F + ) { + throw new Error('PDF 파일이 아닙니다. 올바른 .pdf 파일을 선택해주세요.'); + } + + const result = await parsePdf(msg.file, msg.options); + postComplete(result); + } catch (err) { + const message = + err instanceof Error ? err.message : '알 수 없는 오류가 발생했습니다.'; + postError(message); + } +}; + +export {}; From 431ed9fe0e760636ceeb325401343c85c5e2b1cb Mon Sep 17 00:00:00 2001 From: revfactory Date: Thu, 28 May 2026 20:39:00 +0900 Subject: [PATCH 3/5] =?UTF-8?q?feat:=20PDF=20=EC=9E=84=ED=8F=AC=ED=8A=B8?= =?UTF-8?q?=20=ED=9B=85=20+=20=EB=8B=A4=EC=9D=B4=EC=96=BC=EB=A1=9C?= =?UTF-8?q?=EA=B7=B8=20=EC=BB=B4=ED=8F=AC=EB=84=8C=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue #5 B 분기 — use-hwp.ts / hwp-import.tsx 패턴을 그대로 복제해 UI 톤을 통일. - src/hooks/use-pdf.ts: usePdfImport — File → Worker → 진행률/ 결과/에러 상태 관리. transferable ArrayBuffer 로 복사 비용 제거. 
- src/components/features/import-export/pdf-import.tsx: 드롭존 + 파일 선택 + 진행률 + 결과·경고·오류 모달. 상단에 "텍스트와 기본 구조만 추출, 표·이미지·OCR 미지원" 안내를 상시 노출. 50MB 초과 시 시간 경고, 200페이지 초과 시 정확도 경고. Refs #5 --- .../features/import-export/pdf-import.tsx | 295 ++++++++++++++++++ src/hooks/use-pdf.ts | 118 +++++++ 2 files changed, 413 insertions(+) create mode 100644 src/components/features/import-export/pdf-import.tsx create mode 100644 src/hooks/use-pdf.ts diff --git a/src/components/features/import-export/pdf-import.tsx b/src/components/features/import-export/pdf-import.tsx new file mode 100644 index 0000000..1cbe4f4 --- /dev/null +++ b/src/components/features/import-export/pdf-import.tsx @@ -0,0 +1,295 @@ +'use client'; + +import React, { useState, useCallback, useRef } from 'react'; +import { + Upload, + FileText, + AlertTriangle, + CheckCircle, + X, + Info, +} from 'lucide-react'; +import { Modal } from '@/components/ui/modal'; +import { Button } from '@/components/ui/button'; +import { usePdfImport } from '@/hooks/use-pdf'; +import type { PdfParseResult } from '@/types/pdf'; + +interface PdfImportProps { + open: boolean; + onClose: () => void; + onImportComplete: (content: string, title: string) => void; +} + +const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50 MB +const LARGE_PAGE_THRESHOLD = 200; + +export function PdfImport({ open, onClose, onImportComplete }: PdfImportProps) { + const { importFile, progress, progressMessage, isImporting, error, reset } = + usePdfImport(); + const [selectedFile, setSelectedFile] = useState(null); + const [result, setResult] = useState(null); + const [isDragOver, setIsDragOver] = useState(false); + const [sizeWarning, setSizeWarning] = useState(null); + const [localError, setLocalError] = useState(null); + const fileInputRef = useRef(null); + + const displayedError = localError ?? 
error; + + const handleClose = useCallback(() => { + if (isImporting) return; + setSelectedFile(null); + setResult(null); + setIsDragOver(false); + setSizeWarning(null); + setLocalError(null); + reset(); + onClose(); + }, [isImporting, reset, onClose]); + + const processFile = useCallback( + async (file: File) => { + setLocalError(null); + reset(); + + // Basic extension/mime check — strict to avoid spending time on non-PDF files + if (!/\.pdf$/i.test(file.name) && file.type !== 'application/pdf') { + setSelectedFile(file); + setResult(null); + setSizeWarning(null); + setLocalError('PDF 파일(.pdf)만 지원합니다.'); + return; + } + + setSelectedFile(file); + setResult(null); + + // Size warning (non-blocking) + if (file.size > MAX_FILE_SIZE) { + setSizeWarning( + `파일이 매우 큽니다(${(file.size / (1024 * 1024)).toFixed(1)} MB). 변환 시간이 오래 걸릴 수 있습니다.` + ); + } else { + setSizeWarning(null); + } + + try { + const res = await importFile(file); + setResult(res); + } catch (err) { + // error state is set inside the hook; log details for the user + console.error('[PDF import] failed:', err); + } + }, + [importFile, reset] + ); + + const handleFileChange = useCallback( + (e: React.ChangeEvent) => { + const file = e.target.files?.[0]; + if (file) void processFile(file); + }, + [processFile] + ); + + const handleDrop = useCallback( + (e: React.DragEvent) => { + e.preventDefault(); + setIsDragOver(false); + const file = e.dataTransfer.files[0]; + if (file) void processFile(file); + }, + [processFile] + ); + + const handleDragOver = useCallback((e: React.DragEvent) => { + e.preventDefault(); + setIsDragOver(true); + }, []); + + const handleDragLeave = useCallback((e: React.DragEvent) => { + e.preventDefault(); + setIsDragOver(false); + }, []); + + const handleConfirm = useCallback(() => { + if (!result || !selectedFile) return; + const baseTitle = selectedFile.name.replace(/\.pdf$/i, ''); + const title = result.metadata.title?.trim() || baseTitle || 'PDF 문서'; + 
onImportComplete(result.markdown, title); + handleClose(); + }, [result, selectedFile, onImportComplete, handleClose]); + + const titleFromResult = + result?.metadata.title?.trim() || + (selectedFile ? selectedFile.name.replace(/\.pdf$/i, '') : ''); + + const showLargePageWarning = + !!result && result.metadata.parsedPageCount > LARGE_PAGE_THRESHOLD; + + return ( + +
+ {/* Scope notice — always visible at the top */} +
+ +

+ 텍스트와 기본 구조(제목 · 문단 · 목록)만 추출합니다. +
+ 표 · 이미지 · OCR(스캔본)은 아직 지원하지 않습니다. +

+
+ + {/* Drop zone */} + {!isImporting && !result && ( +
fileInputRef.current?.click()} + > + + {displayedError ? ( + <> +
+ +
+

+ 변환 실패 +

+

+ {displayedError} +

+

+ 다른 파일을 선택해주세요 +

+ + ) : ( + <> +
+ +
+

+ PDF 파일을 끌어다 놓으세요 +

+

+ 또는 클릭하여 파일 선택 +

+ + )} +
+ )} + + {/* File size warning (non-blocking, before import completes) */} + {sizeWarning && !result && ( +
+ +

+ {sizeWarning} +

+
+ )} + + {/* Progress bar */} + {isImporting && selectedFile && ( +
+
+ +
+

+ {selectedFile.name} +

+

+ {progressMessage || '변환 중...'} +

+
+ + {progress}% + +
+
+
+
+
+ )} + + {/* Result */} + {result && selectedFile && ( +
+
+ +
+

+ 변환 완료! +

+

+ “{titleFromResult}” · {result.metadata.parsedPageCount} + /{result.metadata.pageCount} 페이지 +

+
+
+ + {showLargePageWarning && ( +
+ +

+ 페이지 수가 많아 일부 페이지의 변환 정확도가 떨어질 수 있습니다. +

+
+ )} + + {result.warnings.length > 0 && ( +
+
+ +

+ 경고 ({result.warnings.length}건) +

+
+
    + {result.warnings.map((warning, idx) => ( +
  • + {warning} +
  • + ))} +
+
+ )} + +
+ + +
+
+ )} +
+ + ); +}
diff --git a/src/hooks/use-pdf.ts b/src/hooks/use-pdf.ts
new file mode 100644
index 0000000..9e6797d
--- /dev/null
+++ b/src/hooks/use-pdf.ts
@@ -0,0 +1,171 @@
+'use client';
+
+import { useCallback, useEffect, useRef, useState } from 'react';
+import type {
+  PdfImportOptions,
+  PdfParseResult,
+  PdfWorkerResponse,
+} from '@/types/pdf';
+
+const WORKER_FAILED_MESSAGE = 'PDF 변환 워커 실행 중 오류가 발생했습니다.';
+const PREPARE_FAILED_MESSAGE = 'PDF 파일을 준비하는 중 오류가 발생했습니다.';
+const CANCELLED_MESSAGE = 'PDF 변환이 취소되었습니다.';
+
+/** Returns the message of a thrown value, or `fallback` when it has none. */
+function describeError(err: unknown, fallback: string): string {
+  return err instanceof Error && err.message ? err.message : fallback;
+}
+
+/**
+ * usePdfImport — file → Worker → progress/result/error state.
+ *
+ * Mirrors the API of `useHwpImport` so callers can drop it into the same
+ * import-dialog component layout.
+ *
+ * Only one import runs at a time: starting a new one (or unmounting)
+ * terminates the previous worker and rejects its pending promise.
+ *
+ * @returns `importFile(file, options?)`, which resolves with the parse result
+ *   or rejects with an `Error`; the `progress` / `progressMessage` /
+ *   `isImporting` / `error` state; and `reset()`, which clears that state.
+ */
+export function usePdfImport() {
+  const [progress, setProgress] = useState(0);
+  const [progressMessage, setProgressMessage] = useState('');
+  const [isImporting, setIsImporting] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const workerRef = useRef<Worker | null>(null);
+  // Rejects the promise of the import that owns `workerRef.current`.
+  const rejectRef = useRef<((reason: Error) => void) | null>(null);
+  // Bumped on every import (and on unmount) so a stale run can tell it lost.
+  const runIdRef = useRef(0);
+
+  /** Stops the in-flight import, if any, and settles its promise. */
+  const cancelInFlight = useCallback(() => {
+    workerRef.current?.terminate();
+    workerRef.current = null;
+    rejectRef.current?.(new Error(CANCELLED_MESSAGE));
+    rejectRef.current = null;
+  }, []);
+
+  useEffect(() => {
+    return () => {
+      runIdRef.current += 1;
+      cancelInFlight();
+    };
+  }, [cancelInFlight]);
+
+  const importFile = useCallback(
+    async (
+      file: File,
+      options?: PdfImportOptions
+    ): Promise<PdfParseResult> => {
+      runIdRef.current += 1;
+      const runId = runIdRef.current;
+      cancelInFlight();
+
+      setIsImporting(true);
+      setProgress(0);
+      setProgressMessage('');
+      setError(null);
+
+      // Builds the rejection for this run. State is only touched while the
+      // run is still current, so a superseded run cannot clobber a newer one.
+      const fail = (message: string): Error => {
+        if (runIdRef.current === runId) {
+          setError(message);
+          setIsImporting(false);
+        }
+        return new Error(message);
+      };
+
+      let buffer: ArrayBuffer;
+      let worker: Worker;
+      try {
+        buffer = await file.arrayBuffer();
+        // A newer import (or unmount) may have taken over while reading.
+        if (runIdRef.current !== runId) throw new Error(CANCELLED_MESSAGE);
+        worker = new Worker(
+          new URL('../workers/pdf-parser.worker.ts', import.meta.url),
+          { type: 'module' }
+        );
+      } catch (err) {
+        // Without this, a read/startup failure left `isImporting` stuck.
+        throw fail(describeError(err, PREPARE_FAILED_MESSAGE));
+      }
+
+      return new Promise<PdfParseResult>((resolve, reject) => {
+        workerRef.current = worker;
+        rejectRef.current = reject;
+
+        // Tears the worker down and drops the refs once this run is over.
+        const release = () => {
+          worker.terminate();
+          workerRef.current = null;
+          rejectRef.current = null;
+        };
+
+        worker.onmessage = (e: MessageEvent<PdfWorkerResponse>) => {
+          const msg = e.data;
+          switch (msg.type) {
+            case 'progress':
+              setProgress(msg.percent);
+              setProgressMessage(msg.message);
+              break;
+            case 'complete':
+              setProgress(100);
+              setProgressMessage('변환 완료');
+              setIsImporting(false);
+              release();
+              resolve(msg.result);
+              break;
+            case 'error':
+              release();
+              reject(fail(msg.error));
+              break;
+          }
+        };
+
+        worker.onerror = (err) => {
+          release();
+          reject(fail(err.message || WORKER_FAILED_MESSAGE));
+        };
+
+        try {
+          worker.postMessage(
+            {
+              type: 'parse',
+              file: buffer,
+              options,
+            },
+            // Transfer the buffer to avoid copying — buffer is no longer usable
+            // on the main thread after this call, which is fine since we just
+            // produced it from File.arrayBuffer().
+            [buffer]
+          );
+        } catch (err) {
+          // e.g. `options` that cannot be structured-cloned.
+          release();
+          reject(fail(describeError(err, WORKER_FAILED_MESSAGE)));
+        }
+      });
+    },
+    [cancelInFlight]
+  );
+
+  const reset = useCallback(() => {
+    setProgress(0);
+    setProgressMessage('');
+    setIsImporting(false);
+    setError(null);
+  }, []);
+
+  return {
+    importFile,
+    progress,
+    progressMessage,
+    isImporting,
+    error,
+    reset,
+  };
+}
From 6105b2926e21dffa9db1fea677dcb07a5a85f2d4 Mon Sep 17 00:00:00 2001 From: revfactory Date: Thu, 28 May 2026 20:41:16 +0900 Subject: [PATCH 4/5] =?UTF-8?q?feat:=20PDF=20=EC=9E=84=ED=8F=AC=ED=8A=B8?= =?UTF-8?q?=20=EC=A7=84=EC=9E=85=EC=A0=90=20=ED=86=B5=ED=95=A9=20(?= =?UTF-8?q?=EC=82=AC=EC=9D=B4=EB=93=9C=EB=B0=94=20+=20=EB=AA=A8=EB=8B=AC?= =?UTF-8?q?=20=EB=9D=BC=EC=9A=B0=ED=8C=85)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue #5 B 분기 — UI 진입점을 기존 MD/HWP 임포트 옆에 추가. - src/components/layout/sidebar.tsx: 사이드바 임포트 버튼 영역에 PDF 버튼 추가. 한 줄에 3개가 들어가도록 라벨을 MD/HWP/PDF 로 축약. onImportPdf prop 추가. - src/app/page.tsx: pdfImportOpen 상태 + handlePdfImportComplete 핸들러 추가. 변환 완료 후 "텍스트만 추출되었습니다. 표·이미지· OCR 미지원" 토스트 노출. 대용량 결과는 기존 청크 분할 파이프 라인 재사용. PdfImport 모달을 page tree 에 마운트. - src/lib/analytics.ts: analytics.importPdf 이벤트 추가.
Closes #5 --- src/app/page.tsx | 61 +++++++++++++++++++++++++++++++ src/components/layout/sidebar.tsx | 13 ++++++- src/lib/analytics.ts | 3 ++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/app/page.tsx b/src/app/page.tsx index a70432d..ea5bb42 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -14,6 +14,7 @@ import { DocumentTitle } from '@/components/features/editor/document-title'; import { TocPanel } from '@/components/features/editor/toc-panel'; import { HwpImport } from '@/components/features/import-export/hwp-import'; import { MarkdownImport } from '@/components/features/import-export/markdown-import'; +import { PdfImport } from '@/components/features/import-export/pdf-import'; import { HwpExport } from '@/components/features/import-export/hwp-export'; import { ExportMenu } from '@/components/features/import-export/export-menu'; import { QuickOpen } from '@/components/features/quick-open/quick-open'; @@ -44,6 +45,7 @@ export default function Home() { const [tiptapEditor, setTiptapEditor] = useState(null); const [hwpImportOpen, setHwpImportOpen] = useState(false); const [mdImportOpen, setMdImportOpen] = useState(false); + const [pdfImportOpen, setPdfImportOpen] = useState(false); const [hwpExportOpen, setHwpExportOpen] = useState(false); const [quickOpenOpen, setQuickOpenOpen] = useState(false); @@ -321,6 +323,58 @@ export default function Home() { [actions] ); + const handlePdfImportComplete = useCallback( + async (importedContent: string, title: string) => { + const CHUNK_THRESHOLD = 300_000; + + if (importedContent.length > CHUNK_THRESHOLD) { + const { chunkDocument } = await import('@/lib/chunk-document'); + const { saveChunks } = await import('@/db/chunks'); + + const chunks = chunkDocument(importedContent); + const id = await createDocument({ + title, + content: importedContent, + isChunked: true, + chunkCount: chunks.length, + }); + await saveChunks(id, chunks); + actions.setActiveDocument(id); + + 
import('@/stores/toast-store').then(({ useToastStore }) => { + useToastStore.getState().actions.addToast( + `대용량 문서가 ${chunks.length}개 페이지로 분할되었습니다.`, + 'success' + ); + }); + } else { + let htmlContent = ''; + try { + const { markdownToHtmlAsync } = await import('@/lib/markdown'); + htmlContent = await markdownToHtmlAsync(importedContent); + } catch { + // Fallback + } + const id = await createDocument({ title, content: importedContent, htmlContent }); + actions.setActiveDocument(id); + } + + setPdfImportOpen(false); + analytics.importPdf(); + + // Phase 2 (Branch B) — 텍스트만 추출됨을 사용자에게 안내 + import('@/stores/toast-store').then(({ useToastStore }) => { + useToastStore + .getState() + .actions.addToast( + '텍스트만 추출되었습니다. 표 · 이미지 · OCR은 지원하지 않습니다.', + 'info' + ); + }); + }, + [actions] + ); + useKeyboardShortcuts({ onNewDocument: handleNewDocument, onForceSave: handleForceSave, @@ -346,6 +400,7 @@ export default function Home() { onDuplicateDocument={handleDuplicateDocument} onImport={() => setHwpImportOpen(true)} onImportMarkdown={() => setMdImportOpen(true)} + onImportPdf={() => setPdfImportOpen(true)} /> } focusMode={focusMode} @@ -489,6 +544,12 @@ export default function Home() { onImportComplete={handleMdImportComplete} /> + setPdfImportOpen(false)} + onImportComplete={handlePdfImportComplete} + /> + setHwpExportOpen(false)} diff --git a/src/components/layout/sidebar.tsx b/src/components/layout/sidebar.tsx index baac676..0365bb1 100644 --- a/src/components/layout/sidebar.tsx +++ b/src/components/layout/sidebar.tsx @@ -18,6 +18,7 @@ import { Pencil, Trash2, FileCode, + FileType, } from 'lucide-react'; import { SearchInput } from '../ui/search-input'; import { Tooltip } from '../ui/tooltip'; @@ -45,6 +46,7 @@ export interface SidebarProps { onDuplicateDocument?: (id: string) => void; onImport?: () => void; onImportMarkdown?: () => void; + onImportPdf?: () => void; onSettings?: () => void; } @@ -214,6 +216,7 @@ export function Sidebar({ onDuplicateDocument, onImport, 
onImportMarkdown, + onImportPdf, onSettings, }: SidebarProps) { const [searchQuery, setSearchQuery] = useState(''); @@ -365,7 +368,7 @@ export function Sidebar({ className="flex items-center justify-center gap-1.5 flex-1 h-8 rounded-lg bg-[var(--color-accent)] text-white hover:opacity-90 transition-opacity cursor-pointer text-xs font-medium" > - Markdown + MD +
{/* Search */} diff --git a/src/lib/analytics.ts b/src/lib/analytics.ts index 0226e41..8052ba1 100644 --- a/src/lib/analytics.ts +++ b/src/lib/analytics.ts @@ -40,6 +40,9 @@ export const analytics = { importHwp: () => trackEvent({ action: 'import', category: 'file', label: 'hwp' }), + importPdf: () => + trackEvent({ action: 'import', category: 'file', label: 'pdf' }), + exportMarkdown: () => trackEvent({ action: 'export', category: 'file', label: 'markdown' }), From 06cb29786fd8b2cda124f9ee682ac570e857db3d Mon Sep 17 00:00:00 2001 From: revfactory Date: Thu, 28 May 2026 20:41:34 +0900 Subject: [PATCH 5/5] =?UTF-8?q?docs:=20CHANGELOG=20=EC=97=90=20PDF=20?= =?UTF-8?q?=EC=9E=84=ED=8F=AC=ED=8A=B8(Phase=202,=20Branch=20B)=20?= =?UTF-8?q?=ED=95=AD=EB=AA=A9=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refs #5 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a930cb4..cd2f5e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ ## [Unreleased] ### Added -- _아직 항목 없음_ +- PDF 임포트 (Phase 2, Branch B) — `pdfjs-dist` 기반 Web Worker 로 텍스트 + 기본 구조(제목 · 문단 · 목록)만 추출. 표 · 이미지 · OCR 은 미지원 (후속). 사이드바에 PDF 버튼 추가. ([#5](https://github.com/revfactory/mdview/issues/5)) ### Changed - _아직 항목 없음_