previewLength = config('documents.markdown.preview_length', 500);
    }

    /**
     * Convert a document to Markdown.
     *
     * Opens the stored source file with OCR disabled, renders it as Markdown
     * and collects the embedded images found in the parsed document.
     *
     * @return array{markdown: string, media_files: array}
     *
     * @throws \RuntimeException When the conversion library is not installed.
     * @throws \Exception When the source file is missing or the rendered Markdown is blank.
     */
    public function convertToMarkdown(Document $document): array
    {
        $this->ensureConversionDependenciesAvailable();

        $documentPath = Storage::disk('local')->path($document->file_path);

        if (!file_exists($documentPath)) {
            throw new \Exception("文档文件不存在: {$documentPath}");
        }

        $doc = DocumentManager::open($documentPath, ['ocr' => false]);
        $markdown = DocumentManager::renderAs($doc, 'md');

        // Compare against '' instead of using empty(): empty('0') is true, which
        // would wrongly reject a document whose whole content is "0".
        if (trim($markdown) === '') {
            throw new \Exception('文档转换后内容为空,可能是扫描件或不支持的内容格式');
        }

        return [
            'markdown' => $markdown,
            'media_files' => $this->extractMarkdownMediaFiles($doc),
        ];
    }

    /**
     * Ensure the document conversion library is installed.
     *
     * @throws \RuntimeException When the DocumentManager class cannot be loaded.
     */
    protected function ensureConversionDependenciesAvailable(): void
    {
        if (!class_exists(DocumentManager::class)) {
            throw new \RuntimeException(
                '文档转换依赖未安装:paperdoc-dev/paperdoc-lib。请执行 composer install 后重试。'
            );
        }
    }

    /**
     * Save Markdown content to the "markdown" disk, together with its media files.
     *
     * @param array $mediaFiles Map of relative media path => file contents.
     *
     * @return string The path of the stored Markdown file on the "markdown" disk.
     *
     * @throws \Exception When the Markdown file cannot be written.
     */
    public function saveMarkdownToFile(Document $document, string $markdown, array $mediaFiles = []): string
    {
        $path = $this->generateMarkdownPath($document);

        $saved = Storage::disk('markdown')->put($path, $markdown);

        if (!$saved) {
            throw new \Exception('无法保存 Markdown 文件');
        }

        // Media files live next to the Markdown file so relative "media/..." links resolve.
        $this->storeMarkdownMediaFiles(dirname($path), $mediaFiles);

        return $path;
    }

    /**
     * Restore image assets that an already-converted Markdown document
     * references but that are missing from the "markdown" disk.
     *
     * Does nothing when the document has no Markdown path, no Markdown content,
     * no "media/..." image references, or when every referenced file exists.
     *
     * @throws \RuntimeException When the conversion library is not installed.
     * @throws \Exception When the source document file is missing.
     */
    public function ensureMarkdownMediaAssets(Document $document): void
    {
        $this->ensureConversionDependenciesAvailable();

        if (empty($document->markdown_path)) {
            return;
        }

        $markdown = $document->getMarkdownContent();

        if (empty($markdown)) {
            return;
        }

        // Capture the target of every image link that points into "media/" (optionally "./media/").
        if (!preg_match_all('/!\[[^\]]*]\(((?:\.\/)?media\/[^)]+)\)/', $markdown, $matches)) {
            return;
        }

        $documentDir = dirname($document->markdown_path);
        $missingRefs = [];

        foreach ($matches[1] as $ref) {
            $relativePath = $this->normalizeMarkdownMediaPath($ref);

            if ($relativePath === null) {
                continue;
            }

            if (!Storage::disk('markdown')->exists($documentDir . '/' . $relativePath)) {
                $missingRefs[] = $relativePath;
            }
        }

        if ($missingRefs === []) {
            return;
        }

        $documentPath = Storage::disk('local')->path($document->file_path);

        if (!file_exists($documentPath)) {
            throw new \Exception("文档文件不存在: {$documentPath}");
        }

        $doc = DocumentManager::open($documentPath, ['ocr' => false]);

        // Re-extract the images from the source and keep only the missing ones.
        $mediaFiles = array_intersect_key(
            $this->extractMarkdownMediaFiles($doc),
            array_flip($missingRefs)
        );

        $this->storeMarkdownMediaFiles($documentDir, $mediaFiles);
    }

    /**
     * Build the storage path for a document's Markdown file.
     *
     * The file is placed in a directory named after a fresh UUID; when
     * "documents.storage.organize_by_date" is enabled that directory is nested
     * under a Y/m/d prefix taken from the document's creation date (or now).
     */
    protected function generateMarkdownPath(Document $document): string
    {
        $organizeByDate = config('documents.storage.organize_by_date', true);
        $uuid = Str::uuid()->toString();

        if ($organizeByDate) {
            $date = $document->created_at ?? now();
            $directory = $date->format('Y/m/d') . '/' . $uuid;
        } else {
            $directory = $uuid;
        }

        return "{$directory}/{$uuid}.md";
    }

    /**
     * Return a single-line preview of the Markdown content.
     *
     * Whitespace runs are collapsed to one space and the result is trimmed; if
     * it is longer than $length characters it is cut and "..." is appended.
     *
     * @param int|null $length Maximum preview length in characters; defaults to the configured preview length.
     */
    public function getMarkdownPreview(string $markdown, ?int $length = null): string
    {
        $length ??= $this->previewLength;

        // The /u modifier makes the match UTF-8 aware so multi-byte text is never
        // treated byte-wise. With /u, preg_replace() returns null on malformed
        // UTF-8, so fall back to the byte-wise pattern and finally to the raw input
        // rather than passing null on to trim().
        $cleaned = preg_replace('/\s+/u', ' ', $markdown)
            ?? preg_replace('/\s+/', ' ', $markdown)
            ?? $markdown;
        $cleaned = trim($cleaned);

        if (mb_strlen($cleaned) <= $length) {
            return $cleaned;
        }

        return mb_substr($cleaned, 0, $length) . '...';
    }

    /**
     * Record a successful conversion on the document.
     *
     * Sets the Markdown path, marks the conversion as completed and clears any
     * previous error, without syncing the model to the search index. A warning
     * is logged when the stored Markdown file cannot be read back.
     */
    public function updateDocumentMarkdown(Document $document, string $markdownPath): void
    {
        $markdown = Storage::disk('markdown')->get($markdownPath);

        // An unreadable file is reported as null by current Laravel filesystem
        // adapters and as false by older ones; treat both as "could not read".
        // NOTE(review): the preview is not persisted by this method, so it is no
        // longer computed here only to be thrown away.
        if ($markdown === null || $markdown === false) {
            Log::warning('无法读取 Markdown 文件以生成预览', [
                'document_id' => $document->id,
                'markdown_path' => $markdownPath,
            ]);
        }

        Document::withoutSyncingToSearch(function () use ($document, $markdownPath): void {
            $document->update([
                'markdown_path' => $markdownPath,
                'conversion_status' => 'completed',
                'conversion_error' => null,
            ]);
        });
    }

    /**
     * Record a failed conversion.
     *
     * Logs the failure with its stack trace and stores the "failed" status and
     * the exception message on the document, without syncing to search.
     */
    public function handleConversionFailure(Document $document, \Exception $exception): void
    {
        Log::error('文档转换失败', [
            'document_id' => $document->id,
            'document_title' => $document->title,
            'file_name' => $document->file_name,
            'error' => $exception->getMessage(),
            'trace' => $exception->getTraceAsString(),
        ]);

        Document::withoutSyncingToSearch(function () use ($document, $exception): void {
            $document->update([
                'conversion_status' => 'failed',
                'conversion_error' => $exception->getMessage(),
            ]);
        });
    }

    /**
     * Mark the document as "processing" and dispatch the conversion job on the
     * queue named by "documents.conversion.queue" (default "documents").
     */
    public function queueConversion(Document $document): void
    {
        Document::withoutSyncingToSearch(function () use ($document): void {
            $document->update([
                'conversion_status' => 'processing',
                'conversion_error' => null,
            ]);
        });

        $queue = config('documents.conversion.queue', 'documents');

        \App\Jobs\ConvertDocumentToMarkdown::dispatch($document)->onQueue($queue);
    }

    /**
     * Collect the embedded images of a parsed document.
     *
     * Images whose source normalises to a "media/..." path keep that path;
     * all others get a generated "media/image-N.ext" name.
     *
     * @return array Map of relative media path => image data as returned by the image element.
     */
    protected function extractMarkdownMediaFiles(DocumentInterface $document): array
    {
        $mediaFiles = [];
        $fallbackIndex = 1;

        foreach ($document->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (!$element instanceof Image || !$element->hasData()) {
                    continue;
                }

                $relativePath = $this->normalizeMarkdownMediaPath($element->getSrc());

                if ($relativePath === null) {
                    $relativePath = sprintf(
                        'media/image-%d.%s',
                        $fallbackIndex++,
                        $this->guessImageExtension($element)
                    );
                }

                $mediaFiles[$relativePath] = $element->getData();
            }
        }

        return $mediaFiles;
    }

    /**
     * Write media files below the given document directory on the "markdown" disk.
     *
     * Storing is best-effort: a file that cannot be written is logged and
     * skipped instead of aborting the remaining files.
     *
     * @param array $mediaFiles Map of relative media path => file contents.
     */
    protected function storeMarkdownMediaFiles(string $documentDir, array $mediaFiles): void
    {
        $disk = Storage::disk('markdown');

        foreach ($mediaFiles as $relativePath => $contents) {
            $targetPath = $documentDir . '/' . ltrim($relativePath, '/');
            $targetDirectory = dirname($targetPath);

            if ($targetDirectory !== '.' && !$disk->exists($targetDirectory)) {
                $disk->makeDirectory($targetDirectory);
            }

            // Do not let a failed write pass unnoticed.
            if (!$disk->put($targetPath, $contents)) {
                Log::warning('Failed to store Markdown media file', [
                    'target_path' => $targetPath,
                ]);
            }
        }
    }

    /**
     * Normalise an image reference to a safe relative "media/..." path.
     *
     * @return string|null The cleaned path, or null when the reference is empty,
     *                     a URL or data URI, outside "media/", lacks a file
     *                     name, or contains a ".." segment.
     */
    protected function normalizeMarkdownMediaPath(string $path): ?string
    {
        $path = trim($path);

        if ($path === '' || str_contains($path, '://') || str_starts_with($path, 'data:')) {
            return null;
        }

        // Convert backslashes first so ".\media\a.png" is handled exactly like
        // "./media/a.png" when the leading "./" or "/" is stripped below.
        $path = str_replace('\\', '/', $path);
        $path = preg_replace('/^\.?\//', '', $path) ?? $path;
        $path = ltrim($path, '/');

        if (!str_starts_with($path, 'media/')) {
            return null;
        }

        // Drop empty and "." segments ("media//a.png", "media/./a.png").
        $segments = array_values(array_filter(
            explode('/', $path),
            static fn (string $segment): bool => $segment !== '' && $segment !== '.'
        ));

        // A usable reference needs a file name after "media" and must not climb
        // out of the document directory.
        if (count($segments) < 2 || in_array('..', $segments, true)) {
            return null;
        }

        return implode('/', $segments);
    }

    /**
     * Pick a file extension for an image from its MIME type, falling back to
     * the extension of its source path and finally to "bin".
     */
    protected function guessImageExtension(Image $image): string
    {
        return match ($image->getMimeType()) {
            'image/jpeg' => 'jpg',
            'image/png' => 'png',
            'image/gif' => 'gif',
            'image/webp' => 'webp',
            'image/bmp' => 'bmp',
            'image/tiff' => 'tiff',
            'image/svg+xml' => 'svg',
            default => pathinfo($image->getSrc(), PATHINFO_EXTENSION) ?: 'bin',
        };
    }
}