refactor: 重构知识库文件上传和处理，支持 PDF

2026-03-23 16:30:13 +08:00
parent 89af7c17f1
commit 63ea2686e1
17 changed files with 905 additions and 1782 deletions
--- a/app/Services/DocumentPreviewService.php
+++ b/app/Services/DocumentPreviewService.php
@@ -4,33 +4,25 @@ namespace App\Services;

 use App\Models\Document;
 use Illuminate\Support\Facades\Storage;
-use PhpOffice\PhpWord\IOFactory;
-use PhpOffice\PhpWord\Settings;

 class DocumentPreviewService
 {
    /**
-     * 将文档转换为 HTML 用于预览
-     * 在 Filament 后台中，直接从 Word 转换以保证图片正确显示
-     * 
+     * 将文档的 Markdown 内容转换为 HTML 用于预览
+     * 统一用于 Filament 后台内联预览和独立预览页面
+     *
     * @param Document $document
     * @return string HTML 内容
     * @throws \Exception
     */
    public function convertToHtml(Document $document): string
    {
-        try {
-            // 直接从 Word 转换，以确保图片正确显示
-            // Markdown 转换的图片路径问题较复杂，暂时不使用
-            return $this->convertWordToHtml($document);
-        } catch (\Exception $e) {
-            throw new \Exception('文档预览失败：' . $e->getMessage());
-        }
+        return $this->convertMarkdownToHtml($document);
    }

    /**
-     * 将 Markdown 转换为 HTML（用于专门的 Markdown 预览页面）
-     * 
+     * 将 Markdown 转换为 HTML
+     *
     * @param Document $document
     * @return string HTML 内容
     * @throws \Exception
@@ -38,15 +30,15 @@ class DocumentPreviewService
    public function convertMarkdownToHtml(Document $document): string
    {
        $markdownContent = $document->getMarkdownContent();
-        
+
        if (empty($markdownContent)) {
            throw new \Exception('Markdown 内容为空');
        }

-        // 获取 Markdown 文件的目录（例如：2025/12/04）
+        // 获取 Markdown 文件的目录
        $markdownDir = dirname($document->markdown_path);

-        // 修复图片路径：将 ./media/ 替换为 /markdown/{date}/media/
+        // 修复图片路径：将 ./media/ 替换为 /markdown/{dir}/media/
        $markdownContent = preg_replace_callback(
            '/\(\.\/media\/([^)]+)\)/',
            function ($matches) use ($markdownDir) {
@@ -58,250 +50,19 @@ class DocumentPreviewService

        // 使用 MarkdownRenderService 转换为 HTML
        $renderService = app(MarkdownRenderService::class);
-        $htmlContent = $renderService->render($markdownContent);

-        return $htmlContent;
-    }
-
-    /**
-     * 直接从 Word 文档转换为 HTML
-     * 
-     * @param Document $document
-     * @return string HTML 内容
-     * @throws \Exception
-     */
-    protected function convertWordToHtml(Document $document): string
-    {
-        // 检查文件是否存在
-        if (!Storage::disk('local')->exists($document->file_path)) {
-            throw new \Exception('文档文件不存在');
-        }
-
-        // 获取文件的完整路径
-        $filePath = Storage::disk('local')->path($document->file_path);
-
-        // 确保临时目录存在并设置 PHPWord 的临时目录
-        $tempDir = storage_path('app/temp');
-        if (!is_dir($tempDir)) {
-            mkdir($tempDir, 0755, true);
-        }
-        Settings::setTempDir($tempDir);
-
-        // 加载 Word 文档
-        $phpWord = IOFactory::load($filePath);
-
-        // 提取图片并转换为 base64
-        $images = $this->extractImagesFromDocument($phpWord);
-
-        // 创建 HTML Writer
-        $htmlWriter = IOFactory::createWriter($phpWord, 'HTML');
-
-        // 使用 Laravel 存储系统创建临时文件
-        $tempFileName = 'temp/doc_preview_' . uniqid() . '.html';
-        
-        // 确保临时目录存在
-        if (!Storage::disk('local')->exists('temp')) {
-            Storage::disk('local')->makeDirectory('temp');
-        }
-        
-        $tempHtmlPath = Storage::disk('local')->path($tempFileName);
-        $htmlWriter->save($tempHtmlPath);
-
-        // 读取 HTML 内容
-        $htmlContent = Storage::disk('local')->get($tempFileName);
-
-        // 删除临时文件
-        Storage::disk('local')->delete($tempFileName);
-
-        // 将图片嵌入为 base64
-        $htmlContent = $this->embedImagesInHtml($htmlContent, $images);
-
-        // 清理和美化 HTML
-        $htmlContent = $this->cleanHtml($htmlContent);
-
-        return $htmlContent;
-    }
-
-    /**
-     * 从 Word 文档中提取所有图片
-     * 
-     * @param \PhpOffice\PhpWord\PhpWord $phpWord
-     * @return array 图片数组，键为图片索引，值为 base64 编码的图片数据
-     */
-    protected function extractImagesFromDocument($phpWord): array
-    {
-        $images = [];
-        $imageIndex = 0;
-
-        foreach ($phpWord->getSections() as $section) {
-            foreach ($section->getElements() as $element) {
-                // 处理图片元素
-                if (method_exists($element, 'getElements')) {
-                    foreach ($element->getElements() as $childElement) {
-                        if ($childElement instanceof \PhpOffice\PhpWord\Element\Image) {
-                            $imageSource = $childElement->getSource();
-                            if (file_exists($imageSource)) {
-                                $imageData = file_get_contents($imageSource);
-                                $imageType = $childElement->getImageType();
-                                $mimeType = $this->getImageMimeType($imageType);
-                                $base64 = base64_encode($imageData);
-                                $images[$imageIndex] = "data:{$mimeType};base64,{$base64}";
-                                $imageIndex++;
-                            }
-                        }
-                    }
-                } elseif ($element instanceof \PhpOffice\PhpWord\Element\Image) {
-                    $imageSource = $element->getSource();
-                    if (file_exists($imageSource)) {
-                        $imageData = file_get_contents($imageSource);
-                        $imageType = $element->getImageType();
-                        $mimeType = $this->getImageMimeType($imageType);
-                        $base64 = base64_encode($imageData);
-                        $images[$imageIndex] = "data:{$mimeType};base64,{$base64}";
-                        $imageIndex++;
-                    }
-                }
-            }
-        }
-
-        return $images;
-    }
-
-    /**
-     * 根据图片类型获取 MIME 类型
-     * 
-     * @param string $imageType
-     * @return string
-     */
-    protected function getImageMimeType(string $imageType): string
-    {
-        $mimeTypes = [
-            'jpg' => 'image/jpeg',
-            'jpeg' => 'image/jpeg',
-            'png' => 'image/png',
-            'gif' => 'image/gif',
-            'bmp' => 'image/bmp',
-            'svg' => 'image/svg+xml',
-        ];
-
-        return $mimeTypes[strtolower($imageType)] ?? 'image/jpeg';
-    }
-
-    /**
-     * 将 HTML 中的图片替换为 base64 编码
-     * 
-     * @param string $html
-     * @param array $images
-     * @return string
-     */
-    protected function embedImagesInHtml(string $html, array $images): string
-    {
-        // PHPWord 生成的 HTML 中，图片通常以 <img src="..." /> 的形式存在
-        // 我们需要将这些图片路径替换为 base64 数据
-        
-        $imageIndex = 0;
-        $html = preg_replace_callback(
-            '/<img([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>/i',
-            function ($matches) use ($images, &$imageIndex) {
-                $beforeSrc = $matches[1];
-                $src = $matches[2];
-                $afterSrc = $matches[3];
-
-                // 如果已经是 base64 或 http 链接，不处理
-                if (strpos($src, 'data:') === 0 || strpos($src, 'http') === 0) {
-                    return $matches[0];
-                }
-
-                // 使用提取的图片数据
-                if (isset($images[$imageIndex])) {
-                    $src = $images[$imageIndex];
-                    $imageIndex++;
-                }
-
-                return "<img{$beforeSrc}src=\"{$src}\"{$afterSrc}>";
-            },
-            $html
-        );
-
-        return $html;
-    }
-
-
-
-    /**
-     * 清理和美化 HTML 内容
-     * 
-     * @param string $html
-     * @return string
-     */
-    protected function cleanHtml(string $html): string
-    {
-        // 提取 body 内容
-        if (preg_match('/<body[^>]*>(.*?)<\/body>/is', $html, $matches)) {
-            $html = $matches[1];
-        }
-
-        // 添加基本样式
-        $styledHtml = '<div class="document-preview" style="
-            font-family: -apple-system, BlinkMacSystemFont, \'Segoe UI\', Roboto, \'Helvetica Neue\', Arial, sans-serif;
-            line-height: 1.6;
-            color: #333;
-            padding: 20px;
-            background: white;
-            border-radius: 8px;
-            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
-        ">';
-        
-        $styledHtml .= $html;
-        $styledHtml .= '</div>';
-
-        return $styledHtml;
+        return $renderService->render($markdownContent);
    }

    /**
     * 检查文档是否可以预览
-     * 
+     *
     * @param Document $document
     * @return bool
     */
    public function canPreview(Document $document): bool
    {
-        // 检查文件扩展名
-        $extension = strtolower(pathinfo($document->file_name, PATHINFO_EXTENSION));
-        
-        // 目前支持 .doc 和 .docx
-        return in_array($extension, ['doc', 'docx']);
-    }
-
-    /**
-     * 获取文档预览的纯文本内容（用于搜索等）
-     * 
-     * @param Document $document
-     * @return string
-     * @throws \Exception
-     */
-    public function extractText(Document $document): string
-    {
-        try {
-            if (!Storage::disk('local')->exists($document->file_path)) {
-                throw new \Exception('文档文件不存在');
-            }
-
-            $filePath = Storage::disk('local')->path($document->file_path);
-            $phpWord = IOFactory::load($filePath);
-
-            $text = '';
-            foreach ($phpWord->getSections() as $section) {
-                foreach ($section->getElements() as $element) {
-                    if (method_exists($element, 'getText')) {
-                        $text .= $element->getText() . "\n";
-                    }
-                }
-            }
-
-            return trim($text);
-        } catch (\Exception $e) {
-            throw new \Exception('文本提取失败：' . $e->getMessage());
-        }
+        return $document->conversion_status === 'completed'
+            && !empty($document->markdown_path);
    }
 }