KnowledgeBase/app/Services/DocumentPreviewService.php

<?php

namespace App\Services;

use App\Models\Document;
use Illuminate\Support\Facades\Storage;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\Settings;

/**
 * Renders stored documents as HTML (or plain text) for in-app preview.
 *
 * Word files are read with PHPWord and written out through its HTML writer;
 * Markdown exports are rendered through MarkdownRenderService.
 */
class DocumentPreviewService
{
    /**
     * Signature found at the start of legacy OLE2 compound files (binary .doc).
     */
    private const OLE_SIGNATURE = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";

    /**
     * Convert a document to HTML for preview.
     *
     * The Filament back office renders straight from the Word file so that
     * images display correctly; the Markdown export is not used here because
     * its image paths are harder to resolve.
     *
     * @param Document $document
     * @return string HTML content
     * @throws \Exception When the Word file cannot be converted (the original
     *                    exception is attached as "previous").
     */
    public function convertToHtml(Document $document): string
    {
        try {
            return $this->convertWordToHtml($document);
        } catch (\Exception $e) {
            // Chain the original exception so the root cause and stack trace are not lost.
            throw new \Exception('文档预览失败：' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Convert the document's Markdown export to HTML (used by the dedicated
     * Markdown preview page).
     *
     * @param Document $document
     * @return string HTML content
     * @throws \Exception When the document has no Markdown content.
     */
    public function convertMarkdownToHtml(Document $document): string
    {
        $markdownContent = $document->getMarkdownContent();

        if (empty($markdownContent)) {
            throw new \Exception('Markdown 内容为空');
        }

        // Directory of the Markdown file relative to the markdown root (e.g. 2025/12/04).
        $markdownDir = dirname((string) $document->markdown_path);

        // Rewrite relative image links "(./media/x)" to "(/markdown/{dir}/media/x)".
        $rewritten = preg_replace_callback(
            '/\(\.\/media\/([^)]+)\)/',
            static function (array $matches) use ($markdownDir): string {
                return '(/markdown/' . $markdownDir . '/media/' . $matches[1] . ')';
            },
            $markdownContent
        );

        // preg_replace_callback() yields null on a PCRE error; render the
        // unmodified Markdown in that case instead of passing null along.
        if ($rewritten !== null) {
            $markdownContent = $rewritten;
        }

        return app(MarkdownRenderService::class)->render($markdownContent);
    }

    /**
     * Convert the stored Word file directly to HTML.
     *
     * @param Document $document
     * @return string HTML content
     * @throws \Exception When the file is missing or the temporary HTML file
     *                    cannot be created or read.
     */
    protected function convertWordToHtml(Document $document): string
    {
        // Point PHPWord at the application's own temp directory.
        Settings::setTempDir(storage_path('app/temp'));

        $phpWord = $this->loadWordDocument($document);

        // Collect images as data URIs so they can be inlined into the HTML.
        $images = $this->extractImagesFromDocument($phpWord);

        $htmlWriter = IOFactory::createWriter($phpWord, 'HTML');

        // tempnam() creates the placeholder file itself; the writer targets a
        // sibling path with an .html suffix, so BOTH paths must be cleaned up.
        $tempBase = tempnam(sys_get_temp_dir(), 'doc_preview_');
        if ($tempBase === false) {
            throw new \Exception('无法创建临时文件');
        }
        $tempHtmlFile = $tempBase . '.html';

        try {
            $htmlWriter->save($tempHtmlFile);
            $htmlContent = file_get_contents($tempHtmlFile);
        } finally {
            // Runs on success and on failure, so no temp file is left behind.
            foreach ([$tempHtmlFile, $tempBase] as $path) {
                if (is_file($path)) {
                    unlink($path);
                }
            }
        }

        if ($htmlContent === false) {
            throw new \Exception('无法读取转换后的 HTML 内容');
        }

        $htmlContent = $this->embedImagesInHtml($htmlContent, $images);

        return $this->cleanHtml($htmlContent);
    }

    /**
     * Load the document's file from the "local" disk with a reader that
     * matches the actual file format.
     *
     * @param Document $document
     * @return \PhpOffice\PhpWord\PhpWord
     * @throws \Exception When the file does not exist on the disk.
     */
    private function loadWordDocument(Document $document)
    {
        $disk = Storage::disk('local');

        if (!$disk->exists($document->file_path)) {
            throw new \Exception('文档文件不存在');
        }

        $filePath = $disk->path($document->file_path);

        return IOFactory::load($filePath, $this->detectReaderName($filePath));
    }

    /**
     * Pick the PHPWord reader name for a file by inspecting its signature.
     *
     * IOFactory::load() defaults to the "Word2007" (.docx) reader; legacy
     * binary .doc files are OLE2 compound files and need the "MsDoc" reader.
     * Anything that is not recognisably OLE2 keeps the previous default.
     *
     * @param string $filePath Absolute path of the file to inspect.
     * @return string "MsDoc" or "Word2007"
     */
    private function detectReaderName(string $filePath): string
    {
        if (!is_readable($filePath)) {
            return 'Word2007';
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            return 'Word2007';
        }

        try {
            $signature = fread($handle, strlen(self::OLE_SIGNATURE));
        } finally {
            fclose($handle);
        }

        return $signature === self::OLE_SIGNATURE ? 'MsDoc' : 'Word2007';
    }

    /**
     * Extract every image of a Word document as a data URI.
     *
     * Images are returned in document order, which is what
     * embedImagesInHtml() relies on when matching them to <img> tags.
     *
     * @param \PhpOffice\PhpWord\PhpWord $phpWord
     * @return array Zero-based list of "data:{mime};base64,{payload}" strings.
     */
    protected function extractImagesFromDocument($phpWord): array
    {
        $images = [];

        foreach ($phpWord->getSections() as $section) {
            foreach ($this->collectImageDataUris($section->getElements()) as $dataUri) {
                $images[] = $dataUri;
            }
        }

        return $images;
    }

    /**
     * Walk a list of elements depth-first and return the data URIs of all
     * images found, including those nested in containers and table cells.
     *
     * @param iterable $elements PHPWord elements of one container.
     * @return array List of data URI strings in traversal order.
     */
    private function collectImageDataUris(iterable $elements): array
    {
        $dataUris = [];

        foreach ($elements as $element) {
            if ($element instanceof \PhpOffice\PhpWord\Element\Image) {
                $dataUri = $this->imageToDataUri($element);
                if ($dataUri !== null) {
                    $dataUris[] = $dataUri;
                }
                continue;
            }

            // Tables expose rows/cells rather than getElements().
            if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
                foreach ($element->getRows() as $row) {
                    foreach ($row->getCells() as $cell) {
                        foreach ($this->collectImageDataUris($cell->getElements()) as $dataUri) {
                            $dataUris[] = $dataUri;
                        }
                    }
                }
                continue;
            }

            if (method_exists($element, 'getElements')) {
                foreach ($this->collectImageDataUris($element->getElements()) as $dataUri) {
                    $dataUris[] = $dataUri;
                }
            }
        }

        return $dataUris;
    }

    /**
     * Build a data URI for a single PHPWord image element.
     *
     * @param \PhpOffice\PhpWord\Element\Image $image
     * @return string|null The data URI, or null when the image bytes cannot be obtained.
     */
    private function imageToDataUri($image): ?string
    {
        $source = (string) $image->getSource();
        $base64 = null;

        if ($source !== '' && is_file($source)) {
            $binary = file_get_contents($source);
            if ($binary !== false) {
                $base64 = base64_encode($binary);
            }
        } elseif (method_exists($image, 'getImageStringData')) {
            // NOTE(review): images read from a .docx are expected to carry a
            // "zip://archive#entry" source, which is_file() cannot stat, so
            // PHPWord is asked for the bytes instead — confirm against the
            // installed PHPWord version.
            $encoded = $image->getImageStringData(true);
            if (is_string($encoded) && $encoded !== '') {
                // Drop the line breaks of chunk-split base64 output.
                $base64 = str_replace(["\r", "\n", ' '], '', $encoded);
            }
        }

        if ($base64 === null) {
            return null;
        }

        $mimeType = $this->getImageMimeType((string) $image->getImageType());

        return "data:{$mimeType};base64,{$base64}";
    }

    /**
     * Resolve an image type to a MIME type.
     *
     * Accepts either a file extension ("png") or a value that already is an
     * image MIME type ("image/png"); the latter is returned as-is (lower-cased)
     * instead of being mislabelled with the JPEG fallback.
     *
     * @param string $imageType Extension or MIME type, case-insensitive.
     * @return string MIME type; "image/jpeg" when the type is unknown.
     */
    protected function getImageMimeType(string $imageType): string
    {
        $normalized = strtolower(trim($imageType));

        if (strpos($normalized, 'image/') === 0) {
            return $normalized;
        }

        $mimeTypes = [
            'jpg' => 'image/jpeg',
            'jpeg' => 'image/jpeg',
            'png' => 'image/png',
            'gif' => 'image/gif',
            'bmp' => 'image/bmp',
            'svg' => 'image/svg+xml',
        ];

        return $mimeTypes[$normalized] ?? 'image/jpeg';
    }

    /**
     * Replace the src of <img> tags in the HTML with the extracted data URIs.
     *
     * Tags whose src already is a data URI or an http(s) URL are left
     * untouched. The remaining tags consume $images in order; once the list
     * is exhausted the original src is kept.
     *
     * @param string $html
     * @param array $images Zero-based list of data URIs, in document order.
     * @return string The rewritten HTML, or the input unchanged on a PCRE error.
     */
    protected function embedImagesInHtml(string $html, array $images): string
    {
        $imageIndex = 0;

        $result = preg_replace_callback(
            '/<img([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>/i',
            function (array $matches) use ($images, &$imageIndex): string {
                $beforeSrc = $matches[1];
                $src = $matches[2];
                $afterSrc = $matches[3];

                // Already inline or remote: nothing to embed.
                if (preg_match('#^(?:data:|https?://)#i', $src) === 1) {
                    return $matches[0];
                }

                if (isset($images[$imageIndex])) {
                    $src = $images[$imageIndex];
                    $imageIndex++;
                }

                return "<img{$beforeSrc}src=\"{$src}\"{$afterSrc}>";
            },
            $html
        );

        // null signals a PCRE failure; keep the original markup in that case.
        return $result ?? $html;
    }

    /**
     * Reduce the writer output to its <body> content and wrap it in a styled
     * preview container.
     *
     * @param string $html Full HTML document or fragment.
     * @return string The styled HTML fragment.
     */
    protected function cleanHtml(string $html): string
    {
        $html = $this->extractBodyContent($html);

        // Basic presentation styles for the preview container.
        $styledHtml = '<div class="document-preview" style="
            font-family: -apple-system, BlinkMacSystemFont, \'Segoe UI\', Roboto, \'Helvetica Neue\', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            padding: 20px;
            background: white;
            border-radius: 8px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        ">';

        $styledHtml .= $html;
        $styledHtml .= '</div>';

        return $styledHtml;
    }

    /**
     * Return the markup between the first <body ...> tag and the first
     * following </body>, or the input unchanged when no such pair exists.
     *
     * Plain string searching is used on purpose: a lazy regex over HTML that
     * carries multi-megabyte base64 images can run into PCRE limits and fail.
     *
     * @param string $html
     * @return string
     */
    private function extractBodyContent(string $html): string
    {
        $openStart = stripos($html, '<body');
        if ($openStart === false) {
            return $html;
        }

        $openEnd = strpos($html, '>', $openStart + 5);
        if ($openEnd === false) {
            return $html;
        }

        $closeStart = stripos($html, '</body>', $openEnd + 1);
        if ($closeStart === false) {
            return $html;
        }

        return substr($html, $openEnd + 1, $closeStart - $openEnd - 1);
    }

    /**
     * Tell whether a document can be previewed, based on its file extension.
     *
     * @param Document $document
     * @return bool True for .doc and .docx files (case-insensitive).
     */
    public function canPreview(Document $document): bool
    {
        $extension = strtolower(pathinfo((string) $document->file_name, PATHINFO_EXTENSION));

        return in_array($extension, ['doc', 'docx'], true);
    }

    /**
     * Extract the plain text of a document (for search and similar uses).
     *
     * Only top-level section elements that expose getText() contribute; each
     * contributes one line.
     *
     * @param Document $document
     * @return string Trimmed text content.
     * @throws \Exception When the file is missing or cannot be read (the
     *                    original exception is attached as "previous").
     */
    public function extractText(Document $document): string
    {
        try {
            $phpWord = $this->loadWordDocument($document);

            $text = '';
            foreach ($phpWord->getSections() as $section) {
                foreach ($section->getElements() as $element) {
                    if (method_exists($element, 'getText')) {
                        $text .= $element->getText() . "\n";
                    }
                }
            }

            return trim($text);
        } catch (\Exception $e) {
            throw new \Exception('文本提取失败：' . $e->getMessage(), 0, $e);
        }
    }
}