refactor: 重构知识库文件上传和处理, 支持 pdf
This commit is contained in:
@@ -4,33 +4,25 @@ namespace App\Services;
|
||||
|
||||
use App\Models\Document;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
use PhpOffice\PhpWord\IOFactory;
|
||||
use PhpOffice\PhpWord\Settings;
|
||||
|
||||
class DocumentPreviewService
|
||||
{
|
||||
/**
|
||||
* 将文档转换为 HTML 用于预览
|
||||
* 在 Filament 后台中,直接从 Word 转换以保证图片正确显示
|
||||
*
|
||||
* 将文档的 Markdown 内容转换为 HTML 用于预览
|
||||
* 统一用于 Filament 后台内联预览和独立预览页面
|
||||
*
|
||||
* @param Document $document
|
||||
* @return string HTML 内容
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function convertToHtml(Document $document): string
|
||||
{
|
||||
try {
|
||||
// 直接从 Word 转换,以确保图片正确显示
|
||||
// Markdown 转换的图片路径问题较复杂,暂时不使用
|
||||
return $this->convertWordToHtml($document);
|
||||
} catch (\Exception $e) {
|
||||
throw new \Exception('文档预览失败:' . $e->getMessage());
|
||||
}
|
||||
return $this->convertMarkdownToHtml($document);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将 Markdown 转换为 HTML(用于专门的 Markdown 预览页面)
|
||||
*
|
||||
* 将 Markdown 转换为 HTML
|
||||
*
|
||||
* @param Document $document
|
||||
* @return string HTML 内容
|
||||
* @throws \Exception
|
||||
@@ -38,15 +30,15 @@ class DocumentPreviewService
|
||||
public function convertMarkdownToHtml(Document $document): string
|
||||
{
|
||||
$markdownContent = $document->getMarkdownContent();
|
||||
|
||||
|
||||
if (empty($markdownContent)) {
|
||||
throw new \Exception('Markdown 内容为空');
|
||||
}
|
||||
|
||||
// 获取 Markdown 文件的目录(例如:2025/12/04)
|
||||
// 获取 Markdown 文件的目录
|
||||
$markdownDir = dirname($document->markdown_path);
|
||||
|
||||
// 修复图片路径:将 ./media/ 替换为 /markdown/{date}/media/
|
||||
// 修复图片路径:将 ./media/ 替换为 /markdown/{dir}/media/
|
||||
$markdownContent = preg_replace_callback(
|
||||
'/\(\.\/media\/([^)]+)\)/',
|
||||
function ($matches) use ($markdownDir) {
|
||||
@@ -58,250 +50,19 @@ class DocumentPreviewService
|
||||
|
||||
// 使用 MarkdownRenderService 转换为 HTML
|
||||
$renderService = app(MarkdownRenderService::class);
|
||||
$htmlContent = $renderService->render($markdownContent);
|
||||
|
||||
return $htmlContent;
|
||||
}
|
||||
|
||||
/**
 * Convert a stored Word document directly to HTML.
 *
 * The file is loaded from the `local` disk with PHPWord, rendered through
 * the PHPWord HTML writer into a temporary file, read back, and then passed
 * through embedImagesInHtml() and cleanHtml().
 *
 * @param Document $document
 * @return string HTML content
 * @throws \Exception When the source file is missing or the rendered HTML cannot be read back
 */
protected function convertWordToHtml(Document $document): string
{
    $disk = Storage::disk('local');

    // Make sure the source file exists before handing it to PHPWord
    if (!$disk->exists($document->file_path)) {
        throw new \Exception('文档文件不存在');
    }

    // Absolute path of the stored document
    $filePath = $disk->path($document->file_path);

    // Ensure the temp directory exists and point PHPWord at it
    $tempDir = storage_path('app/temp');
    if (!is_dir($tempDir)) {
        mkdir($tempDir, 0755, true);
    }
    Settings::setTempDir($tempDir);

    // Load the Word document
    $phpWord = IOFactory::load($filePath);

    // Collect images as base64 data URIs
    $images = $this->extractImagesFromDocument($phpWord);

    // Create the HTML writer
    $htmlWriter = IOFactory::createWriter($phpWord, 'HTML');

    // Temporary output file on the local disk
    $tempFileName = 'temp/doc_preview_' . uniqid() . '.html';
    if (!$disk->exists('temp')) {
        $disk->makeDirectory('temp');
    }

    try {
        $htmlWriter->save($disk->path($tempFileName));
        $htmlContent = $disk->get($tempFileName);
    } finally {
        // Always remove the temporary file, even when saving or reading fails
        $disk->delete($tempFileName);
    }

    // Guard against a non-string read result so the failure surfaces as an
    // \Exception instead of a TypeError in the string-typed helpers below
    if (!is_string($htmlContent)) {
        throw new \Exception('无法读取转换后的 HTML 内容');
    }

    // Swap image references for the extracted base64 data
    $htmlContent = $this->embedImagesInHtml($htmlContent, $images);

    // Strip the document shell and wrap the body in the preview container
    return $this->cleanHtml($htmlContent);
}
|
||||
|
||||
/**
 * Collect the images of a loaded Word document as data URIs.
 *
 * Walks every section's elements. For an element that exposes getElements()
 * only its direct children are inspected; otherwise the element itself is
 * inspected. Images whose source does not exist on disk are ignored.
 *
 * @param \PhpOffice\PhpWord\PhpWord $phpWord
 * @return array Sequentially indexed list of "data:{mime};base64,{payload}" strings
 */
protected function extractImagesFromDocument($phpWord): array
{
    $images = [];

    // Encodes one image element and appends it to the result list
    $collect = function ($image) use (&$images) {
        $source = $image->getSource();
        if (!file_exists($source)) {
            return;
        }

        $raw = file_get_contents($source);
        $mimeType = $this->getImageMimeType($image->getImageType());
        $base64 = base64_encode($raw);

        $images[] = "data:{$mimeType};base64,{$base64}";
    };

    foreach ($phpWord->getSections() as $section) {
        foreach ($section->getElements() as $element) {
            if (method_exists($element, 'getElements')) {
                // Container element: look at its direct children only
                foreach ($element->getElements() as $child) {
                    if ($child instanceof \PhpOffice\PhpWord\Element\Image) {
                        $collect($child);
                    }
                }

                continue;
            }

            if ($element instanceof \PhpOffice\PhpWord\Element\Image) {
                $collect($element);
            }
        }
    }

    return $images;
}
|
||||
|
||||
/**
 * Map an image type (file-extension style, case-insensitive) to a MIME type.
 *
 * Unknown types fall back to "image/jpeg".
 *
 * @param string $imageType
 * @return string
 */
protected function getImageMimeType(string $imageType): string
{
    switch (strtolower($imageType)) {
        case 'png':
            return 'image/png';
        case 'gif':
            return 'image/gif';
        case 'bmp':
            return 'image/bmp';
        case 'svg':
            return 'image/svg+xml';
        case 'jpg':
        case 'jpeg':
        default:
            return 'image/jpeg';
    }
}
|
||||
|
||||
/**
 * Replace local image sources in the HTML with base64 data URIs.
 *
 * Each <img> tag whose src is not already a data URI or an http(s) URL
 * receives the next unused entry of $images, in document order. Tags for
 * which no entry is left keep their original src. Rewritten tags always use
 * double quotes around the src value.
 *
 * @param string $html
 * @param array $images Sequentially indexed list of data URIs
 * @return string
 */
protected function embedImagesInHtml(string $html, array $images): string
{
    $imageIndex = 0;

    $result = preg_replace_callback(
        '/<img([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>/i',
        function ($matches) use ($images, &$imageIndex) {
            $beforeSrc = $matches[1];
            $src = $matches[2];
            $afterSrc = $matches[3];

            // Leave embedded data and real remote URLs alone. Matching the
            // full scheme keeps local paths such as "httpdocs/a.png" eligible.
            if (preg_match('#^(?:data:|https?://)#i', $src) === 1) {
                return $matches[0];
            }

            // Consume the next extracted image, if one is available
            if (isset($images[$imageIndex])) {
                $src = $images[$imageIndex];
                $imageIndex++;
            }

            return "<img{$beforeSrc}src=\"{$src}\"{$afterSrc}>";
        },
        $html
    );

    // preg_replace_callback() returns null on a PCRE error; fall back to the
    // untouched input rather than violating the string return type
    return $result ?? $html;
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 清理和美化 HTML 内容
|
||||
*
|
||||
* @param string $html
|
||||
* @return string
|
||||
*/
|
||||
protected function cleanHtml(string $html): string
|
||||
{
|
||||
// 提取 body 内容
|
||||
if (preg_match('/<body[^>]*>(.*?)<\/body>/is', $html, $matches)) {
|
||||
$html = $matches[1];
|
||||
}
|
||||
|
||||
// 添加基本样式
|
||||
$styledHtml = '<div class="document-preview" style="
|
||||
font-family: -apple-system, BlinkMacSystemFont, \'Segoe UI\', Roboto, \'Helvetica Neue\', Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
padding: 20px;
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
||||
">';
|
||||
|
||||
$styledHtml .= $html;
|
||||
$styledHtml .= '</div>';
|
||||
|
||||
return $styledHtml;
|
||||
return $renderService->render($markdownContent);
|
||||
}
|
||||
|
||||
/**
 * Check whether a document can be previewed.
 *
 * Decides by the extension of the document's file name; only "doc" and
 * "docx" (case-insensitive) are accepted.
 *
 * @param Document $document
 * @return bool
 */
public function canPreview(Document $document): bool
{
    // Cast so a missing file name yields an empty extension instead of
    // passing null to pathinfo()
    $extension = strtolower(pathinfo((string) $document->file_name, PATHINFO_EXTENSION));

    return in_array($extension, ['doc', 'docx'], true);
}
|
||||
|
||||
/**
|
||||
* 获取文档预览的纯文本内容(用于搜索等)
|
||||
*
|
||||
* @param Document $document
|
||||
* @return string
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function extractText(Document $document): string
|
||||
{
|
||||
try {
|
||||
if (!Storage::disk('local')->exists($document->file_path)) {
|
||||
throw new \Exception('文档文件不存在');
|
||||
}
|
||||
|
||||
$filePath = Storage::disk('local')->path($document->file_path);
|
||||
$phpWord = IOFactory::load($filePath);
|
||||
|
||||
$text = '';
|
||||
foreach ($phpWord->getSections() as $section) {
|
||||
foreach ($section->getElements() as $element) {
|
||||
if (method_exists($element, 'getText')) {
|
||||
$text .= $element->getText() . "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return trim($text);
|
||||
} catch (\Exception $e) {
|
||||
throw new \Exception('文本提取失败:' . $e->getMessage());
|
||||
}
|
||||
return $document->conversion_status === 'completed'
|
||||
&& !empty($document->markdown_path);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user