Files
KnowledgeBase/app/Services/DocumentConversionService.php

333 lines
9.7 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace App\Services;
use App\Models\Document;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
use Paperdoc\Contracts\DocumentInterface;
use Paperdoc\Document\Image;
use Paperdoc\Support\DocumentManager;
/**
 * Document conversion service.
 *
 * Uses paperdoc-lib to convert documents (DOCX/PPTX/XLSX/PDF) to Markdown,
 * stores the Markdown together with the images it references on the
 * "markdown" disk, and keeps the Document model's conversion state up to date.
 */
class DocumentConversionService
{
    /** Default number of characters kept by getMarkdownPreview(). */
    protected int $previewLength;

    public function __construct()
    {
        // config() returns mixed (e.g. a numeric string coming from an env file),
        // so cast explicitly for the typed property.
        $this->previewLength = (int) config('documents.markdown.preview_length', 500);
    }

    /**
     * Convert a document's source file to Markdown.
     *
     * @return array{markdown: string, media_files: array<string, string>}
     *         The rendered Markdown plus the embedded images, keyed by their
     *         path relative to the Markdown file.
     *
     * @throws \RuntimeException When the conversion library is not installed.
     * @throws \Exception        When the source file is missing or the rendered
     *                           Markdown is blank.
     */
    public function convertToMarkdown(Document $document): array
    {
        $this->ensureConversionDependenciesAvailable();

        $documentPath = Storage::disk('local')->path($document->file_path);
        if (!file_exists($documentPath)) {
            throw new \Exception("文档文件不存在: {$documentPath}");
        }

        $doc = DocumentManager::open($documentPath, ['ocr' => false]);
        $markdown = DocumentManager::renderAs($doc, 'md');

        // Compare against '' rather than using empty(): empty('0') is true, which
        // would wrongly reject a document whose whole content is the text "0".
        if (trim((string) $markdown) === '') {
            throw new \Exception('文档转换后内容为空,可能是扫描件或不支持的内容格式');
        }

        return [
            'markdown' => $markdown,
            'media_files' => $this->extractMarkdownMediaFiles($doc),
        ];
    }

    /**
     * Ensure the document conversion library can be loaded.
     *
     * @throws \RuntimeException When the DocumentManager class does not exist.
     */
    protected function ensureConversionDependenciesAvailable(): void
    {
        if (!class_exists(DocumentManager::class)) {
            throw new \RuntimeException(
                '文档转换依赖未安装paperdoc-dev/paperdoc-lib。请执行 composer install 后重试。'
            );
        }
    }

    /**
     * Save Markdown content (and its media files) to the "markdown" disk.
     *
     * @param array<string, string> $mediaFiles Image contents keyed by path
     *                                          relative to the Markdown file.
     *
     * @return string Path of the stored Markdown file on the "markdown" disk.
     *
     * @throws \Exception When the Markdown file cannot be written.
     */
    public function saveMarkdownToFile(Document $document, string $markdown, array $mediaFiles = []): string
    {
        $path = $this->generateMarkdownPath($document);

        $saved = Storage::disk('markdown')->put($path, $markdown);
        if (!$saved) {
            throw new \Exception('无法保存 Markdown 文件');
        }

        $this->storeMarkdownMediaFiles(dirname($path), $mediaFiles);

        return $path;
    }

    /**
     * Restore image assets that an already-converted Markdown document
     * references but that are missing from the "markdown" disk.
     *
     * Does nothing when the document has no Markdown, references no media/
     * images, or all referenced images already exist.
     *
     * @throws \RuntimeException When the conversion library is not installed.
     * @throws \Exception        When images are missing and the source file
     *                           no longer exists.
     */
    public function ensureMarkdownMediaAssets(Document $document): void
    {
        $this->ensureConversionDependenciesAvailable();

        if (empty($document->markdown_path)) {
            return;
        }

        $markdown = $document->getMarkdownContent();
        if (empty($markdown)) {
            return;
        }

        // Collect image references of the form ![alt](media/...) or ![alt](./media/...).
        if (!preg_match_all('/!\[[^\]]*]\(((?:\.\/)?media\/[^)]+)\)/', $markdown, $matches)) {
            return;
        }

        $documentDir = dirname($document->markdown_path);

        // Keyed by relative path so repeated references are checked only once.
        $missingRefs = [];
        foreach ($matches[1] as $ref) {
            $relativePath = $this->normalizeMarkdownMediaPath($ref);
            if ($relativePath === null || isset($missingRefs[$relativePath])) {
                continue;
            }

            if (!Storage::disk('markdown')->exists($documentDir . '/' . $relativePath)) {
                $missingRefs[$relativePath] = true;
            }
        }

        if ($missingRefs === []) {
            return;
        }

        $documentPath = Storage::disk('local')->path($document->file_path);
        if (!file_exists($documentPath)) {
            throw new \Exception("文档文件不存在: {$documentPath}");
        }

        // Re-open the source document and keep only the images that are missing.
        $doc = DocumentManager::open($documentPath, ['ocr' => false]);
        $mediaFiles = array_intersect_key(
            $this->extractMarkdownMediaFiles($doc),
            $missingRefs
        );

        $this->storeMarkdownMediaFiles($documentDir, $mediaFiles);
    }

    /**
     * Build the storage path for a document's Markdown file.
     *
     * A fresh UUID names both the directory and the file. When the
     * documents.storage.organize_by_date option is enabled, the directory is
     * nested under Y/m/d of the document's created_at (or the current time
     * when created_at is null).
     */
    protected function generateMarkdownPath(Document $document): string
    {
        $organizeByDate = config('documents.storage.organize_by_date', true);
        $uuid = Str::uuid()->toString();

        if ($organizeByDate) {
            $date = $document->created_at ?? now();
            $directory = $date->format('Y/m/d') . '/' . $uuid;
        } else {
            $directory = $uuid;
        }

        return "{$directory}/{$uuid}.md";
    }

    /**
     * Build a short preview of Markdown content.
     *
     * Whitespace runs are collapsed to a single space; content longer than
     * $length characters is cut and suffixed with "...".
     *
     * @param int|null $length Maximum number of characters; defaults to the
     *                         configured preview length.
     */
    public function getMarkdownPreview(string $markdown, ?int $length = null): string
    {
        $length = $length ?? $this->previewLength;

        // preg_replace() returns null on a PCRE error; fall back to the raw text
        // instead of passing null on to trim().
        $cleaned = trim(preg_replace('/\s+/', ' ', $markdown) ?? $markdown);

        if (mb_strlen($cleaned) <= $length) {
            return $cleaned;
        }

        return mb_substr($cleaned, 0, $length) . '...';
    }

    /**
     * Record a successful conversion on the document.
     *
     * Sets markdown_path, marks the conversion as completed and clears any
     * previous error. The update runs inside Document::withoutSyncingToSearch().
     * A warning is logged when the stored Markdown file cannot be read back.
     */
    public function updateDocumentMarkdown(Document $document, string $markdownPath): void
    {
        // An unreadable file is reported as null or false depending on the
        // framework version, so test for "not a string" instead of "=== false".
        // The preview that used to be computed here was discarded, so it is no
        // longer generated.
        $markdown = Storage::disk('markdown')->get($markdownPath);
        if (!is_string($markdown)) {
            Log::warning('无法读取 Markdown 文件以生成预览', [
                'document_id' => $document->id,
                'markdown_path' => $markdownPath,
            ]);
        }

        Document::withoutSyncingToSearch(function () use ($document, $markdownPath): void {
            $document->update([
                'markdown_path' => $markdownPath,
                'conversion_status' => 'completed',
                'conversion_error' => null,
            ]);
        });
    }

    /**
     * Record a failed conversion: log the error and mark the document as failed.
     *
     * Accepts any \Throwable so that \Error instances (e.g. a TypeError raised
     * during conversion) can be recorded as well as exceptions.
     */
    public function handleConversionFailure(Document $document, \Throwable $exception): void
    {
        Log::error('文档转换失败', [
            'document_id' => $document->id,
            'document_title' => $document->title,
            'file_name' => $document->file_name,
            'error' => $exception->getMessage(),
            'trace' => $exception->getTraceAsString(),
        ]);

        Document::withoutSyncingToSearch(function () use ($document, $exception): void {
            $document->update([
                'conversion_status' => 'failed',
                'conversion_error' => $exception->getMessage(),
            ]);
        });
    }

    /**
     * Mark the document as processing and dispatch the conversion job.
     *
     * The job is placed on the queue named by documents.conversion.queue
     * (default "documents").
     */
    public function queueConversion(Document $document): void
    {
        Document::withoutSyncingToSearch(function () use ($document): void {
            $document->update([
                'conversion_status' => 'processing',
                'conversion_error' => null,
            ]);
        });

        $queue = config('documents.conversion.queue', 'documents');

        \App\Jobs\ConvertDocumentToMarkdown::dispatch($document)->onQueue($queue);
    }

    /**
     * Collect the image elements of a parsed document that carry data.
     *
     * Only elements directly contained in the document's sections are visited.
     * Images whose src is not a usable media/ path get a generated name of the
     * form media/image-N.ext.
     *
     * @return array<string, string> Image contents keyed by relative path.
     */
    protected function extractMarkdownMediaFiles(DocumentInterface $document): array
    {
        $mediaFiles = [];
        $fallbackIndex = 1;

        foreach ($document->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (!$element instanceof Image || !$element->hasData()) {
                    continue;
                }

                // Cast defensively: getSrc()'s declared return type is not visible
                // here and the normalizer requires a string.
                $relativePath = $this->normalizeMarkdownMediaPath((string) $element->getSrc());
                if ($relativePath === null) {
                    $relativePath = sprintf(
                        'media/image-%d.%s',
                        $fallbackIndex++,
                        $this->guessImageExtension($element)
                    );
                }

                $mediaFiles[$relativePath] = $element->getData();
            }
        }

        return $mediaFiles;
    }

    /**
     * Write media files next to a Markdown document on the "markdown" disk.
     *
     * Storing media is best-effort: a failed write is logged and the remaining
     * files are still processed.
     *
     * @param array<string, string> $mediaFiles Contents keyed by path relative
     *                                          to $documentDir.
     */
    protected function storeMarkdownMediaFiles(string $documentDir, array $mediaFiles): void
    {
        $disk = Storage::disk('markdown');

        foreach ($mediaFiles as $relativePath => $contents) {
            $targetPath = $documentDir . '/' . ltrim((string) $relativePath, '/');
            $targetDirectory = dirname($targetPath);

            if ($targetDirectory !== '.' && !$disk->exists($targetDirectory)) {
                $disk->makeDirectory($targetDirectory);
            }

            // Do not let a failed write pass unnoticed: the Markdown would then
            // reference an image that does not exist.
            if (!$disk->put($targetPath, $contents)) {
                Log::warning('无法保存 Markdown 媒体文件', [
                    'target_path' => $targetPath,
                ]);
            }
        }
    }

    /**
     * Normalize an image reference to a safe path below "media/".
     *
     * Accepts forward or backward slashes and leading "./" or "/" prefixes.
     *
     * @return string|null The normalized "media/..." path, or null when the
     *                     reference is empty, a URL or data: URI, not located
     *                     under media/, names no file below media/, or contains
     *                     a ".." segment.
     */
    protected function normalizeMarkdownMediaPath(string $path): ?string
    {
        $path = trim($path);
        if ($path === '' || str_contains($path, '://') || str_starts_with($path, 'data:')) {
            return null;
        }

        // Normalize separators first so that Windows-style references such as
        // ".\media\a.png" are treated the same as "./media/a.png".
        $path = str_replace('\\', '/', $path);

        // Dropping empty and "." segments also removes leading "/" and "./".
        $segments = array_values(array_filter(
            explode('/', $path),
            static fn (string $segment): bool => $segment !== '' && $segment !== '.'
        ));

        // Require at least one segment below "media": a bare "media" would later
        // be written as a file that shadows the media directory itself.
        if (count($segments) < 2 || $segments[0] !== 'media') {
            return null;
        }

        // Refuse parent-directory traversal.
        if (in_array('..', $segments, true)) {
            return null;
        }

        return implode('/', $segments);
    }

    /**
     * Pick a file extension for an image from its MIME type.
     *
     * For unknown MIME types the extension of the image's src is used when it
     * is a short alphanumeric string; otherwise "bin" is returned.
     */
    protected function guessImageExtension(Image $image): string
    {
        $known = match ($image->getMimeType()) {
            'image/jpeg' => 'jpg',
            'image/png' => 'png',
            'image/gif' => 'gif',
            'image/webp' => 'webp',
            'image/bmp' => 'bmp',
            'image/tiff' => 'tiff',
            'image/svg+xml' => 'svg',
            default => null,
        };

        if ($known !== null) {
            return $known;
        }

        // The src may be a URL or another arbitrary string; only accept an
        // extension that is safe to use in a storage filename.
        $extension = pathinfo((string) $image->getSrc(), PATHINFO_EXTENSION);

        return preg_match('/^[A-Za-z0-9]{1,10}$/', $extension) === 1 ? $extension : 'bin';
    }
}