fix: 修复文档转换与预览链路中的图片、文件名和错误处理问题

This commit is contained in:
2026-04-24 10:40:29 +08:00
parent 37dd58eff0
commit e935afddfe
16 changed files with 451 additions and 69 deletions

View File

@@ -6,6 +6,8 @@ use App\Models\Document;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
use Paperdoc\Contracts\DocumentInterface;
use Paperdoc\Document\Image;
use Paperdoc\Support\DocumentManager;
/**
@@ -23,9 +25,13 @@ class DocumentConversionService
/**
* 将文档转换为 Markdown
*
* @return array{markdown: string, media_files: array<string, string>}
*/
public function convertToMarkdown(Document $document): array
{
$this->ensureConversionDependenciesAvailable();
$documentPath = Storage::disk('local')->path($document->file_path);
if (!file_exists($documentPath)) {
@@ -39,24 +45,96 @@ class DocumentConversionService
throw new \Exception('文档转换后内容为空,可能是扫描件或不支持的内容格式');
}
return ['markdown' => $markdown];
return [
'markdown' => $markdown,
'media_files' => $this->extractMarkdownMediaFiles($doc),
];
}
/**
* 确保文档转换依赖已经安装
*/
protected function ensureConversionDependenciesAvailable(): void
{
if (!class_exists(DocumentManager::class)) {
throw new \RuntimeException(
'文档转换依赖未安装paperdoc-dev/paperdoc-lib。请执行 composer install 后重试。'
);
}
}
/**
* Markdown 内容保存到存储
*
* @param array<string, string> $mediaFiles
*/
public function saveMarkdownToFile(Document $document, string $markdown): string
public function saveMarkdownToFile(Document $document, string $markdown, array $mediaFiles = []): string
{
$path = $this->generateMarkdownPath($document);
$saved = Storage::disk('markdown')->put($path, $markdown);
if (!$saved) {
throw new \Exception("无法保存 Markdown 文件");
throw new \Exception('无法保存 Markdown 文件');
}
$this->storeMarkdownMediaFiles(dirname($path), $mediaFiles);
return $path;
}
/**
* 为已存在的 Markdown 文档补齐缺失的图片资源
*/
public function ensureMarkdownMediaAssets(Document $document): void
{
$this->ensureConversionDependenciesAvailable();
if (empty($document->markdown_path)) {
return;
}
$markdown = $document->getMarkdownContent();
if (empty($markdown)) {
return;
}
if (!preg_match_all('/!\[[^\]]*]\(((?:\.\/)?media\/[^)]+)\)/', $markdown, $matches)) {
return;
}
$documentDir = dirname($document->markdown_path);
$missingRefs = [];
foreach ($matches[1] as $ref) {
$relativePath = $this->normalizeMarkdownMediaPath($ref);
if ($relativePath === null) {
continue;
}
if (!Storage::disk('markdown')->exists($documentDir . '/' . $relativePath)) {
$missingRefs[] = $relativePath;
}
}
if ($missingRefs === []) {
return;
}
$documentPath = Storage::disk('local')->path($document->file_path);
if (!file_exists($documentPath)) {
throw new \Exception("文档文件不存在: {$documentPath}");
}
$doc = DocumentManager::open($documentPath, ['ocr' => false]);
$mediaFiles = array_intersect_key(
$this->extractMarkdownMediaFiles($doc),
array_flip($missingRefs)
);
$this->storeMarkdownMediaFiles($documentDir, $mediaFiles);
}
/**
* 生成 Markdown 文件路径
*/
@@ -103,16 +181,17 @@ class DocumentConversionService
'document_id' => $document->id,
'markdown_path' => $markdownPath,
]);
$preview = '';
} else {
$preview = $this->getMarkdownPreview($markdown);
$this->getMarkdownPreview($markdown);
}
$document->update([
'markdown_path' => $markdownPath,
'conversion_status' => 'completed',
'conversion_error' => null,
]);
Document::withoutSyncingToSearch(function () use ($document, $markdownPath): void {
$document->update([
'markdown_path' => $markdownPath,
'conversion_status' => 'completed',
'conversion_error' => null,
]);
});
}
/**
@@ -128,10 +207,12 @@ class DocumentConversionService
'trace' => $exception->getTraceAsString(),
]);
$document->update([
'conversion_status' => 'failed',
'conversion_error' => $exception->getMessage(),
]);
Document::withoutSyncingToSearch(function () use ($document, $exception): void {
$document->update([
'conversion_status' => 'failed',
'conversion_error' => $exception->getMessage(),
]);
});
}
/**
@@ -139,12 +220,113 @@ class DocumentConversionService
*/
public function queueConversion(Document $document): void
{
$document->update([
'conversion_status' => 'processing',
'conversion_error' => null,
]);
Document::withoutSyncingToSearch(function () use ($document): void {
$document->update([
'conversion_status' => 'processing',
'conversion_error' => null,
]);
});
$queue = config('documents.conversion.queue', 'documents');
\App\Jobs\ConvertDocumentToMarkdown::dispatch($document)->onQueue($queue);
}
/**
* @return array<string, string>
*/
protected function extractMarkdownMediaFiles(DocumentInterface $document): array
{
$mediaFiles = [];
$fallbackIndex = 1;
foreach ($document->getSections() as $section) {
foreach ($section->getElements() as $element) {
if (!$element instanceof Image || !$element->hasData()) {
continue;
}
$relativePath = $this->normalizeMarkdownMediaPath($element->getSrc());
if ($relativePath === null) {
$relativePath = sprintf(
'media/image-%d.%s',
$fallbackIndex++,
$this->guessImageExtension($element)
);
}
$mediaFiles[$relativePath] = $element->getData();
}
}
return $mediaFiles;
}
/**
* @param array<string, string> $mediaFiles
*/
protected function storeMarkdownMediaFiles(string $documentDir, array $mediaFiles): void
{
foreach ($mediaFiles as $relativePath => $contents) {
$targetPath = $documentDir . '/' . ltrim($relativePath, '/');
$targetDirectory = dirname($targetPath);
if ($targetDirectory !== '.' && !Storage::disk('markdown')->exists($targetDirectory)) {
Storage::disk('markdown')->makeDirectory($targetDirectory);
}
Storage::disk('markdown')->put($targetPath, $contents);
}
}
protected function normalizeMarkdownMediaPath(string $path): ?string
{
$path = trim($path);
if ($path === '') {
return null;
}
if (str_contains($path, '://') || str_starts_with($path, 'data:')) {
return null;
}
$path = preg_replace('/^\.?\//', '', $path) ?? $path;
$path = str_replace('\\', '/', $path);
$path = ltrim($path, '/');
if ($path === '' || !str_starts_with($path, 'media/')) {
return null;
}
$segments = array_values(array_filter(
explode('/', $path),
fn (string $segment): bool => $segment !== '' && $segment !== '.'
));
if ($segments === []) {
return null;
}
foreach ($segments as $segment) {
if ($segment === '..') {
return null;
}
}
return implode('/', $segments);
}
protected function guessImageExtension(Image $image): string
{
return match ($image->getMimeType()) {
'image/jpeg' => 'jpg',
'image/png' => 'png',
'image/gif' => 'gif',
'image/webp' => 'webp',
'image/bmp' => 'bmp',
'image/tiff' => 'tiff',
'image/svg+xml' => 'svg',
default => pathinfo($image->getSrc(), PATHINFO_EXTENSION) ?: 'bin',
};
}
}