333 lines
9.7 KiB
PHP
333 lines
9.7 KiB
PHP
<?php
|
||
|
||
namespace App\Services;
|
||
|
||
use App\Models\Document;
|
||
use Illuminate\Support\Facades\Log;
|
||
use Illuminate\Support\Facades\Storage;
|
||
use Illuminate\Support\Str;
|
||
use Paperdoc\Contracts\DocumentInterface;
|
||
use Paperdoc\Document\Image;
|
||
use Paperdoc\Support\DocumentManager;
|
||
|
||
/**
 * Document conversion service.
 *
 * Uses paperdoc-lib to convert documents (DOCX/PPTX/XLSX/PDF) to Markdown,
 * writes the result and its embedded images to the "markdown" storage disk,
 * and records the conversion state on the Document model.
 */
class DocumentConversionService
{
    /** Default number of characters kept by getMarkdownPreview(). */
    protected int $previewLength;

    /**
     * Reads the default preview length from `documents.markdown.preview_length`
     * (falls back to 500 when the key is not configured).
     */
    public function __construct()
    {
        $this->previewLength = config('documents.markdown.preview_length', 500);
    }

    /**
     * Convert a document to Markdown.
     *
     * @return array{markdown: string, media_files: array<string, string>}
     *         `media_files` maps a path relative to the Markdown file to the raw image data.
     *
     * @throws \RuntimeException when the conversion library is not installed
     * @throws \Exception        when the source file is missing or the rendered Markdown is blank
     */
    public function convertToMarkdown(Document $document): array
    {
        $this->ensureConversionDependenciesAvailable();

        $doc = $this->openSourceDocument($document);
        $markdown = DocumentManager::renderAs($doc, 'md');

        // Compare against '' rather than using empty(): empty('0') is true, which
        // would reject a document whose entire text is the single character "0".
        if (trim($markdown) === '') {
            throw new \Exception('文档转换后内容为空,可能是扫描件或不支持的内容格式');
        }

        return [
            'markdown' => $markdown,
            'media_files' => $this->extractMarkdownMediaFiles($doc),
        ];
    }

    /**
     * Ensure the document conversion dependency is installed.
     *
     * @throws \RuntimeException when the DocumentManager class cannot be loaded
     */
    protected function ensureConversionDependenciesAvailable(): void
    {
        if (class_exists(DocumentManager::class)) {
            return;
        }

        throw new \RuntimeException(
            '文档转换依赖未安装:paperdoc-dev/paperdoc-lib。请执行 composer install 后重试。'
        );
    }

    /**
     * Save Markdown content (and any accompanying media files) to the "markdown" disk.
     *
     * @param array<string, string> $mediaFiles relative path => raw file contents
     *
     * @return string path of the written Markdown file on the "markdown" disk
     *
     * @throws \Exception when the Markdown file itself cannot be written
     */
    public function saveMarkdownToFile(Document $document, string $markdown, array $mediaFiles = []): string
    {
        $path = $this->generateMarkdownPath($document);

        if (!Storage::disk('markdown')->put($path, $markdown)) {
            throw new \Exception('无法保存 Markdown 文件');
        }

        // Media files live next to the Markdown file so relative links resolve.
        $this->storeMarkdownMediaFiles(dirname($path), $mediaFiles);

        return $path;
    }

    /**
     * Restore image assets that an existing Markdown document references but
     * that are missing from the "markdown" disk.
     *
     * Only `media/...` image references are considered. The source document is
     * re-opened only when at least one referenced file is actually missing.
     *
     * @throws \RuntimeException when the conversion library is not installed
     * @throws \Exception        when assets are missing and the source file no longer exists
     */
    public function ensureMarkdownMediaAssets(Document $document): void
    {
        $this->ensureConversionDependenciesAvailable();

        if (empty($document->markdown_path)) {
            return;
        }

        $markdown = $document->getMarkdownContent();
        if (empty($markdown)) {
            return;
        }

        // Capture the target of every image link pointing at "media/" or "./media/".
        if (!preg_match_all('/!\[[^\]]*]\(((?:\.\/)?media\/[^)]+)\)/', $markdown, $matches)) {
            return;
        }

        $documentDir = dirname($document->markdown_path);
        $disk = Storage::disk('markdown');
        $missingRefs = [];

        foreach ($matches[1] as $ref) {
            $relativePath = $this->normalizeMarkdownMediaPath($ref);

            if ($relativePath !== null && !$disk->exists($documentDir . '/' . $relativePath)) {
                $missingRefs[] = $relativePath;
            }
        }

        if ($missingRefs === []) {
            return;
        }

        $doc = $this->openSourceDocument($document);

        // Keep only the extracted images whose path is among the missing references.
        $mediaFiles = array_intersect_key(
            $this->extractMarkdownMediaFiles($doc),
            array_flip($missingRefs)
        );

        $this->storeMarkdownMediaFiles($documentDir, $mediaFiles);
    }

    /**
     * Build the storage path for a document's Markdown file.
     *
     * The file is placed in its own UUID-named directory, optionally nested
     * under `Y/m/d` (config `documents.storage.organize_by_date`, default true)
     * using the document's creation date, or the current time if that is unset.
     */
    protected function generateMarkdownPath(Document $document): string
    {
        $uuid = Str::uuid()->toString();
        $directory = $uuid;

        if (config('documents.storage.organize_by_date', true)) {
            $date = $document->created_at ?? now();
            $directory = $date->format('Y/m/d') . '/' . $uuid;
        }

        return "{$directory}/{$uuid}.md";
    }

    /**
     * Get a preview of the Markdown content (the first N characters).
     *
     * Whitespace runs are collapsed to a single space before truncation, and
     * "..." is appended only when the text was actually cut.
     *
     * @param int|null $length maximum number of characters; defaults to the configured
     *                         preview length. Negative values are treated as 0.
     */
    public function getMarkdownPreview(string $markdown, ?int $length = null): string
    {
        $length = max(0, $length ?? $this->previewLength);

        // Prefer the Unicode-aware pattern so full-width / non-breaking spaces are
        // collapsed too. preg_replace() returns null on failure (e.g. invalid UTF-8
        // with /u), so fall back to the byte-wise pattern and finally the raw text.
        $cleaned = preg_replace('/\s+/u', ' ', $markdown)
            ?? preg_replace('/\s+/', ' ', $markdown)
            ?? $markdown;
        $cleaned = trim($cleaned);

        if (mb_strlen($cleaned) <= $length) {
            return $cleaned;
        }

        return mb_substr($cleaned, 0, $length) . '...';
    }

    /**
     * Record a successful conversion on the document.
     *
     * Logs a warning when the stored Markdown file cannot be read back, then
     * marks the document as completed regardless (without syncing to search).
     */
    public function updateDocumentMarkdown(Document $document, string $markdownPath): void
    {
        $markdown = Storage::disk('markdown')->get($markdownPath);

        // Depending on the framework version an unreadable file is reported as
        // null or false; treat both as "could not read". The preview that used to
        // be computed here was never stored, so it is no longer generated.
        if ($markdown === null || $markdown === false) {
            Log::warning('无法读取 Markdown 文件以生成预览', [
                'document_id' => $document->id,
                'markdown_path' => $markdownPath,
            ]);
        }

        Document::withoutSyncingToSearch(function () use ($document, $markdownPath): void {
            $document->update([
                'markdown_path' => $markdownPath,
                'conversion_status' => 'completed',
                'conversion_error' => null,
            ]);
        });
    }

    /**
     * Handle a failed conversion: log the error and mark the document as failed.
     *
     * Accepts any \Throwable so that engine errors (\Error) can be recorded as
     * well as exceptions.
     */
    public function handleConversionFailure(Document $document, \Throwable $exception): void
    {
        $message = $exception->getMessage();

        Log::error('文档转换失败', [
            'document_id' => $document->id,
            'document_title' => $document->title,
            'file_name' => $document->file_name,
            'error' => $message,
            'trace' => $exception->getTraceAsString(),
        ]);

        Document::withoutSyncingToSearch(function () use ($document, $message): void {
            $document->update([
                'conversion_status' => 'failed',
                'conversion_error' => $message,
            ]);
        });
    }

    /**
     * Mark the document as processing and dispatch the conversion job.
     *
     * The queue name comes from `documents.conversion.queue` (default "documents").
     */
    public function queueConversion(Document $document): void
    {
        Document::withoutSyncingToSearch(function () use ($document): void {
            $document->update([
                'conversion_status' => 'processing',
                'conversion_error' => null,
            ]);
        });

        $queue = config('documents.conversion.queue', 'documents');
        \App\Jobs\ConvertDocumentToMarkdown::dispatch($document)->onQueue($queue);
    }

    /**
     * Collect the embedded images of a parsed document.
     *
     * Walks the direct elements of every section and keeps Image elements that
     * carry data. An image whose source is not a usable `media/...` path is
     * given a generated `media/image-N.<ext>` name.
     *
     * @return array<string, string> relative path => raw image data
     */
    protected function extractMarkdownMediaFiles(DocumentInterface $document): array
    {
        $mediaFiles = [];
        $fallbackIndex = 1;

        foreach ($document->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (!$element instanceof Image || !$element->hasData()) {
                    continue;
                }

                $relativePath = $this->normalizeMarkdownMediaPath($element->getSrc())
                    ?? sprintf('media/image-%d.%s', $fallbackIndex++, $this->guessImageExtension($element));

                $mediaFiles[$relativePath] = $element->getData();
            }
        }

        return $mediaFiles;
    }

    /**
     * Write media files below the given directory on the "markdown" disk.
     *
     * A file that cannot be written is logged and skipped so the remaining
     * files are still stored.
     *
     * @param array<string, string> $mediaFiles relative path => raw file contents
     */
    protected function storeMarkdownMediaFiles(string $documentDir, array $mediaFiles): void
    {
        $disk = Storage::disk('markdown');

        foreach ($mediaFiles as $relativePath => $contents) {
            $targetPath = $documentDir . '/' . ltrim((string) $relativePath, '/');
            $targetDirectory = dirname($targetPath);

            if ($targetDirectory !== '.' && !$disk->exists($targetDirectory)) {
                $disk->makeDirectory($targetDirectory);
            }

            if (!$disk->put($targetPath, $contents)) {
                Log::warning('无法保存 Markdown 媒体文件', [
                    'path' => $targetPath,
                ]);
            }
        }
    }

    /**
     * Normalize an image reference to a safe `media/...` relative path.
     *
     * Backslashes are treated as separators, and empty or "." segments are dropped.
     *
     * @return string|null the normalized path, or null when the reference is empty,
     *                     a URL / data URI, not located under `media/`, names only
     *                     the `media` directory, or contains a ".." segment
     */
    protected function normalizeMarkdownMediaPath(string $path): ?string
    {
        $path = trim($path);

        if ($path === '' || str_contains($path, '://') || str_starts_with($path, 'data:')) {
            return null;
        }

        // Convert separators before splitting so ".\media\x.png" is handled
        // the same way as "./media/x.png".
        $segments = array_values(array_filter(
            explode('/', str_replace('\\', '/', $path)),
            static fn (string $segment): bool => $segment !== '' && $segment !== '.'
        ));

        // Require "media/<something>" and refuse any attempt to climb out of it.
        if (count($segments) < 2 || $segments[0] !== 'media' || in_array('..', $segments, true)) {
            return null;
        }

        return implode('/', $segments);
    }

    /**
     * Choose a file extension for an image from its MIME type, falling back to
     * the extension of its source path, and finally to "bin".
     */
    protected function guessImageExtension(Image $image): string
    {
        return match ($image->getMimeType()) {
            'image/jpeg' => 'jpg',
            'image/png' => 'png',
            'image/gif' => 'gif',
            'image/webp' => 'webp',
            'image/bmp' => 'bmp',
            'image/tiff' => 'tiff',
            'image/svg+xml' => 'svg',
            default => pathinfo($image->getSrc(), PATHINFO_EXTENSION) ?: 'bin',
        };
    }

    /**
     * Resolve the document's source file on the "local" disk and open it with
     * the `ocr` option set to false.
     *
     * @throws \Exception when the source file does not exist
     */
    private function openSourceDocument(Document $document): DocumentInterface
    {
        $documentPath = Storage::disk('local')->path($document->file_path);

        if (!file_exists($documentPath)) {
            throw new \Exception("文档文件不存在: {$documentPath}");
        }

        return DocumentManager::open($documentPath, ['ocr' => false]);
    }
}
|