fix: fix the mineru call
This commit is contained in:
+20
-3
@@ -94,6 +94,12 @@ def _build_parser() -> argparse.ArgumentParser:
|
|||||||
)
|
)
|
||||||
convert_parser.add_argument("--library", "-L", default=".", help="Library root")
|
convert_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||||
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
|
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
|
||||||
|
convert_parser.add_argument(
|
||||||
|
"--retry-failed", action="store_true", help="Retry failed conversions"
|
||||||
|
)
|
||||||
|
convert_parser.add_argument(
|
||||||
|
"--force", action="store_true", help="Force reconvert successful papers"
|
||||||
|
)
|
||||||
convert_parser.set_defaults(handler=_handle_convert)
|
convert_parser.set_defaults(handler=_handle_convert)
|
||||||
|
|
||||||
# Reindex command
|
# Reindex command
|
||||||
@@ -322,9 +328,20 @@ def _handle_convert(args: argparse.Namespace) -> int:
|
|||||||
print(f"Paper not found: {args.paper_id}")
|
print(f"Paper not found: {args.paper_id}")
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
# Convert all pending papers
|
# Convert papers based on flags
|
||||||
success_count, failure_count = converter.convert_all_pending()
|
success_count, failure_count = converter.convert_all_pending(
|
||||||
msg = f"Complete: {success_count} successful, {failure_count} failed"
|
retry_failed=args.retry_failed, force=args.force
|
||||||
|
)
|
||||||
|
|
||||||
|
# Show what was attempted
|
||||||
|
if args.force:
|
||||||
|
action = "Force converted"
|
||||||
|
elif args.retry_failed:
|
||||||
|
action = "Converted pending and retried failed"
|
||||||
|
else:
|
||||||
|
action = "Converted pending"
|
||||||
|
|
||||||
|
msg = f"{action}: {success_count} successful, {failure_count} failed"
|
||||||
print(msg)
|
print(msg)
|
||||||
return 0 if failure_count == 0 else 1
|
return 0 if failure_count == 0 else 1
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,17 @@ class MinerUConverter:
|
|||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def is_mineru_available(self) -> bool:
|
def is_mineru_available(self) -> bool:
|
||||||
"""Check if MinerU is available in the environment."""
|
"""Check if MinerU CLI is available in the environment."""
|
||||||
|
try:
|
||||||
|
# Check if mineru command is available
|
||||||
|
result = subprocess.run(
|
||||||
|
["mineru", "--version"],
|
||||||
|
capture_output=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
return result.returncode == 0
|
||||||
|
except (subprocess.SubprocessError, FileNotFoundError):
|
||||||
|
# Fallback: check if mineru module is importable
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, "-c", "import mineru"],
|
[sys.executable, "-c", "import mineru"],
|
||||||
@@ -52,18 +62,23 @@ class MinerUConverter:
|
|||||||
self.storage_manager.update_paper_metadata(metadata)
|
self.storage_manager.update_paper_metadata(metadata)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Create temporary output directory in cache
|
||||||
|
cache_dir = self.storage_manager.library_paths.cache_dir
|
||||||
|
temp_output_dir = cache_dir / f"mineru_temp_{metadata.paper_id}"
|
||||||
|
temp_output_dir.mkdir(exist_ok=True)
|
||||||
|
|
||||||
# Run MinerU conversion
|
# Run MinerU conversion
|
||||||
log_file = logs_dir / "mineru.log"
|
log_file = logs_dir / "mineru.log"
|
||||||
|
|
||||||
# MinerU command
|
# Correct MinerU command
|
||||||
cmd = [
|
cmd = [
|
||||||
sys.executable,
|
"mineru",
|
||||||
"-m",
|
"-p",
|
||||||
"magic_pdf.pipe.UNIPipe",
|
|
||||||
"--pdf",
|
|
||||||
str(pdf_path),
|
str(pdf_path),
|
||||||
"--output-dir",
|
"-o",
|
||||||
str(paths["directory"]),
|
str(temp_output_dir),
|
||||||
|
"-b",
|
||||||
|
"pipeline", # CPU-only mode for compatibility
|
||||||
]
|
]
|
||||||
|
|
||||||
self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}")
|
self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}")
|
||||||
@@ -73,24 +88,30 @@ class MinerUConverter:
|
|||||||
cmd,
|
cmd,
|
||||||
stdout=log,
|
stdout=log,
|
||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
cwd=paths["directory"],
|
|
||||||
check=False,
|
check=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if conversion was successful
|
# Check if conversion was successful
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
# MinerU typically outputs markdown files, try to find the main one
|
# MinerU outputs to <output_dir>/<filename>/
|
||||||
# Look for common output patterns
|
pdf_stem = pdf_path.stem # Get filename without .pdf extension
|
||||||
markdown_candidates = list(paths["directory"].glob("*.md"))
|
mineru_output_dir = temp_output_dir / pdf_stem
|
||||||
if not markdown_candidates:
|
expected_markdown = mineru_output_dir / f"{pdf_stem}.md"
|
||||||
# Try subdirectories
|
expected_images = mineru_output_dir / "images"
|
||||||
markdown_candidates = list(paths["directory"].rglob("*.md"))
|
|
||||||
|
|
||||||
if markdown_candidates:
|
if expected_markdown.exists():
|
||||||
# Use the first markdown file found, or rename if needed
|
# Move markdown file to paper directory
|
||||||
main_md = markdown_candidates[0]
|
expected_markdown.rename(markdown_path)
|
||||||
if main_md != markdown_path:
|
|
||||||
main_md.rename(markdown_path)
|
# Move images directory if it exists
|
||||||
|
if expected_images.exists():
|
||||||
|
assets_target = paths["assets"]
|
||||||
|
if assets_target.exists():
|
||||||
|
# Remove existing assets directory
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
shutil.rmtree(assets_target)
|
||||||
|
expected_images.rename(assets_target)
|
||||||
|
|
||||||
# Update metadata
|
# Update metadata
|
||||||
metadata.conversion_status = ConversionStatus.SUCCESS
|
metadata.conversion_status = ConversionStatus.SUCCESS
|
||||||
@@ -99,9 +120,16 @@ class MinerUConverter:
|
|||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Successfully converted {pdf_path} to {markdown_path}"
|
f"Successfully converted {pdf_path} to {markdown_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Clean up temporary directory
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
shutil.rmtree(temp_output_dir)
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
self.logger.error("No markdown output found after conversion")
|
self.logger.error(
|
||||||
|
f"Expected markdown file not found: {expected_markdown}"
|
||||||
|
)
|
||||||
metadata.conversion_status = ConversionStatus.FAILED
|
metadata.conversion_status = ConversionStatus.FAILED
|
||||||
self.storage_manager.update_paper_metadata(metadata)
|
self.storage_manager.update_paper_metadata(metadata)
|
||||||
return False
|
return False
|
||||||
@@ -118,14 +146,34 @@ class MinerUConverter:
|
|||||||
metadata.conversion_status = ConversionStatus.FAILED
|
metadata.conversion_status = ConversionStatus.FAILED
|
||||||
self.storage_manager.update_paper_metadata(metadata)
|
self.storage_manager.update_paper_metadata(metadata)
|
||||||
return False
|
return False
|
||||||
|
finally:
|
||||||
|
# Ensure cleanup of temp directory
|
||||||
|
if "temp_output_dir" in locals() and temp_output_dir.exists():
|
||||||
|
import shutil
|
||||||
|
|
||||||
def convert_all_pending(self) -> tuple[int, int]:
|
shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||||||
"""Convert all papers with pending conversion status."""
|
|
||||||
|
def convert_all_pending(
|
||||||
|
self, retry_failed: bool = False, force: bool = False
|
||||||
|
) -> tuple[int, int]:
|
||||||
|
"""Convert papers based on their conversion status."""
|
||||||
success_count = 0
|
success_count = 0
|
||||||
failure_count = 0
|
failure_count = 0
|
||||||
|
|
||||||
for metadata in self.storage_manager.list_all_papers():
|
for metadata in self.storage_manager.list_all_papers():
|
||||||
if metadata.conversion_status == ConversionStatus.PENDING:
|
should_convert = False
|
||||||
|
|
||||||
|
if force:
|
||||||
|
# Force convert all papers
|
||||||
|
should_convert = True
|
||||||
|
elif metadata.conversion_status == ConversionStatus.PENDING:
|
||||||
|
# Convert pending papers
|
||||||
|
should_convert = True
|
||||||
|
elif retry_failed and metadata.conversion_status == ConversionStatus.FAILED:
|
||||||
|
# Retry failed conversions if requested
|
||||||
|
should_convert = True
|
||||||
|
|
||||||
|
if should_convert:
|
||||||
if self.convert_paper(metadata):
|
if self.convert_paper(metadata):
|
||||||
success_count += 1
|
success_count += 1
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -0,0 +1,233 @@
|
|||||||
|
"""Tests for paperlib PDF converter."""
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperlib.config import LibraryPaths
|
||||||
|
from paperlib.converter import MinerUConverter
|
||||||
|
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
|
||||||
|
from paperlib.storage import PaperStorageManager
|
||||||
|
|
||||||
|
|
||||||
|
class TestMinerUConverter:
|
||||||
|
"""Test MinerUConverter functionality."""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_library(self):
|
||||||
|
"""Create a temporary library for testing."""
|
||||||
|
temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
|
||||||
|
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
library_paths = LibraryPaths.from_root(temp_dir)
|
||||||
|
library_paths.create_directories()
|
||||||
|
|
||||||
|
yield library_paths
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
if temp_dir.exists():
|
||||||
|
shutil.rmtree(temp_dir)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def storage_manager(self, temp_library):
|
||||||
|
"""Create a storage manager for testing."""
|
||||||
|
return PaperStorageManager(temp_library)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def converter(self, storage_manager):
|
||||||
|
"""Create a MinerUConverter for testing."""
|
||||||
|
return MinerUConverter(storage_manager)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_metadata(self, storage_manager):
|
||||||
|
"""Create sample paper metadata for testing."""
|
||||||
|
# Create a sample PDF file
|
||||||
|
pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
|
||||||
|
with pdf_file.open("wb") as f:
|
||||||
|
f.write(b"%PDF-1.4\n")
|
||||||
|
f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
|
||||||
|
f.write(b"%%EOF\n")
|
||||||
|
|
||||||
|
# Store the paper
|
||||||
|
metadata = storage_manager.store_paper(
|
||||||
|
pdf_path=pdf_file,
|
||||||
|
source_type=SourceType.LOCAL,
|
||||||
|
title="Test Paper for Conversion",
|
||||||
|
)
|
||||||
|
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_is_mineru_available_cli(self, mock_run, converter):
|
||||||
|
"""Test MinerU availability check using CLI."""
|
||||||
|
# Mock successful mineru --version command
|
||||||
|
mock_run.return_value.returncode = 0
|
||||||
|
|
||||||
|
assert converter.is_mineru_available() is True
|
||||||
|
mock_run.assert_called_with(
|
||||||
|
["mineru", "--version"],
|
||||||
|
capture_output=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_is_mineru_available_fallback(self, mock_run, converter):
|
||||||
|
"""Test MinerU availability fallback to import check."""
|
||||||
|
# Mock mineru command not found, but module available
|
||||||
|
mock_run.side_effect = [
|
||||||
|
Mock(returncode=1), # mineru --version fails
|
||||||
|
Mock(returncode=0), # import mineru succeeds
|
||||||
|
]
|
||||||
|
|
||||||
|
assert converter.is_mineru_available() is True
|
||||||
|
assert mock_run.call_count == 2
|
||||||
|
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_is_mineru_unavailable(self, mock_run, converter):
|
||||||
|
"""Test when MinerU is completely unavailable."""
|
||||||
|
# Mock both command and import failing
|
||||||
|
mock_run.side_effect = [
|
||||||
|
Mock(returncode=1), # mineru --version fails
|
||||||
|
Mock(returncode=1), # import mineru fails
|
||||||
|
]
|
||||||
|
|
||||||
|
assert converter.is_mineru_available() is False
|
||||||
|
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_convert_paper_success(self, mock_run, converter, sample_metadata):
|
||||||
|
"""Test successful paper conversion."""
|
||||||
|
# Mock successful mineru command
|
||||||
|
mock_run.return_value.returncode = 0
|
||||||
|
|
||||||
|
# Create expected output structure in temp cache
|
||||||
|
cache_dir = converter.storage_manager.library_paths.cache_dir
|
||||||
|
temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
|
||||||
|
pdf_stem = "test_convert_" + str(hash(sample_metadata))
|
||||||
|
mineru_output_dir = temp_output_dir / pdf_stem
|
||||||
|
mineru_output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Create expected output files
|
||||||
|
markdown_file = mineru_output_dir / f"{pdf_stem}.md"
|
||||||
|
images_dir = mineru_output_dir / "images"
|
||||||
|
|
||||||
|
markdown_file.write_text(
|
||||||
|
"# Test Markdown Content\n\nThis is converted content."
|
||||||
|
)
|
||||||
|
images_dir.mkdir(exist_ok=True)
|
||||||
|
(images_dir / "figure1.png").write_bytes(b"fake image data")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Run conversion
|
||||||
|
result = converter.convert_paper(sample_metadata)
|
||||||
|
|
||||||
|
# Verify command was called correctly
|
||||||
|
expected_cmd = [
|
||||||
|
"mineru",
|
||||||
|
"-p",
|
||||||
|
mock_run.call_args[0][0][2], # PDF path
|
||||||
|
"-o",
|
||||||
|
mock_run.call_args[0][0][4], # Output dir
|
||||||
|
"-b",
|
||||||
|
"pipeline",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check that mineru was called with correct arguments
|
||||||
|
actual_cmd = mock_run.call_args[0][0]
|
||||||
|
assert actual_cmd[0] == "mineru"
|
||||||
|
assert "-p" in actual_cmd
|
||||||
|
assert "-o" in actual_cmd
|
||||||
|
assert "-b" in actual_cmd
|
||||||
|
assert "pipeline" in actual_cmd
|
||||||
|
|
||||||
|
# Verify conversion was successful
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
# Reload metadata and check status
|
||||||
|
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||||
|
sample_metadata.paper_id, sample_metadata.source_type
|
||||||
|
)
|
||||||
|
assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Cleanup
|
||||||
|
if temp_output_dir.exists():
|
||||||
|
shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
|
||||||
|
"""Test conversion when mineru command fails."""
|
||||||
|
# Mock failed mineru command
|
||||||
|
mock_run.return_value.returncode = 1
|
||||||
|
|
||||||
|
result = converter.convert_paper(sample_metadata)
|
||||||
|
|
||||||
|
# Verify conversion failed
|
||||||
|
assert result is False
|
||||||
|
|
||||||
|
# Check metadata was updated with failure status
|
||||||
|
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||||
|
sample_metadata.paper_id, sample_metadata.source_type
|
||||||
|
)
|
||||||
|
assert updated_metadata.conversion_status == ConversionStatus.FAILED
|
||||||
|
|
||||||
|
def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
|
||||||
|
"""Test conversion when MinerU is not available."""
|
||||||
|
# Mock MinerU as unavailable
|
||||||
|
with patch.object(converter, "is_mineru_available", return_value=False):
|
||||||
|
result = converter.convert_paper(sample_metadata)
|
||||||
|
|
||||||
|
assert result is False
|
||||||
|
|
||||||
|
def test_convert_paper_missing_pdf(self, converter, storage_manager):
|
||||||
|
"""Test conversion when PDF file is missing."""
|
||||||
|
# Create metadata pointing to non-existent PDF
|
||||||
|
metadata = PaperMetadata(
|
||||||
|
paper_id="missing-pdf-test",
|
||||||
|
source_type=SourceType.LOCAL,
|
||||||
|
title="Missing PDF Test",
|
||||||
|
pdf_path="nonexistent/path.pdf",
|
||||||
|
conversion_status=ConversionStatus.PENDING,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = converter.convert_paper(metadata)
|
||||||
|
assert result is False
|
||||||
|
|
||||||
|
def test_convert_all_pending(self, converter, storage_manager):
|
||||||
|
"""Test converting all papers with pending status."""
|
||||||
|
# Create sample PDF
|
||||||
|
pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
|
||||||
|
with pdf_file.open("wb") as f:
|
||||||
|
f.write(b"%PDF-1.4\n%%EOF\n")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Store multiple papers
|
||||||
|
papers = []
|
||||||
|
for i in range(3):
|
||||||
|
unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
|
||||||
|
shutil.copy2(pdf_file, unique_pdf)
|
||||||
|
|
||||||
|
try:
|
||||||
|
metadata = storage_manager.store_paper(
|
||||||
|
pdf_path=unique_pdf,
|
||||||
|
source_type=SourceType.LOCAL,
|
||||||
|
title=f"Batch Paper {i}",
|
||||||
|
)
|
||||||
|
papers.append(metadata)
|
||||||
|
finally:
|
||||||
|
if unique_pdf.exists():
|
||||||
|
unique_pdf.unlink()
|
||||||
|
|
||||||
|
# Mock conversions: 2 succeed, 1 fails
|
||||||
|
with patch.object(converter, "convert_paper") as mock_convert:
|
||||||
|
mock_convert.side_effect = [True, False, True]
|
||||||
|
|
||||||
|
success_count, failure_count = converter.convert_all_pending()
|
||||||
|
|
||||||
|
assert success_count == 2
|
||||||
|
assert failure_count == 1
|
||||||
|
assert mock_convert.call_count == 3
|
||||||
|
|
||||||
|
finally:
|
||||||
|
if pdf_file.exists():
|
||||||
|
pdf_file.unlink()
|
||||||
Reference in New Issue
Block a user