fix: fix the mineru call

This commit is contained in:
2026-04-17 17:37:17 -04:00
parent 174801242d
commit 9006bf36b8
3 changed files with 327 additions and 29 deletions
+20 -3
View File
@@ -94,6 +94,12 @@ def _build_parser() -> argparse.ArgumentParser:
) )
convert_parser.add_argument("--library", "-L", default=".", help="Library root") convert_parser.add_argument("--library", "-L", default=".", help="Library root")
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID") convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
convert_parser.add_argument(
"--retry-failed", action="store_true", help="Retry failed conversions"
)
convert_parser.add_argument(
"--force", action="store_true", help="Force reconvert successful papers"
)
convert_parser.set_defaults(handler=_handle_convert) convert_parser.set_defaults(handler=_handle_convert)
# Reindex command # Reindex command
@@ -322,9 +328,20 @@ def _handle_convert(args: argparse.Namespace) -> int:
print(f"Paper not found: {args.paper_id}") print(f"Paper not found: {args.paper_id}")
return 1 return 1
else: else:
# Convert all pending papers # Convert papers based on flags
success_count, failure_count = converter.convert_all_pending() success_count, failure_count = converter.convert_all_pending(
msg = f"Complete: {success_count} successful, {failure_count} failed" retry_failed=args.retry_failed, force=args.force
)
# Show what was attempted
if args.force:
action = "Force converted"
elif args.retry_failed:
action = "Converted pending and retried failed"
else:
action = "Converted pending"
msg = f"{action}: {success_count} successful, {failure_count} failed"
print(msg) print(msg)
return 0 if failure_count == 0 else 1 return 0 if failure_count == 0 else 1
+74 -26
View File
@@ -18,16 +18,26 @@ class MinerUConverter:
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
def is_mineru_available(self) -> bool: def is_mineru_available(self) -> bool:
"""Check if MinerU is available in the environment.""" """Check if MinerU CLI is available in the environment."""
try: try:
# Check if mineru command is available
result = subprocess.run( result = subprocess.run(
[sys.executable, "-c", "import mineru"], ["mineru", "--version"],
capture_output=True, capture_output=True,
check=False, check=False,
) )
return result.returncode == 0 return result.returncode == 0
except (subprocess.SubprocessError, FileNotFoundError): except (subprocess.SubprocessError, FileNotFoundError):
return False # Fallback: check if mineru module is importable
try:
result = subprocess.run(
[sys.executable, "-c", "import mineru"],
capture_output=True,
check=False,
)
return result.returncode == 0
except (subprocess.SubprocessError, FileNotFoundError):
return False
def convert_paper(self, metadata: PaperMetadata) -> bool: def convert_paper(self, metadata: PaperMetadata) -> bool:
"""Convert a paper's PDF to Markdown using MinerU.""" """Convert a paper's PDF to Markdown using MinerU."""
@@ -52,18 +62,23 @@ class MinerUConverter:
self.storage_manager.update_paper_metadata(metadata) self.storage_manager.update_paper_metadata(metadata)
try: try:
# Create temporary output directory in cache
cache_dir = self.storage_manager.library_paths.cache_dir
temp_output_dir = cache_dir / f"mineru_temp_{metadata.paper_id}"
temp_output_dir.mkdir(exist_ok=True)
# Run MinerU conversion # Run MinerU conversion
log_file = logs_dir / "mineru.log" log_file = logs_dir / "mineru.log"
# MinerU command # Correct MinerU command
cmd = [ cmd = [
sys.executable, "mineru",
"-m", "-p",
"magic_pdf.pipe.UNIPipe",
"--pdf",
str(pdf_path), str(pdf_path),
"--output-dir", "-o",
str(paths["directory"]), str(temp_output_dir),
"-b",
"pipeline", # CPU-only mode for compatibility
] ]
self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}") self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}")
@@ -73,24 +88,30 @@ class MinerUConverter:
cmd, cmd,
stdout=log, stdout=log,
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
cwd=paths["directory"],
check=False, check=False,
) )
# Check if conversion was successful # Check if conversion was successful
if result.returncode == 0: if result.returncode == 0:
# MinerU typically outputs markdown files, try to find the main one # MinerU outputs to <output_dir>/<filename>/
# Look for common output patterns pdf_stem = pdf_path.stem # Get filename without .pdf extension
markdown_candidates = list(paths["directory"].glob("*.md")) mineru_output_dir = temp_output_dir / pdf_stem
if not markdown_candidates: expected_markdown = mineru_output_dir / f"{pdf_stem}.md"
# Try subdirectories expected_images = mineru_output_dir / "images"
markdown_candidates = list(paths["directory"].rglob("*.md"))
if markdown_candidates: if expected_markdown.exists():
# Use the first markdown file found, or rename if needed # Move markdown file to paper directory
main_md = markdown_candidates[0] expected_markdown.rename(markdown_path)
if main_md != markdown_path:
main_md.rename(markdown_path) # Move images directory if it exists
if expected_images.exists():
assets_target = paths["assets"]
if assets_target.exists():
# Remove existing assets directory
import shutil
shutil.rmtree(assets_target)
expected_images.rename(assets_target)
# Update metadata # Update metadata
metadata.conversion_status = ConversionStatus.SUCCESS metadata.conversion_status = ConversionStatus.SUCCESS
@@ -99,9 +120,16 @@ class MinerUConverter:
self.logger.info( self.logger.info(
f"Successfully converted {pdf_path} to {markdown_path}" f"Successfully converted {pdf_path} to {markdown_path}"
) )
# Clean up temporary directory
import shutil
shutil.rmtree(temp_output_dir)
return True return True
else: else:
self.logger.error("No markdown output found after conversion") self.logger.error(
f"Expected markdown file not found: {expected_markdown}"
)
metadata.conversion_status = ConversionStatus.FAILED metadata.conversion_status = ConversionStatus.FAILED
self.storage_manager.update_paper_metadata(metadata) self.storage_manager.update_paper_metadata(metadata)
return False return False
@@ -118,14 +146,34 @@ class MinerUConverter:
metadata.conversion_status = ConversionStatus.FAILED metadata.conversion_status = ConversionStatus.FAILED
self.storage_manager.update_paper_metadata(metadata) self.storage_manager.update_paper_metadata(metadata)
return False return False
finally:
# Ensure cleanup of temp directory
if "temp_output_dir" in locals() and temp_output_dir.exists():
import shutil
def convert_all_pending(self) -> tuple[int, int]: shutil.rmtree(temp_output_dir, ignore_errors=True)
"""Convert all papers with pending conversion status."""
def convert_all_pending(
self, retry_failed: bool = False, force: bool = False
) -> tuple[int, int]:
"""Convert papers based on their conversion status."""
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0
for metadata in self.storage_manager.list_all_papers(): for metadata in self.storage_manager.list_all_papers():
if metadata.conversion_status == ConversionStatus.PENDING: should_convert = False
if force:
# Force convert all papers
should_convert = True
elif metadata.conversion_status == ConversionStatus.PENDING:
# Convert pending papers
should_convert = True
elif retry_failed and metadata.conversion_status == ConversionStatus.FAILED:
# Retry failed conversions if requested
should_convert = True
if should_convert:
if self.convert_paper(metadata): if self.convert_paper(metadata):
success_count += 1 success_count += 1
else: else:
+233
View File
@@ -0,0 +1,233 @@
"""Tests for paperlib PDF converter."""
import shutil
from pathlib import Path
from unittest.mock import Mock, patch
import pytest
from paperlib.config import LibraryPaths
from paperlib.converter import MinerUConverter
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
from paperlib.storage import PaperStorageManager
class TestMinerUConverter:
"""Test MinerUConverter functionality."""
@pytest.fixture
def temp_library(self):
"""Create a temporary library for testing."""
temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
temp_dir.mkdir(parents=True, exist_ok=True)
library_paths = LibraryPaths.from_root(temp_dir)
library_paths.create_directories()
yield library_paths
# Cleanup
if temp_dir.exists():
shutil.rmtree(temp_dir)
@pytest.fixture
def storage_manager(self, temp_library):
"""Create a storage manager for testing."""
return PaperStorageManager(temp_library)
@pytest.fixture
def converter(self, storage_manager):
"""Create a MinerUConverter for testing."""
return MinerUConverter(storage_manager)
@pytest.fixture
def sample_metadata(self, storage_manager):
"""Create sample paper metadata for testing."""
# Create a sample PDF file
pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
with pdf_file.open("wb") as f:
f.write(b"%PDF-1.4\n")
f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
f.write(b"%%EOF\n")
# Store the paper
metadata = storage_manager.store_paper(
pdf_path=pdf_file,
source_type=SourceType.LOCAL,
title="Test Paper for Conversion",
)
return metadata
@patch("subprocess.run")
def test_is_mineru_available_cli(self, mock_run, converter):
"""Test MinerU availability check using CLI."""
# Mock successful mineru --version command
mock_run.return_value.returncode = 0
assert converter.is_mineru_available() is True
mock_run.assert_called_with(
["mineru", "--version"],
capture_output=True,
check=False,
)
@patch("subprocess.run")
def test_is_mineru_available_fallback(self, mock_run, converter):
"""Test MinerU availability fallback to import check."""
# Mock mineru command not found, but module available
mock_run.side_effect = [
Mock(returncode=1), # mineru --version fails
Mock(returncode=0), # import mineru succeeds
]
assert converter.is_mineru_available() is True
assert mock_run.call_count == 2
@patch("subprocess.run")
def test_is_mineru_unavailable(self, mock_run, converter):
"""Test when MinerU is completely unavailable."""
# Mock both command and import failing
mock_run.side_effect = [
Mock(returncode=1), # mineru --version fails
Mock(returncode=1), # import mineru fails
]
assert converter.is_mineru_available() is False
@patch("subprocess.run")
def test_convert_paper_success(self, mock_run, converter, sample_metadata):
"""Test successful paper conversion."""
# Mock successful mineru command
mock_run.return_value.returncode = 0
# Create expected output structure in temp cache
cache_dir = converter.storage_manager.library_paths.cache_dir
temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
pdf_stem = "test_convert_" + str(hash(sample_metadata))
mineru_output_dir = temp_output_dir / pdf_stem
mineru_output_dir.mkdir(parents=True, exist_ok=True)
# Create expected output files
markdown_file = mineru_output_dir / f"{pdf_stem}.md"
images_dir = mineru_output_dir / "images"
markdown_file.write_text(
"# Test Markdown Content\n\nThis is converted content."
)
images_dir.mkdir(exist_ok=True)
(images_dir / "figure1.png").write_bytes(b"fake image data")
try:
# Run conversion
result = converter.convert_paper(sample_metadata)
# Verify command was called correctly
expected_cmd = [
"mineru",
"-p",
mock_run.call_args[0][0][2], # PDF path
"-o",
mock_run.call_args[0][0][4], # Output dir
"-b",
"pipeline",
]
# Check that mineru was called with correct arguments
actual_cmd = mock_run.call_args[0][0]
assert actual_cmd[0] == "mineru"
assert "-p" in actual_cmd
assert "-o" in actual_cmd
assert "-b" in actual_cmd
assert "pipeline" in actual_cmd
# Verify conversion was successful
assert result is True
# Reload metadata and check status
updated_metadata = converter.storage_manager.load_paper_metadata(
sample_metadata.paper_id, sample_metadata.source_type
)
assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
finally:
# Cleanup
if temp_output_dir.exists():
shutil.rmtree(temp_output_dir, ignore_errors=True)
@patch("subprocess.run")
def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
"""Test conversion when mineru command fails."""
# Mock failed mineru command
mock_run.return_value.returncode = 1
result = converter.convert_paper(sample_metadata)
# Verify conversion failed
assert result is False
# Check metadata was updated with failure status
updated_metadata = converter.storage_manager.load_paper_metadata(
sample_metadata.paper_id, sample_metadata.source_type
)
assert updated_metadata.conversion_status == ConversionStatus.FAILED
def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
"""Test conversion when MinerU is not available."""
# Mock MinerU as unavailable
with patch.object(converter, "is_mineru_available", return_value=False):
result = converter.convert_paper(sample_metadata)
assert result is False
def test_convert_paper_missing_pdf(self, converter, storage_manager):
"""Test conversion when PDF file is missing."""
# Create metadata pointing to non-existent PDF
metadata = PaperMetadata(
paper_id="missing-pdf-test",
source_type=SourceType.LOCAL,
title="Missing PDF Test",
pdf_path="nonexistent/path.pdf",
conversion_status=ConversionStatus.PENDING,
)
result = converter.convert_paper(metadata)
assert result is False
def test_convert_all_pending(self, converter, storage_manager):
"""Test converting all papers with pending status."""
# Create sample PDF
pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
with pdf_file.open("wb") as f:
f.write(b"%PDF-1.4\n%%EOF\n")
try:
# Store multiple papers
papers = []
for i in range(3):
unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
shutil.copy2(pdf_file, unique_pdf)
try:
metadata = storage_manager.store_paper(
pdf_path=unique_pdf,
source_type=SourceType.LOCAL,
title=f"Batch Paper {i}",
)
papers.append(metadata)
finally:
if unique_pdf.exists():
unique_pdf.unlink()
# Mock conversions: 2 succeed, 1 fails
with patch.object(converter, "convert_paper") as mock_convert:
mock_convert.side_effect = [True, False, True]
success_count, failure_count = converter.convert_all_pending()
assert success_count == 2
assert failure_count == 1
assert mock_convert.call_count == 3
finally:
if pdf_file.exists():
pdf_file.unlink()