fix: fix the mineru call

This commit is contained in:
2026-04-17 17:37:17 -04:00
parent 174801242d
commit 9006bf36b8
3 changed files with 327 additions and 29 deletions
+20 -3
View File
@@ -94,6 +94,12 @@ def _build_parser() -> argparse.ArgumentParser:
)
convert_parser.add_argument("--library", "-L", default=".", help="Library root")
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
convert_parser.add_argument(
"--retry-failed", action="store_true", help="Retry failed conversions"
)
convert_parser.add_argument(
"--force", action="store_true", help="Force reconvert successful papers"
)
convert_parser.set_defaults(handler=_handle_convert)
# Reindex command
@@ -322,9 +328,20 @@ def _handle_convert(args: argparse.Namespace) -> int:
print(f"Paper not found: {args.paper_id}")
return 1
else:
# Convert all pending papers
success_count, failure_count = converter.convert_all_pending()
msg = f"Complete: {success_count} successful, {failure_count} failed"
# Convert papers based on flags
success_count, failure_count = converter.convert_all_pending(
retry_failed=args.retry_failed, force=args.force
)
# Show what was attempted
if args.force:
action = "Force converted"
elif args.retry_failed:
action = "Converted pending and retried failed"
else:
action = "Converted pending"
msg = f"{action}: {success_count} successful, {failure_count} failed"
print(msg)
return 0 if failure_count == 0 else 1
+74 -26
View File
@@ -18,16 +18,26 @@ class MinerUConverter:
self.logger = logging.getLogger(__name__)
def is_mineru_available(self) -> bool:
"""Check if MinerU is available in the environment."""
"""Check if MinerU CLI is available in the environment."""
try:
# Check if mineru command is available
result = subprocess.run(
[sys.executable, "-c", "import mineru"],
["mineru", "--version"],
capture_output=True,
check=False,
)
return result.returncode == 0
except (subprocess.SubprocessError, FileNotFoundError):
return False
# Fallback: check if mineru module is importable
try:
result = subprocess.run(
[sys.executable, "-c", "import mineru"],
capture_output=True,
check=False,
)
return result.returncode == 0
except (subprocess.SubprocessError, FileNotFoundError):
return False
def convert_paper(self, metadata: PaperMetadata) -> bool:
"""Convert a paper's PDF to Markdown using MinerU."""
@@ -52,18 +62,23 @@ class MinerUConverter:
self.storage_manager.update_paper_metadata(metadata)
try:
# Create temporary output directory in cache
cache_dir = self.storage_manager.library_paths.cache_dir
temp_output_dir = cache_dir / f"mineru_temp_{metadata.paper_id}"
temp_output_dir.mkdir(exist_ok=True)
# Run MinerU conversion
log_file = logs_dir / "mineru.log"
# MinerU command
# Correct MinerU command
cmd = [
sys.executable,
"-m",
"magic_pdf.pipe.UNIPipe",
"--pdf",
"mineru",
"-p",
str(pdf_path),
"--output-dir",
str(paths["directory"]),
"-o",
str(temp_output_dir),
"-b",
"pipeline", # CPU-only mode for compatibility
]
self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}")
@@ -73,24 +88,30 @@ class MinerUConverter:
cmd,
stdout=log,
stderr=subprocess.STDOUT,
cwd=paths["directory"],
check=False,
)
# Check if conversion was successful
if result.returncode == 0:
# MinerU typically outputs markdown files, try to find the main one
# Look for common output patterns
markdown_candidates = list(paths["directory"].glob("*.md"))
if not markdown_candidates:
# Try subdirectories
markdown_candidates = list(paths["directory"].rglob("*.md"))
# MinerU outputs to <output_dir>/<filename>/
pdf_stem = pdf_path.stem # Get filename without .pdf extension
mineru_output_dir = temp_output_dir / pdf_stem
expected_markdown = mineru_output_dir / f"{pdf_stem}.md"
expected_images = mineru_output_dir / "images"
if markdown_candidates:
# Use the first markdown file found, or rename if needed
main_md = markdown_candidates[0]
if main_md != markdown_path:
main_md.rename(markdown_path)
if expected_markdown.exists():
# Move markdown file to paper directory
expected_markdown.rename(markdown_path)
# Move images directory if it exists
if expected_images.exists():
assets_target = paths["assets"]
if assets_target.exists():
# Remove existing assets directory
import shutil
shutil.rmtree(assets_target)
expected_images.rename(assets_target)
# Update metadata
metadata.conversion_status = ConversionStatus.SUCCESS
@@ -99,9 +120,16 @@ class MinerUConverter:
self.logger.info(
f"Successfully converted {pdf_path} to {markdown_path}"
)
# Clean up temporary directory
import shutil
shutil.rmtree(temp_output_dir)
return True
else:
self.logger.error("No markdown output found after conversion")
self.logger.error(
f"Expected markdown file not found: {expected_markdown}"
)
metadata.conversion_status = ConversionStatus.FAILED
self.storage_manager.update_paper_metadata(metadata)
return False
@@ -118,14 +146,34 @@ class MinerUConverter:
metadata.conversion_status = ConversionStatus.FAILED
self.storage_manager.update_paper_metadata(metadata)
return False
finally:
# Ensure cleanup of temp directory
if "temp_output_dir" in locals() and temp_output_dir.exists():
import shutil
def convert_all_pending(self) -> tuple[int, int]:
"""Convert all papers with pending conversion status."""
shutil.rmtree(temp_output_dir, ignore_errors=True)
def convert_all_pending(
self, retry_failed: bool = False, force: bool = False
) -> tuple[int, int]:
"""Convert papers based on their conversion status."""
success_count = 0
failure_count = 0
for metadata in self.storage_manager.list_all_papers():
if metadata.conversion_status == ConversionStatus.PENDING:
should_convert = False
if force:
# Force convert all papers
should_convert = True
elif metadata.conversion_status == ConversionStatus.PENDING:
# Convert pending papers
should_convert = True
elif retry_failed and metadata.conversion_status == ConversionStatus.FAILED:
# Retry failed conversions if requested
should_convert = True
if should_convert:
if self.convert_paper(metadata):
success_count += 1
else:
+233
View File
@@ -0,0 +1,233 @@
"""Tests for paperlib PDF converter."""
import shutil
from pathlib import Path
from unittest.mock import Mock, patch
import pytest
from paperlib.config import LibraryPaths
from paperlib.converter import MinerUConverter
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
from paperlib.storage import PaperStorageManager
class TestMinerUConverter:
"""Test MinerUConverter functionality."""
@pytest.fixture
def temp_library(self):
"""Create a temporary library for testing."""
temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
temp_dir.mkdir(parents=True, exist_ok=True)
library_paths = LibraryPaths.from_root(temp_dir)
library_paths.create_directories()
yield library_paths
# Cleanup
if temp_dir.exists():
shutil.rmtree(temp_dir)
@pytest.fixture
def storage_manager(self, temp_library):
"""Create a storage manager for testing."""
return PaperStorageManager(temp_library)
@pytest.fixture
def converter(self, storage_manager):
"""Create a MinerUConverter for testing."""
return MinerUConverter(storage_manager)
@pytest.fixture
def sample_metadata(self, storage_manager):
"""Create sample paper metadata for testing."""
# Create a sample PDF file
pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
with pdf_file.open("wb") as f:
f.write(b"%PDF-1.4\n")
f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
f.write(b"%%EOF\n")
# Store the paper
metadata = storage_manager.store_paper(
pdf_path=pdf_file,
source_type=SourceType.LOCAL,
title="Test Paper for Conversion",
)
return metadata
@patch("subprocess.run")
def test_is_mineru_available_cli(self, mock_run, converter):
"""Test MinerU availability check using CLI."""
# Mock successful mineru --version command
mock_run.return_value.returncode = 0
assert converter.is_mineru_available() is True
mock_run.assert_called_with(
["mineru", "--version"],
capture_output=True,
check=False,
)
@patch("subprocess.run")
def test_is_mineru_available_fallback(self, mock_run, converter):
"""Test MinerU availability fallback to import check."""
# Mock mineru command not found, but module available
mock_run.side_effect = [
Mock(returncode=1), # mineru --version fails
Mock(returncode=0), # import mineru succeeds
]
assert converter.is_mineru_available() is True
assert mock_run.call_count == 2
@patch("subprocess.run")
def test_is_mineru_unavailable(self, mock_run, converter):
"""Test when MinerU is completely unavailable."""
# Mock both command and import failing
mock_run.side_effect = [
Mock(returncode=1), # mineru --version fails
Mock(returncode=1), # import mineru fails
]
assert converter.is_mineru_available() is False
@patch("subprocess.run")
def test_convert_paper_success(self, mock_run, converter, sample_metadata):
"""Test successful paper conversion."""
# Mock successful mineru command
mock_run.return_value.returncode = 0
# Create expected output structure in temp cache
cache_dir = converter.storage_manager.library_paths.cache_dir
temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
pdf_stem = "test_convert_" + str(hash(sample_metadata))
mineru_output_dir = temp_output_dir / pdf_stem
mineru_output_dir.mkdir(parents=True, exist_ok=True)
# Create expected output files
markdown_file = mineru_output_dir / f"{pdf_stem}.md"
images_dir = mineru_output_dir / "images"
markdown_file.write_text(
"# Test Markdown Content\n\nThis is converted content."
)
images_dir.mkdir(exist_ok=True)
(images_dir / "figure1.png").write_bytes(b"fake image data")
try:
# Run conversion
result = converter.convert_paper(sample_metadata)
# Verify command was called correctly
expected_cmd = [
"mineru",
"-p",
mock_run.call_args[0][0][2], # PDF path
"-o",
mock_run.call_args[0][0][4], # Output dir
"-b",
"pipeline",
]
# Check that mineru was called with correct arguments
actual_cmd = mock_run.call_args[0][0]
assert actual_cmd[0] == "mineru"
assert "-p" in actual_cmd
assert "-o" in actual_cmd
assert "-b" in actual_cmd
assert "pipeline" in actual_cmd
# Verify conversion was successful
assert result is True
# Reload metadata and check status
updated_metadata = converter.storage_manager.load_paper_metadata(
sample_metadata.paper_id, sample_metadata.source_type
)
assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
finally:
# Cleanup
if temp_output_dir.exists():
shutil.rmtree(temp_output_dir, ignore_errors=True)
@patch("subprocess.run")
def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
"""Test conversion when mineru command fails."""
# Mock failed mineru command
mock_run.return_value.returncode = 1
result = converter.convert_paper(sample_metadata)
# Verify conversion failed
assert result is False
# Check metadata was updated with failure status
updated_metadata = converter.storage_manager.load_paper_metadata(
sample_metadata.paper_id, sample_metadata.source_type
)
assert updated_metadata.conversion_status == ConversionStatus.FAILED
def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
"""Test conversion when MinerU is not available."""
# Mock MinerU as unavailable
with patch.object(converter, "is_mineru_available", return_value=False):
result = converter.convert_paper(sample_metadata)
assert result is False
def test_convert_paper_missing_pdf(self, converter, storage_manager):
"""Test conversion when PDF file is missing."""
# Create metadata pointing to non-existent PDF
metadata = PaperMetadata(
paper_id="missing-pdf-test",
source_type=SourceType.LOCAL,
title="Missing PDF Test",
pdf_path="nonexistent/path.pdf",
conversion_status=ConversionStatus.PENDING,
)
result = converter.convert_paper(metadata)
assert result is False
def test_convert_all_pending(self, converter, storage_manager):
"""Test converting all papers with pending status."""
# Create sample PDF
pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
with pdf_file.open("wb") as f:
f.write(b"%PDF-1.4\n%%EOF\n")
try:
# Store multiple papers
papers = []
for i in range(3):
unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
shutil.copy2(pdf_file, unique_pdf)
try:
metadata = storage_manager.store_paper(
pdf_path=unique_pdf,
source_type=SourceType.LOCAL,
title=f"Batch Paper {i}",
)
papers.append(metadata)
finally:
if unique_pdf.exists():
unique_pdf.unlink()
# Mock conversions: 2 succeed, 1 fails
with patch.object(converter, "convert_paper") as mock_convert:
mock_convert.side_effect = [True, False, True]
success_count, failure_count = converter.convert_all_pending()
assert success_count == 2
assert failure_count == 1
assert mock_convert.call_count == 3
finally:
if pdf_file.exists():
pdf_file.unlink()