fix: fix the mineru call
This commit is contained in:
+20
-3
@@ -94,6 +94,12 @@ def _build_parser() -> argparse.ArgumentParser:
|
||||
)
|
||||
convert_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
|
||||
convert_parser.add_argument(
|
||||
"--retry-failed", action="store_true", help="Retry failed conversions"
|
||||
)
|
||||
convert_parser.add_argument(
|
||||
"--force", action="store_true", help="Force reconvert successful papers"
|
||||
)
|
||||
convert_parser.set_defaults(handler=_handle_convert)
|
||||
|
||||
# Reindex command
|
||||
@@ -322,9 +328,20 @@ def _handle_convert(args: argparse.Namespace) -> int:
|
||||
print(f"Paper not found: {args.paper_id}")
|
||||
return 1
|
||||
else:
|
||||
# Convert all pending papers
|
||||
success_count, failure_count = converter.convert_all_pending()
|
||||
msg = f"Complete: {success_count} successful, {failure_count} failed"
|
||||
# Convert papers based on flags
|
||||
success_count, failure_count = converter.convert_all_pending(
|
||||
retry_failed=args.retry_failed, force=args.force
|
||||
)
|
||||
|
||||
# Show what was attempted
|
||||
if args.force:
|
||||
action = "Force converted"
|
||||
elif args.retry_failed:
|
||||
action = "Converted pending and retried failed"
|
||||
else:
|
||||
action = "Converted pending"
|
||||
|
||||
msg = f"{action}: {success_count} successful, {failure_count} failed"
|
||||
print(msg)
|
||||
return 0 if failure_count == 0 else 1
|
||||
|
||||
|
||||
@@ -18,7 +18,17 @@ class MinerUConverter:
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def is_mineru_available(self) -> bool:
|
||||
"""Check if MinerU is available in the environment."""
|
||||
"""Check if MinerU CLI is available in the environment."""
|
||||
try:
|
||||
# Check if mineru command is available
|
||||
result = subprocess.run(
|
||||
["mineru", "--version"],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
)
|
||||
return result.returncode == 0
|
||||
except (subprocess.SubprocessError, FileNotFoundError):
|
||||
# Fallback: check if mineru module is importable
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[sys.executable, "-c", "import mineru"],
|
||||
@@ -52,18 +62,23 @@ class MinerUConverter:
|
||||
self.storage_manager.update_paper_metadata(metadata)
|
||||
|
||||
try:
|
||||
# Create temporary output directory in cache
|
||||
cache_dir = self.storage_manager.library_paths.cache_dir
|
||||
temp_output_dir = cache_dir / f"mineru_temp_{metadata.paper_id}"
|
||||
temp_output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Run MinerU conversion
|
||||
log_file = logs_dir / "mineru.log"
|
||||
|
||||
# MinerU command
|
||||
# Correct MinerU command
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"magic_pdf.pipe.UNIPipe",
|
||||
"--pdf",
|
||||
"mineru",
|
||||
"-p",
|
||||
str(pdf_path),
|
||||
"--output-dir",
|
||||
str(paths["directory"]),
|
||||
"-o",
|
||||
str(temp_output_dir),
|
||||
"-b",
|
||||
"pipeline", # CPU-only mode for compatibility
|
||||
]
|
||||
|
||||
self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}")
|
||||
@@ -73,24 +88,30 @@ class MinerUConverter:
|
||||
cmd,
|
||||
stdout=log,
|
||||
stderr=subprocess.STDOUT,
|
||||
cwd=paths["directory"],
|
||||
check=False,
|
||||
)
|
||||
|
||||
# Check if conversion was successful
|
||||
if result.returncode == 0:
|
||||
# MinerU typically outputs markdown files, try to find the main one
|
||||
# Look for common output patterns
|
||||
markdown_candidates = list(paths["directory"].glob("*.md"))
|
||||
if not markdown_candidates:
|
||||
# Try subdirectories
|
||||
markdown_candidates = list(paths["directory"].rglob("*.md"))
|
||||
# MinerU outputs to <output_dir>/<filename>/
|
||||
pdf_stem = pdf_path.stem # Get filename without .pdf extension
|
||||
mineru_output_dir = temp_output_dir / pdf_stem
|
||||
expected_markdown = mineru_output_dir / f"{pdf_stem}.md"
|
||||
expected_images = mineru_output_dir / "images"
|
||||
|
||||
if markdown_candidates:
|
||||
# Use the first markdown file found, or rename if needed
|
||||
main_md = markdown_candidates[0]
|
||||
if main_md != markdown_path:
|
||||
main_md.rename(markdown_path)
|
||||
if expected_markdown.exists():
|
||||
# Move markdown file to paper directory
|
||||
expected_markdown.rename(markdown_path)
|
||||
|
||||
# Move images directory if it exists
|
||||
if expected_images.exists():
|
||||
assets_target = paths["assets"]
|
||||
if assets_target.exists():
|
||||
# Remove existing assets directory
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(assets_target)
|
||||
expected_images.rename(assets_target)
|
||||
|
||||
# Update metadata
|
||||
metadata.conversion_status = ConversionStatus.SUCCESS
|
||||
@@ -99,9 +120,16 @@ class MinerUConverter:
|
||||
self.logger.info(
|
||||
f"Successfully converted {pdf_path} to {markdown_path}"
|
||||
)
|
||||
|
||||
# Clean up temporary directory
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(temp_output_dir)
|
||||
return True
|
||||
else:
|
||||
self.logger.error("No markdown output found after conversion")
|
||||
self.logger.error(
|
||||
f"Expected markdown file not found: {expected_markdown}"
|
||||
)
|
||||
metadata.conversion_status = ConversionStatus.FAILED
|
||||
self.storage_manager.update_paper_metadata(metadata)
|
||||
return False
|
||||
@@ -118,14 +146,34 @@ class MinerUConverter:
|
||||
metadata.conversion_status = ConversionStatus.FAILED
|
||||
self.storage_manager.update_paper_metadata(metadata)
|
||||
return False
|
||||
finally:
|
||||
# Ensure cleanup of temp directory
|
||||
if "temp_output_dir" in locals() and temp_output_dir.exists():
|
||||
import shutil
|
||||
|
||||
def convert_all_pending(self) -> tuple[int, int]:
|
||||
"""Convert all papers with pending conversion status."""
|
||||
shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||||
|
||||
def convert_all_pending(
|
||||
self, retry_failed: bool = False, force: bool = False
|
||||
) -> tuple[int, int]:
|
||||
"""Convert papers based on their conversion status."""
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
|
||||
for metadata in self.storage_manager.list_all_papers():
|
||||
if metadata.conversion_status == ConversionStatus.PENDING:
|
||||
should_convert = False
|
||||
|
||||
if force:
|
||||
# Force convert all papers
|
||||
should_convert = True
|
||||
elif metadata.conversion_status == ConversionStatus.PENDING:
|
||||
# Convert pending papers
|
||||
should_convert = True
|
||||
elif retry_failed and metadata.conversion_status == ConversionStatus.FAILED:
|
||||
# Retry failed conversions if requested
|
||||
should_convert = True
|
||||
|
||||
if should_convert:
|
||||
if self.convert_paper(metadata):
|
||||
success_count += 1
|
||||
else:
|
||||
|
||||
@@ -0,0 +1,233 @@
|
||||
"""Tests for paperlib PDF converter."""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.converter import MinerUConverter
|
||||
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
class TestMinerUConverter:
|
||||
"""Test MinerUConverter functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_library(self):
|
||||
"""Create a temporary library for testing."""
|
||||
temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
library_paths = LibraryPaths.from_root(temp_dir)
|
||||
library_paths.create_directories()
|
||||
|
||||
yield library_paths
|
||||
|
||||
# Cleanup
|
||||
if temp_dir.exists():
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
@pytest.fixture
|
||||
def storage_manager(self, temp_library):
|
||||
"""Create a storage manager for testing."""
|
||||
return PaperStorageManager(temp_library)
|
||||
|
||||
@pytest.fixture
|
||||
def converter(self, storage_manager):
|
||||
"""Create a MinerUConverter for testing."""
|
||||
return MinerUConverter(storage_manager)
|
||||
|
||||
@pytest.fixture
|
||||
def sample_metadata(self, storage_manager):
|
||||
"""Create sample paper metadata for testing."""
|
||||
# Create a sample PDF file
|
||||
pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
|
||||
with pdf_file.open("wb") as f:
|
||||
f.write(b"%PDF-1.4\n")
|
||||
f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
|
||||
f.write(b"%%EOF\n")
|
||||
|
||||
# Store the paper
|
||||
metadata = storage_manager.store_paper(
|
||||
pdf_path=pdf_file,
|
||||
source_type=SourceType.LOCAL,
|
||||
title="Test Paper for Conversion",
|
||||
)
|
||||
|
||||
return metadata
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_available_cli(self, mock_run, converter):
|
||||
"""Test MinerU availability check using CLI."""
|
||||
# Mock successful mineru --version command
|
||||
mock_run.return_value.returncode = 0
|
||||
|
||||
assert converter.is_mineru_available() is True
|
||||
mock_run.assert_called_with(
|
||||
["mineru", "--version"],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_available_fallback(self, mock_run, converter):
|
||||
"""Test MinerU availability fallback to import check."""
|
||||
# Mock mineru command not found, but module available
|
||||
mock_run.side_effect = [
|
||||
Mock(returncode=1), # mineru --version fails
|
||||
Mock(returncode=0), # import mineru succeeds
|
||||
]
|
||||
|
||||
assert converter.is_mineru_available() is True
|
||||
assert mock_run.call_count == 2
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_unavailable(self, mock_run, converter):
|
||||
"""Test when MinerU is completely unavailable."""
|
||||
# Mock both command and import failing
|
||||
mock_run.side_effect = [
|
||||
Mock(returncode=1), # mineru --version fails
|
||||
Mock(returncode=1), # import mineru fails
|
||||
]
|
||||
|
||||
assert converter.is_mineru_available() is False
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_convert_paper_success(self, mock_run, converter, sample_metadata):
|
||||
"""Test successful paper conversion."""
|
||||
# Mock successful mineru command
|
||||
mock_run.return_value.returncode = 0
|
||||
|
||||
# Create expected output structure in temp cache
|
||||
cache_dir = converter.storage_manager.library_paths.cache_dir
|
||||
temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
|
||||
pdf_stem = "test_convert_" + str(hash(sample_metadata))
|
||||
mineru_output_dir = temp_output_dir / pdf_stem
|
||||
mineru_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create expected output files
|
||||
markdown_file = mineru_output_dir / f"{pdf_stem}.md"
|
||||
images_dir = mineru_output_dir / "images"
|
||||
|
||||
markdown_file.write_text(
|
||||
"# Test Markdown Content\n\nThis is converted content."
|
||||
)
|
||||
images_dir.mkdir(exist_ok=True)
|
||||
(images_dir / "figure1.png").write_bytes(b"fake image data")
|
||||
|
||||
try:
|
||||
# Run conversion
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
# Verify command was called correctly
|
||||
expected_cmd = [
|
||||
"mineru",
|
||||
"-p",
|
||||
mock_run.call_args[0][0][2], # PDF path
|
||||
"-o",
|
||||
mock_run.call_args[0][0][4], # Output dir
|
||||
"-b",
|
||||
"pipeline",
|
||||
]
|
||||
|
||||
# Check that mineru was called with correct arguments
|
||||
actual_cmd = mock_run.call_args[0][0]
|
||||
assert actual_cmd[0] == "mineru"
|
||||
assert "-p" in actual_cmd
|
||||
assert "-o" in actual_cmd
|
||||
assert "-b" in actual_cmd
|
||||
assert "pipeline" in actual_cmd
|
||||
|
||||
# Verify conversion was successful
|
||||
assert result is True
|
||||
|
||||
# Reload metadata and check status
|
||||
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||
sample_metadata.paper_id, sample_metadata.source_type
|
||||
)
|
||||
assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
|
||||
|
||||
finally:
|
||||
# Cleanup
|
||||
if temp_output_dir.exists():
|
||||
shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
|
||||
"""Test conversion when mineru command fails."""
|
||||
# Mock failed mineru command
|
||||
mock_run.return_value.returncode = 1
|
||||
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
# Verify conversion failed
|
||||
assert result is False
|
||||
|
||||
# Check metadata was updated with failure status
|
||||
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||
sample_metadata.paper_id, sample_metadata.source_type
|
||||
)
|
||||
assert updated_metadata.conversion_status == ConversionStatus.FAILED
|
||||
|
||||
def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
|
||||
"""Test conversion when MinerU is not available."""
|
||||
# Mock MinerU as unavailable
|
||||
with patch.object(converter, "is_mineru_available", return_value=False):
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
assert result is False
|
||||
|
||||
def test_convert_paper_missing_pdf(self, converter, storage_manager):
|
||||
"""Test conversion when PDF file is missing."""
|
||||
# Create metadata pointing to non-existent PDF
|
||||
metadata = PaperMetadata(
|
||||
paper_id="missing-pdf-test",
|
||||
source_type=SourceType.LOCAL,
|
||||
title="Missing PDF Test",
|
||||
pdf_path="nonexistent/path.pdf",
|
||||
conversion_status=ConversionStatus.PENDING,
|
||||
)
|
||||
|
||||
result = converter.convert_paper(metadata)
|
||||
assert result is False
|
||||
|
||||
def test_convert_all_pending(self, converter, storage_manager):
|
||||
"""Test converting all papers with pending status."""
|
||||
# Create sample PDF
|
||||
pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
|
||||
with pdf_file.open("wb") as f:
|
||||
f.write(b"%PDF-1.4\n%%EOF\n")
|
||||
|
||||
try:
|
||||
# Store multiple papers
|
||||
papers = []
|
||||
for i in range(3):
|
||||
unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
|
||||
shutil.copy2(pdf_file, unique_pdf)
|
||||
|
||||
try:
|
||||
metadata = storage_manager.store_paper(
|
||||
pdf_path=unique_pdf,
|
||||
source_type=SourceType.LOCAL,
|
||||
title=f"Batch Paper {i}",
|
||||
)
|
||||
papers.append(metadata)
|
||||
finally:
|
||||
if unique_pdf.exists():
|
||||
unique_pdf.unlink()
|
||||
|
||||
# Mock conversions: 2 succeed, 1 fails
|
||||
with patch.object(converter, "convert_paper") as mock_convert:
|
||||
mock_convert.side_effect = [True, False, True]
|
||||
|
||||
success_count, failure_count = converter.convert_all_pending()
|
||||
|
||||
assert success_count == 2
|
||||
assert failure_count == 1
|
||||
assert mock_convert.call_count == 3
|
||||
|
||||
finally:
|
||||
if pdf_file.exists():
|
||||
pdf_file.unlink()
|
||||
Reference in New Issue
Block a user