fix: fix the mineru call
This commit is contained in:
@@ -0,0 +1,233 @@
|
||||
"""Tests for paperlib PDF converter."""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.converter import MinerUConverter
|
||||
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
class TestMinerUConverter:
|
||||
"""Test MinerUConverter functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_library(self):
|
||||
"""Create a temporary library for testing."""
|
||||
temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
library_paths = LibraryPaths.from_root(temp_dir)
|
||||
library_paths.create_directories()
|
||||
|
||||
yield library_paths
|
||||
|
||||
# Cleanup
|
||||
if temp_dir.exists():
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
@pytest.fixture
|
||||
def storage_manager(self, temp_library):
|
||||
"""Create a storage manager for testing."""
|
||||
return PaperStorageManager(temp_library)
|
||||
|
||||
@pytest.fixture
|
||||
def converter(self, storage_manager):
|
||||
"""Create a MinerUConverter for testing."""
|
||||
return MinerUConverter(storage_manager)
|
||||
|
||||
@pytest.fixture
|
||||
def sample_metadata(self, storage_manager):
|
||||
"""Create sample paper metadata for testing."""
|
||||
# Create a sample PDF file
|
||||
pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
|
||||
with pdf_file.open("wb") as f:
|
||||
f.write(b"%PDF-1.4\n")
|
||||
f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
|
||||
f.write(b"%%EOF\n")
|
||||
|
||||
# Store the paper
|
||||
metadata = storage_manager.store_paper(
|
||||
pdf_path=pdf_file,
|
||||
source_type=SourceType.LOCAL,
|
||||
title="Test Paper for Conversion",
|
||||
)
|
||||
|
||||
return metadata
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_available_cli(self, mock_run, converter):
|
||||
"""Test MinerU availability check using CLI."""
|
||||
# Mock successful mineru --version command
|
||||
mock_run.return_value.returncode = 0
|
||||
|
||||
assert converter.is_mineru_available() is True
|
||||
mock_run.assert_called_with(
|
||||
["mineru", "--version"],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_available_fallback(self, mock_run, converter):
|
||||
"""Test MinerU availability fallback to import check."""
|
||||
# Mock mineru command not found, but module available
|
||||
mock_run.side_effect = [
|
||||
Mock(returncode=1), # mineru --version fails
|
||||
Mock(returncode=0), # import mineru succeeds
|
||||
]
|
||||
|
||||
assert converter.is_mineru_available() is True
|
||||
assert mock_run.call_count == 2
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_is_mineru_unavailable(self, mock_run, converter):
|
||||
"""Test when MinerU is completely unavailable."""
|
||||
# Mock both command and import failing
|
||||
mock_run.side_effect = [
|
||||
Mock(returncode=1), # mineru --version fails
|
||||
Mock(returncode=1), # import mineru fails
|
||||
]
|
||||
|
||||
assert converter.is_mineru_available() is False
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_convert_paper_success(self, mock_run, converter, sample_metadata):
|
||||
"""Test successful paper conversion."""
|
||||
# Mock successful mineru command
|
||||
mock_run.return_value.returncode = 0
|
||||
|
||||
# Create expected output structure in temp cache
|
||||
cache_dir = converter.storage_manager.library_paths.cache_dir
|
||||
temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
|
||||
pdf_stem = "test_convert_" + str(hash(sample_metadata))
|
||||
mineru_output_dir = temp_output_dir / pdf_stem
|
||||
mineru_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create expected output files
|
||||
markdown_file = mineru_output_dir / f"{pdf_stem}.md"
|
||||
images_dir = mineru_output_dir / "images"
|
||||
|
||||
markdown_file.write_text(
|
||||
"# Test Markdown Content\n\nThis is converted content."
|
||||
)
|
||||
images_dir.mkdir(exist_ok=True)
|
||||
(images_dir / "figure1.png").write_bytes(b"fake image data")
|
||||
|
||||
try:
|
||||
# Run conversion
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
# Verify command was called correctly
|
||||
expected_cmd = [
|
||||
"mineru",
|
||||
"-p",
|
||||
mock_run.call_args[0][0][2], # PDF path
|
||||
"-o",
|
||||
mock_run.call_args[0][0][4], # Output dir
|
||||
"-b",
|
||||
"pipeline",
|
||||
]
|
||||
|
||||
# Check that mineru was called with correct arguments
|
||||
actual_cmd = mock_run.call_args[0][0]
|
||||
assert actual_cmd[0] == "mineru"
|
||||
assert "-p" in actual_cmd
|
||||
assert "-o" in actual_cmd
|
||||
assert "-b" in actual_cmd
|
||||
assert "pipeline" in actual_cmd
|
||||
|
||||
# Verify conversion was successful
|
||||
assert result is True
|
||||
|
||||
# Reload metadata and check status
|
||||
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||
sample_metadata.paper_id, sample_metadata.source_type
|
||||
)
|
||||
assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
|
||||
|
||||
finally:
|
||||
# Cleanup
|
||||
if temp_output_dir.exists():
|
||||
shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||||
|
||||
@patch("subprocess.run")
|
||||
def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
|
||||
"""Test conversion when mineru command fails."""
|
||||
# Mock failed mineru command
|
||||
mock_run.return_value.returncode = 1
|
||||
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
# Verify conversion failed
|
||||
assert result is False
|
||||
|
||||
# Check metadata was updated with failure status
|
||||
updated_metadata = converter.storage_manager.load_paper_metadata(
|
||||
sample_metadata.paper_id, sample_metadata.source_type
|
||||
)
|
||||
assert updated_metadata.conversion_status == ConversionStatus.FAILED
|
||||
|
||||
def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
|
||||
"""Test conversion when MinerU is not available."""
|
||||
# Mock MinerU as unavailable
|
||||
with patch.object(converter, "is_mineru_available", return_value=False):
|
||||
result = converter.convert_paper(sample_metadata)
|
||||
|
||||
assert result is False
|
||||
|
||||
def test_convert_paper_missing_pdf(self, converter, storage_manager):
|
||||
"""Test conversion when PDF file is missing."""
|
||||
# Create metadata pointing to non-existent PDF
|
||||
metadata = PaperMetadata(
|
||||
paper_id="missing-pdf-test",
|
||||
source_type=SourceType.LOCAL,
|
||||
title="Missing PDF Test",
|
||||
pdf_path="nonexistent/path.pdf",
|
||||
conversion_status=ConversionStatus.PENDING,
|
||||
)
|
||||
|
||||
result = converter.convert_paper(metadata)
|
||||
assert result is False
|
||||
|
||||
def test_convert_all_pending(self, converter, storage_manager):
|
||||
"""Test converting all papers with pending status."""
|
||||
# Create sample PDF
|
||||
pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
|
||||
with pdf_file.open("wb") as f:
|
||||
f.write(b"%PDF-1.4\n%%EOF\n")
|
||||
|
||||
try:
|
||||
# Store multiple papers
|
||||
papers = []
|
||||
for i in range(3):
|
||||
unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
|
||||
shutil.copy2(pdf_file, unique_pdf)
|
||||
|
||||
try:
|
||||
metadata = storage_manager.store_paper(
|
||||
pdf_path=unique_pdf,
|
||||
source_type=SourceType.LOCAL,
|
||||
title=f"Batch Paper {i}",
|
||||
)
|
||||
papers.append(metadata)
|
||||
finally:
|
||||
if unique_pdf.exists():
|
||||
unique_pdf.unlink()
|
||||
|
||||
# Mock conversions: 2 succeed, 1 fails
|
||||
with patch.object(converter, "convert_paper") as mock_convert:
|
||||
mock_convert.side_effect = [True, False, True]
|
||||
|
||||
success_count, failure_count = converter.convert_all_pending()
|
||||
|
||||
assert success_count == 2
|
||||
assert failure_count == 1
|
||||
assert mock_convert.call_count == 3
|
||||
|
||||
finally:
|
||||
if pdf_file.exists():
|
||||
pdf_file.unlink()
|
||||
Reference in New Issue
Block a user