234 lines
8.3 KiB
Python
234 lines
8.3 KiB
Python
"""Tests for paperlib PDF converter."""
|
|
|
|
import shutil
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
|
|
from paperlib.config import LibraryPaths
|
|
from paperlib.converter import MinerUConverter
|
|
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
|
|
from paperlib.storage import PaperStorageManager
|
|
|
|
|
|
class TestMinerUConverter:
    """Test MinerUConverter functionality.

    Tests that would otherwise shell out patch ``subprocess.run``.
    Scratch files and directories are created under ``./.tmp`` and
    removed again by the fixtures/tests that create them.
    """

    @pytest.fixture
    def temp_library(self):
        """Create a temporary library for testing.

        Yields:
            The ``LibraryPaths`` built from a scratch directory under
            ``./.tmp``. The directory is deleted on teardown, and also
            when library setup itself raises.
        """
        temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
        temp_dir.mkdir(parents=True, exist_ok=True)

        try:
            library_paths = LibraryPaths.from_root(temp_dir)
            library_paths.create_directories()

            yield library_paths
        finally:
            # Cleanup runs even if setup above failed, so no directory leaks.
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

    @pytest.fixture
    def storage_manager(self, temp_library):
        """Create a storage manager for testing."""
        return PaperStorageManager(temp_library)

    @pytest.fixture
    def converter(self, storage_manager):
        """Create a MinerUConverter for testing."""
        return MinerUConverter(storage_manager)

    @pytest.fixture
    def sample_metadata(self, storage_manager):
        """Create sample paper metadata for testing.

        Writes a small placeholder PDF under ``./.tmp``, stores it through
        ``storage_manager`` and yields the metadata returned by
        ``store_paper``. The placeholder file is removed on teardown.
        """
        # ``./.tmp`` already exists here: ``storage_manager`` depends on
        # ``temp_library``, which creates it.
        pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"
        pdf_file.write_bytes(
            b"%PDF-1.4\n"
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
            b"%%EOF\n"
        )

        try:
            # Store the paper
            metadata = storage_manager.store_paper(
                pdf_path=pdf_file,
                source_type=SourceType.LOCAL,
                title="Test Paper for Conversion",
            )

            yield metadata
        finally:
            # Remove the source PDF only after the test has finished, so this
            # is safe whether store_paper copies, moves or references it.
            if pdf_file.exists():
                pdf_file.unlink()

    @patch("subprocess.run")
    def test_is_mineru_available_cli(self, mock_run, converter):
        """Test MinerU availability check using CLI."""
        # Mock successful mineru --version command
        mock_run.return_value.returncode = 0

        assert converter.is_mineru_available() is True
        mock_run.assert_called_with(
            ["mineru", "--version"],
            capture_output=True,
            check=False,
        )

    @patch("subprocess.run")
    def test_is_mineru_available_fallback(self, mock_run, converter):
        """Test MinerU availability fallback to import check."""
        # Mock mineru command not found, but module available
        mock_run.side_effect = [
            Mock(returncode=1),  # mineru --version fails
            Mock(returncode=0),  # import mineru succeeds
        ]

        assert converter.is_mineru_available() is True
        assert mock_run.call_count == 2

    @patch("subprocess.run")
    def test_is_mineru_unavailable(self, mock_run, converter):
        """Test when MinerU is completely unavailable."""
        # Mock both command and import failing
        mock_run.side_effect = [
            Mock(returncode=1),  # mineru --version fails
            Mock(returncode=1),  # import mineru fails
        ]

        assert converter.is_mineru_available() is False

    @patch("subprocess.run")
    def test_convert_paper_success(self, mock_run, converter, sample_metadata):
        """Test successful paper conversion.

        ``subprocess.run`` is mocked, so the output tree is created by hand
        in the cache directory before ``convert_paper`` is called.
        """
        # Mock successful mineru command
        mock_run.return_value.returncode = 0

        # Create expected output structure in temp cache
        cache_dir = converter.storage_manager.library_paths.cache_dir
        temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
        # NOTE(review): the fixture names its PDF with hash(self), while this
        # stem uses hash(sample_metadata), so the two differ. Left unchanged
        # because the stem the converter expects is not visible here - confirm.
        pdf_stem = "test_convert_" + str(hash(sample_metadata))
        mineru_output_dir = temp_output_dir / pdf_stem
        mineru_output_dir.mkdir(parents=True, exist_ok=True)

        # Create expected output files
        markdown_file = mineru_output_dir / f"{pdf_stem}.md"
        images_dir = mineru_output_dir / "images"

        markdown_file.write_text(
            "# Test Markdown Content\n\nThis is converted content."
        )
        images_dir.mkdir(exist_ok=True)
        (images_dir / "figure1.png").write_bytes(b"fake image data")

        try:
            # Run conversion
            result = converter.convert_paper(sample_metadata)

            # Check that mineru was called with correct arguments
            actual_cmd = mock_run.call_args[0][0]
            assert actual_cmd[0] == "mineru"
            assert "-p" in actual_cmd
            assert "-o" in actual_cmd
            assert "-b" in actual_cmd
            assert "pipeline" in actual_cmd

            # Verify conversion was successful
            assert result is True

            # Reload metadata and check status
            updated_metadata = converter.storage_manager.load_paper_metadata(
                sample_metadata.paper_id, sample_metadata.source_type
            )
            assert updated_metadata.conversion_status == ConversionStatus.SUCCESS

        finally:
            # Cleanup
            if temp_output_dir.exists():
                shutil.rmtree(temp_output_dir, ignore_errors=True)

    @patch("subprocess.run")
    def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
        """Test conversion when mineru command fails."""
        # Mock failed mineru command
        mock_run.return_value.returncode = 1

        result = converter.convert_paper(sample_metadata)

        # Verify conversion failed
        assert result is False

        # Check metadata was updated with failure status
        updated_metadata = converter.storage_manager.load_paper_metadata(
            sample_metadata.paper_id, sample_metadata.source_type
        )
        assert updated_metadata.conversion_status == ConversionStatus.FAILED

    def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
        """Test conversion when MinerU is not available."""
        # Mock MinerU as unavailable
        with patch.object(converter, "is_mineru_available", return_value=False):
            result = converter.convert_paper(sample_metadata)

        assert result is False

    def test_convert_paper_missing_pdf(self, converter, storage_manager):
        """Test conversion when PDF file is missing."""
        # Create metadata pointing to non-existent PDF
        metadata = PaperMetadata(
            paper_id="missing-pdf-test",
            source_type=SourceType.LOCAL,
            title="Missing PDF Test",
            pdf_path="nonexistent/path.pdf",
            conversion_status=ConversionStatus.PENDING,
        )

        result = converter.convert_paper(metadata)
        assert result is False

    def test_convert_all_pending(self, converter, storage_manager):
        """Test converting all papers with pending status.

        Stores three papers, replaces ``convert_paper`` with a mock that
        returns ``True, False, True`` and checks the counts reported by
        ``convert_all_pending``.
        """
        # Create sample PDF
        pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
        pdf_file.write_bytes(b"%PDF-1.4\n%%EOF\n")

        try:
            # Store multiple papers
            for i in range(3):
                unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
                shutil.copy2(pdf_file, unique_pdf)

                try:
                    storage_manager.store_paper(
                        pdf_path=unique_pdf,
                        source_type=SourceType.LOCAL,
                        title=f"Batch Paper {i}",
                    )
                finally:
                    # Drop the per-paper copy whether or not storing worked.
                    if unique_pdf.exists():
                        unique_pdf.unlink()

            # Mock conversions: 2 succeed, 1 fails
            with patch.object(converter, "convert_paper") as mock_convert:
                mock_convert.side_effect = [True, False, True]

                success_count, failure_count = converter.convert_all_pending()

                assert success_count == 2
                assert failure_count == 1
                assert mock_convert.call_count == 3

        finally:
            if pdf_file.exists():
                pdf_file.unlink()
|