# paperlib/tests/test_importer.py

"""Tests for paperlib import functionality."""

import shutil
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from paperlib.config import LibraryPaths
from paperlib.importer import ArxivImporter, LocalImporter
from paperlib.models import SourceType
from paperlib.storage import PaperStorageManager


class TestLocalImporter:
    """Exercise ``LocalImporter.import_pdf`` against a throwaway library.

    Every scratch file lives under pytest's per-test ``tmp_path`` so the tests
    do not depend on the current working directory, on another fixture having
    created a shared folder first, or on file names being unique across tests.
    """

    @pytest.fixture
    def temp_library(self, tmp_path: Path):
        """Yield ``LibraryPaths`` rooted in a fresh per-test directory.

        The library gets its own sub-directory so that input files the tests
        write directly into ``tmp_path`` are not located inside the library
        root.
        """
        library_root = tmp_path / "library"
        library_root.mkdir(parents=True, exist_ok=True)
        library_paths = LibraryPaths.from_root(library_root)
        library_paths.create_directories()

        yield library_paths

        # pytest retains recent tmp_path trees; remove the library eagerly,
        # as the original fixture did.
        if library_root.exists():
            shutil.rmtree(library_root)

    @pytest.fixture
    def local_importer(self, temp_library):
        """Return a ``LocalImporter`` backed by the temporary library."""
        storage_manager = PaperStorageManager(temp_library)
        return LocalImporter(storage_manager)

    @pytest.fixture
    def sample_pdf(self, tmp_path: Path) -> Path:
        """Write a minimal PDF-looking file into ``tmp_path`` and return it."""
        pdf_file = tmp_path / "sample.pdf"
        # Header, one catalog object and the EOF marker.
        pdf_file.write_bytes(
            b"%PDF-1.4\n"
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
            b"%%EOF\n"
        )
        return pdf_file

    def test_import_pdf_success(self, local_importer, sample_pdf):
        """Explicit title, notes and tags are carried into the metadata."""
        metadata = local_importer.import_pdf(
            pdf_path=sample_pdf,
            title="Test Paper",
            notes="Test notes",
            tags=["test", "sample"],
        )

        assert metadata.source_type == SourceType.LOCAL
        assert metadata.title == "Test Paper"
        assert metadata.notes == "Test notes"
        assert metadata.tags == ["test", "sample"]
        assert metadata.paper_id.startswith("local-")

    def test_import_pdf_auto_title(self, local_importer, sample_pdf):
        """Without a title, one is derived from the PDF's file name."""
        # Give the file a name that contains both separators under test.
        meaningful_pdf = sample_pdf.parent / "Machine_Learning-Paper.pdf"
        sample_pdf.rename(meaningful_pdf)

        metadata = local_importer.import_pdf(pdf_path=meaningful_pdf)

        assert metadata.title == "Machine Learning Paper"

    def test_import_nonexistent_pdf(self, local_importer, tmp_path: Path):
        """A path that does not exist raises ``FileNotFoundError``."""
        nonexistent = tmp_path / "nonexistent.pdf"

        with pytest.raises(FileNotFoundError):
            local_importer.import_pdf(pdf_path=nonexistent)

    def test_import_non_pdf_file(self, local_importer, tmp_path: Path):
        """A file that is not a PDF is rejected with ``ValueError``."""
        text_file = tmp_path / "not_a_pdf.txt"
        text_file.write_text("This is not a PDF")

        with pytest.raises(ValueError, match="File is not a PDF"):
            local_importer.import_pdf(pdf_path=text_file)

    def test_import_duplicate_pdf(self, local_importer, sample_pdf):
        """Importing the same PDF a second time raises ``ValueError``."""
        # First import succeeds; its return value is not needed here.
        local_importer.import_pdf(pdf_path=sample_pdf)

        with pytest.raises(ValueError, match="Paper already imported"):
            local_importer.import_pdf(pdf_path=sample_pdf)


class TestArxivImporter:
    """Exercise ``ArxivImporter`` with the arXiv client and downloads mocked.

    Scratch files live under pytest's per-test ``tmp_path`` so the tests do
    not depend on the working directory or on shared, fixed file names.
    """

    @staticmethod
    def _mock_author(name: str) -> Mock:
        """Return a mock author whose ``name`` attribute is ``name``.

        ``Mock(name=...)`` only labels the mock for its repr; it does not
        create a ``name`` attribute, so the attribute is assigned afterwards.
        """
        author = Mock()
        author.name = name
        return author

    @staticmethod
    def _write_stub_pdf(path: Path) -> Path:
        """Write a tiny PDF-looking file at ``path`` and return ``path``."""
        path.write_bytes(b"%PDF-1.4\n%%EOF\n")
        return path

    @pytest.fixture
    def temp_library(self, tmp_path: Path):
        """Yield ``LibraryPaths`` rooted in a fresh per-test directory.

        The library gets its own sub-directory so that stub PDFs the tests
        write directly into ``tmp_path`` are not located inside the library
        root.
        """
        library_root = tmp_path / "library"
        library_root.mkdir(parents=True, exist_ok=True)
        library_paths = LibraryPaths.from_root(library_root)
        library_paths.create_directories()

        yield library_paths

        # pytest retains recent tmp_path trees; remove the library eagerly,
        # as the original fixture did.
        if library_root.exists():
            shutil.rmtree(library_root)

    @pytest.fixture
    def arxiv_importer(self, temp_library):
        """Return an ``ArxivImporter`` backed by the temporary library."""
        storage_manager = PaperStorageManager(temp_library)
        return ArxivImporter(storage_manager)

    def test_extract_arxiv_id_clean(self, arxiv_importer):
        """Bare, prefixed, versioned and old-style IDs are extracted."""
        assert arxiv_importer.extract_arxiv_id("2212.06340") == "2212.06340"
        assert arxiv_importer.extract_arxiv_id("arxiv:2212.06340") == "2212.06340"
        assert arxiv_importer.extract_arxiv_id("2212.06340v1") == "2212.06340v1"
        assert arxiv_importer.extract_arxiv_id("math-ph/0701002") == "math-ph/0701002"

    def test_extract_arxiv_id_from_url(self, arxiv_importer):
        """The ID is extracted from an ``arxiv.org/abs/`` URL."""
        url = "https://arxiv.org/abs/2212.06340"
        extracted = arxiv_importer.extract_arxiv_id(url)
        assert extracted == "2212.06340"

    def test_fetch_paper_metadata_success(self, arxiv_importer):
        """The single result yielded by the client is returned."""
        mock_result = Mock()
        mock_result.title = "Test Paper"
        mock_result.authors = [
            self._mock_author("Alice Smith"),
            self._mock_author("Bob Jones"),
        ]
        mock_result.published = Mock()
        mock_result.updated = Mock()
        mock_result.categories = ["cs.AI", "stat.ML"]

        # Replace the client's results method so no network call is made.
        arxiv_importer.client.results = Mock(return_value=[mock_result])

        result = arxiv_importer.fetch_paper_metadata("2212.06340")
        assert result == mock_result

    def test_fetch_paper_metadata_not_found(self, arxiv_importer):
        """An empty result set raises ``ValueError``."""
        arxiv_importer.client.results = Mock(return_value=[])

        with pytest.raises(ValueError, match="Paper not found on arXiv"):
            arxiv_importer.fetch_paper_metadata("9999.99999")

    @patch("paperlib.importer.arxiv_importer.tempfile.NamedTemporaryFile")
    def test_download_pdf(self, mock_tempfile, arxiv_importer, tmp_path: Path):
        """``download_pdf`` saves to the temporary file name and returns it."""
        # Real file on disk standing in for the temporary download target.
        mock_temp_path = self._write_stub_pdf(tmp_path / "mock_temp.pdf")
        mock_tempfile.return_value.__enter__.return_value.name = str(mock_temp_path)

        mock_result = Mock()

        pdf_path = arxiv_importer.download_pdf(mock_result)

        assert pdf_path == mock_temp_path
        mock_result.download_pdf.assert_called_once_with(
            filename=str(mock_temp_path)
        )

    @patch.object(ArxivImporter, "download_pdf")
    @patch.object(ArxivImporter, "fetch_paper_metadata")
    def test_import_arxiv_paper_success(
        self, mock_fetch, mock_download, arxiv_importer, tmp_path: Path
    ):
        """Fetched arXiv fields and user notes/tags end up in the metadata."""
        pdf_file = self._write_stub_pdf(tmp_path / "test_arxiv.pdf")

        mock_result = Mock()
        mock_result.title = "Test ArXiv Paper"
        mock_result.authors = [self._mock_author("Alice Smith")]
        mock_result.published = None
        mock_result.updated = None
        mock_result.categories = ["cs.AI"]

        mock_fetch.return_value = mock_result
        mock_download.return_value = pdf_file

        metadata = arxiv_importer.import_arxiv_paper(
            arxiv_input="2212.06340", notes="Test notes", tags=["test"]
        )

        assert metadata.source_type == SourceType.ARXIV
        assert metadata.source_id == "2212.06340"
        assert metadata.title == "Test ArXiv Paper"
        assert metadata.authors == ["Alice Smith"]
        assert metadata.categories == ["cs.AI"]
        assert metadata.notes == "Test notes"
        assert metadata.tags == ["test"]

    @patch.object(ArxivImporter, "fetch_paper_metadata")
    def test_import_duplicate_arxiv_paper(
        self, mock_fetch, arxiv_importer, tmp_path: Path
    ):
        """Importing the same arXiv ID a second time raises ``ValueError``."""
        pdf_file = self._write_stub_pdf(tmp_path / "test_arxiv_dup.pdf")

        mock_result = Mock()
        mock_result.title = "Test Paper"
        mock_result.authors = []
        mock_result.published = None
        mock_result.updated = None
        mock_result.categories = []
        mock_fetch.return_value = mock_result

        with patch.object(ArxivImporter, "download_pdf", return_value=pdf_file):
            # First import should succeed.
            arxiv_importer.import_arxiv_paper("2212.06340")

            # Second import of the same ID should be rejected.
            with pytest.raises(ValueError, match="Paper already imported"):
                arxiv_importer.import_arxiv_paper("2212.06340")