test: add tests

2026-04-17 15:56:04 -04:00
parent 088e07dee8
commit 74d140e5f8
10 changed files with 1659 additions and 0 deletions
@@ -0,0 +1,273 @@
+"""Tests for paperlib import functionality."""
+
+import shutil
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+
+from paperlib.config import LibraryPaths
+from paperlib.importer import ArxivImporter, LocalImporter
+from paperlib.models import SourceType
+from paperlib.storage import PaperStorageManager
+
+
+class TestLocalImporter:
+    """Test LocalImporter functionality."""
+
+    @pytest.fixture
+    def temp_library(self):
+        """Create a temporary library for testing."""
+        temp_dir = Path("./.tmp") / f"test_import_{hash(self)}"
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        library_paths = LibraryPaths.from_root(temp_dir)
+        library_paths.create_directories()
+
+        yield library_paths
+
+        # Cleanup
+        if temp_dir.exists():
+            shutil.rmtree(temp_dir)
+
+    @pytest.fixture
+    def local_importer(self, temp_library):
+        """Create a LocalImporter for testing."""
+        storage_manager = PaperStorageManager(temp_library)
+        return LocalImporter(storage_manager)
+
+    @pytest.fixture
+    def sample_pdf(self):
+        """Create a sample PDF file for testing."""
+        pdf_file = Path("./.tmp") / f"sample_{hash(self)}.pdf"
+        with pdf_file.open("wb") as f:
+            # Minimal PDF content
+            f.write(b"%PDF-1.4\n")
+            f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
+            f.write(b"%%EOF\n")
+
+        yield pdf_file
+
+        # Cleanup
+        if pdf_file.exists():
+            pdf_file.unlink()
+
+    def test_import_pdf_success(self, local_importer, sample_pdf):
+        """Test successful PDF import."""
+        metadata = local_importer.import_pdf(
+            pdf_path=sample_pdf,
+            title="Test Paper",
+            notes="Test notes",
+            tags=["test", "sample"],
+        )
+
+        # Check metadata
+        assert metadata.source_type == SourceType.LOCAL
+        assert metadata.title == "Test Paper"
+        assert metadata.notes == "Test notes"
+        assert metadata.tags == ["test", "sample"]
+        assert metadata.paper_id.startswith("local-")
+
+    def test_import_pdf_auto_title(self, local_importer, sample_pdf):
+        """Test PDF import with auto-generated title."""
+        # Rename PDF to have a meaningful name
+        meaningful_pdf = sample_pdf.parent / "Machine_Learning-Paper.pdf"
+        sample_pdf.rename(meaningful_pdf)
+
+        try:
+            metadata = local_importer.import_pdf(pdf_path=meaningful_pdf)
+
+            # Title should be auto-generated from filename
+            assert metadata.title == "Machine Learning Paper"
+
+        finally:
+            if meaningful_pdf.exists():
+                meaningful_pdf.unlink()
+
+    def test_import_nonexistent_pdf(self, local_importer):
+        """Test importing non-existent PDF file."""
+        nonexistent = Path("./.tmp/nonexistent.pdf")
+
+        with pytest.raises(FileNotFoundError):
+            local_importer.import_pdf(pdf_path=nonexistent)
+
+    def test_import_non_pdf_file(self, local_importer):
+        """Test importing non-PDF file."""
+        text_file = Path("./.tmp") / "not_a_pdf.txt"
+        with text_file.open("w") as f:
+            f.write("This is not a PDF")
+
+        try:
+            with pytest.raises(ValueError, match="File is not a PDF"):
+                local_importer.import_pdf(pdf_path=text_file)
+        finally:
+            if text_file.exists():
+                text_file.unlink()
+
+    def test_import_duplicate_pdf(self, local_importer, sample_pdf):
+        """Test importing the same PDF twice."""
+        # Import once
+        metadata1 = local_importer.import_pdf(pdf_path=sample_pdf)
+
+        # Try to import again
+        with pytest.raises(ValueError, match="Paper already imported"):
+            local_importer.import_pdf(pdf_path=sample_pdf)
+
+
+class TestArxivImporter:
+    """Test ArxivImporter functionality."""
+
+    @pytest.fixture
+    def temp_library(self):
+        """Create a temporary library for testing."""
+        temp_dir = Path("./.tmp") / f"test_arxiv_{hash(self)}"
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        library_paths = LibraryPaths.from_root(temp_dir)
+        library_paths.create_directories()
+
+        yield library_paths
+
+        # Cleanup
+        if temp_dir.exists():
+            shutil.rmtree(temp_dir)
+
+    @pytest.fixture
+    def arxiv_importer(self, temp_library):
+        """Create an ArxivImporter for testing."""
+        storage_manager = PaperStorageManager(temp_library)
+        return ArxivImporter(storage_manager)
+
+    def test_extract_arxiv_id_clean(self, arxiv_importer):
+        """Test extracting clean arXiv ID."""
+        # Test various formats
+        assert arxiv_importer.extract_arxiv_id("2212.06340") == "2212.06340"
+        assert arxiv_importer.extract_arxiv_id("arxiv:2212.06340") == "2212.06340"
+        assert arxiv_importer.extract_arxiv_id("2212.06340v1") == "2212.06340v1"
+        assert arxiv_importer.extract_arxiv_id("math-ph/0701002") == "math-ph/0701002"
+
+    def test_extract_arxiv_id_from_url(self, arxiv_importer):
+        """Test extracting arXiv ID from URLs."""
+        url = "https://arxiv.org/abs/2212.06340"
+        extracted = arxiv_importer.extract_arxiv_id(url)
+        assert extracted == "2212.06340"
+
+    def test_fetch_paper_metadata_success(self, arxiv_importer):
+        """Test successful metadata fetching from arXiv."""
+        # Mock arXiv result
+        mock_result = Mock()
+        mock_result.title = "Test Paper"
+        mock_result.authors = [Mock(name="Alice Smith"), Mock(name="Bob Jones")]
+        mock_result.published = Mock()
+        mock_result.updated = Mock()
+        mock_result.categories = ["cs.AI", "stat.ML"]
+
+        # Mock the client's results method directly
+        arxiv_importer.client.results = Mock(return_value=[mock_result])
+
+        # Test
+        result = arxiv_importer.fetch_paper_metadata("2212.06340")
+        assert result == mock_result
+
+    def test_fetch_paper_metadata_not_found(self, arxiv_importer):
+        """Test fetching metadata for non-existent paper."""
+        # Mock empty results
+        arxiv_importer.client.results = Mock(return_value=[])
+
+        with pytest.raises(ValueError, match="Paper not found on arXiv"):
+            arxiv_importer.fetch_paper_metadata("9999.99999")
+
+    @patch("paperlib.importer.arxiv_importer.tempfile.NamedTemporaryFile")
+    def test_download_pdf(self, mock_tempfile, arxiv_importer):
+        """Test PDF downloading."""
+        # Mock temporary file
+        mock_temp_path = Path("./.tmp/mock_temp.pdf")
+        mock_tempfile.return_value.__enter__.return_value.name = str(mock_temp_path)
+
+        # Mock arXiv result
+        mock_result = Mock()
+
+        # Create actual temp file for test
+        with mock_temp_path.open("wb") as f:
+            f.write(b"%PDF-1.4\n%%EOF\n")
+
+        try:
+            pdf_path = arxiv_importer.download_pdf(mock_result)
+            assert pdf_path == mock_temp_path
+            mock_result.download_pdf.assert_called_once_with(
+                filename=str(mock_temp_path)
+            )
+        finally:
+            if mock_temp_path.exists():
+                mock_temp_path.unlink()
+
+    @patch.object(ArxivImporter, "download_pdf")
+    @patch.object(ArxivImporter, "fetch_paper_metadata")
+    def test_import_arxiv_paper_success(
+        self, mock_fetch, mock_download, arxiv_importer
+    ):
+        """Test successful arXiv paper import."""
+        # Mock PDF file
+        pdf_file = Path("./.tmp") / "test_arxiv.pdf"
+        with pdf_file.open("wb") as f:
+            f.write(b"%PDF-1.4\n%%EOF\n")
+
+        try:
+            # Mock arXiv result with proper string values
+            mock_author = Mock()
+            mock_author.name = "Alice Smith"
+
+            mock_result = Mock()
+            mock_result.title = "Test ArXiv Paper"
+            mock_result.authors = [mock_author]
+            mock_result.published = None
+            mock_result.updated = None
+            mock_result.categories = ["cs.AI"]
+
+            mock_fetch.return_value = mock_result
+            mock_download.return_value = pdf_file
+
+            # Test import
+            metadata = arxiv_importer.import_arxiv_paper(
+                arxiv_input="2212.06340", notes="Test notes", tags=["test"]
+            )
+
+            # Check results
+            assert metadata.source_type == SourceType.ARXIV
+            assert metadata.source_id == "2212.06340"
+            assert metadata.title == "Test ArXiv Paper"
+            assert metadata.authors == ["Alice Smith"]
+            assert metadata.categories == ["cs.AI"]
+            assert metadata.notes == "Test notes"
+            assert metadata.tags == ["test"]
+
+        finally:
+            if pdf_file.exists():
+                pdf_file.unlink()
+
+    @patch.object(ArxivImporter, "fetch_paper_metadata")
+    def test_import_duplicate_arxiv_paper(self, mock_fetch, arxiv_importer):
+        """Test importing the same arXiv paper twice."""
+        # Mock first import
+        pdf_file = Path("./.tmp") / "test_arxiv_dup.pdf"
+        with pdf_file.open("wb") as f:
+            f.write(b"%PDF-1.4\n%%EOF\n")
+
+        try:
+            with patch.object(ArxivImporter, "download_pdf", return_value=pdf_file):
+                mock_result = Mock()
+                mock_result.title = "Test Paper"
+                mock_result.authors = []
+                mock_result.published = None
+                mock_result.updated = None
+                mock_result.categories = []
+                mock_fetch.return_value = mock_result
+
+                # First import should succeed
+                arxiv_importer.import_arxiv_paper("2212.06340")
+
+                # Second import should fail
+                with pytest.raises(ValueError, match="Paper already imported"):
+                    arxiv_importer.import_arxiv_paper("2212.06340")
+
+        finally:
+            if pdf_file.exists():
+                pdf_file.unlink()