"""Integration tests for paperlib.""" import shutil from pathlib import Path import pytest from paperlib.config import LibraryPaths from paperlib.converter import MinerUConverter from paperlib.importer import ArxivImporter, LocalImporter from paperlib.index import DatabaseManager from paperlib.models import ConversionStatus, SourceType from paperlib.storage import PaperStorageManager class TestIntegration: """Test full integration workflows.""" @pytest.fixture def temp_library(self): """Create a temporary library for testing.""" temp_dir = Path("./.tmp") / f"test_integration_{hash(self)}" temp_dir.mkdir(parents=True, exist_ok=True) library_paths = LibraryPaths.from_root(temp_dir) library_paths.create_directories() yield library_paths # Cleanup if temp_dir.exists(): shutil.rmtree(temp_dir) @pytest.fixture def sample_pdf(self): """Create a sample PDF file for testing.""" pdf_file = Path("./.tmp") / f"integration_test_{hash(self)}.pdf" with pdf_file.open("wb") as f: # Minimal PDF content f.write(b"%PDF-1.4\n") f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") f.write(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n") f.write(b"3 0 obj\n<< /Type /Page /Parent 2 0 R >>\nendobj\n") f.write(b"%%EOF\n") yield pdf_file # Cleanup if pdf_file.exists(): pdf_file.unlink() def test_complete_local_import_workflow(self, temp_library, sample_pdf): """Test complete workflow for importing and managing a local PDF.""" # Set up components storage_manager = PaperStorageManager(temp_library) db_manager = DatabaseManager(temp_library) local_importer = LocalImporter(storage_manager) # Initialize database db_manager.initialize_database() # Import PDF metadata = local_importer.import_pdf( pdf_path=sample_pdf, title="Integration Test Paper", tags=["integration", "test"], notes="This is an integration test paper", ) # Update metadata with authors after import metadata.authors = ["Test Author"] storage_manager.update_paper_metadata(metadata) # Verify metadata assert metadata.source_type == SourceType.LOCAL assert metadata.title == "Integration Test Paper" assert metadata.authors == ["Test Author"] assert metadata.tags == ["integration", "test"] # Index in database db_manager.index_paper(metadata) # Test retrieval from database retrieved_paper = db_manager.get_paper(metadata.paper_id) assert retrieved_paper is not None assert retrieved_paper["title"] == "Integration Test Paper" # Test search functionality search_results = list(db_manager.search_papers("Integration Test")) assert len(search_results) == 1 assert search_results[0]["paper_id"] == metadata.paper_id # Test field search author_results = list(db_manager.search_by_field("author_list", "Test Author")) assert len(author_results) == 1 # Test listing papers all_papers = list(db_manager.list_papers()) assert len(all_papers) == 1 assert all_papers[0]["paper_id"] == metadata.paper_id # Test statistics stats = db_manager.get_statistics() assert stats["total_papers"] == 1 assert stats["by_source_type"]["local"] == 1 # Test updating metadata metadata.notes = "Updated notes" storage_manager.update_paper_metadata(metadata) # Re-index and verify update db_manager.index_paper(metadata) updated_paper = db_manager.get_paper(metadata.paper_id) assert "Updated notes" in updated_paper["search_text"] def test_multiple_papers_workflow(self, temp_library, sample_pdf): """Test workflow with multiple papers.""" # Set up components storage_manager = PaperStorageManager(temp_library) db_manager = DatabaseManager(temp_library) local_importer = LocalImporter(storage_manager) # Initialize database db_manager.initialize_database() # Import multiple papers (create unique PDFs) papers = [] for i in range(3): # Create unique PDF for each import unique_pdf = Path("./.tmp") / f"unique_paper_{i}_{hash(self)}.pdf" with unique_pdf.open("wb") as f: f.write(b"%PDF-1.4\n") f.write(f"% Unique content {i}\n".encode()) f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") f.write(b"%%EOF\n") try: metadata = local_importer.import_pdf( pdf_path=unique_pdf, title=f"Test Paper {i + 1}", tags=[f"tag{i + 1}", "common"], notes=f"Notes for paper {i + 1}", ) # Update metadata with authors after import metadata.authors = [f"Author {i + 1}"] storage_manager.update_paper_metadata(metadata) papers.append(metadata) db_manager.index_paper(metadata) finally: if unique_pdf.exists(): unique_pdf.unlink() # Test listing all papers all_papers = list(db_manager.list_papers()) assert len(all_papers) == 3 # Test search across papers common_tag_results = list(db_manager.search_papers("common")) assert len(common_tag_results) == 3 # Test filtering filtered_results = list(db_manager.list_papers(limit=2)) assert len(filtered_results) == 2 # Test reindexing success_count, error_count = db_manager.reindex_from_storage(storage_manager) assert success_count == 3 assert error_count == 0 # Verify papers still exist after reindex stats = db_manager.get_statistics() assert stats["total_papers"] == 3 def test_storage_and_database_consistency(self, temp_library, sample_pdf): """Test consistency between storage and database.""" # Set up components storage_manager = PaperStorageManager(temp_library) db_manager = DatabaseManager(temp_library) local_importer = LocalImporter(storage_manager) # Initialize database db_manager.initialize_database() # Import paper metadata = local_importer.import_pdf( pdf_path=sample_pdf, title="Consistency Test Paper", ) # Index in database db_manager.index_paper(metadata) # Verify file exists in storage assert storage_manager.paper_exists(metadata.paper_id, metadata.source_type) # Verify paper exists in database db_paper = db_manager.get_paper(metadata.paper_id) assert db_paper is not None # Load from storage and compare storage_metadata = storage_manager.load_paper_metadata( metadata.paper_id, metadata.source_type ) assert storage_metadata.title == db_paper["title"] assert storage_metadata.paper_id == db_paper["paper_id"] # Test reindexing maintains consistency db_manager.remove_paper(metadata.paper_id) assert db_manager.get_paper(metadata.paper_id) is None # Reindex from storage success_count, error_count = db_manager.reindex_from_storage(storage_manager) assert success_count == 1 assert error_count == 0 # Verify paper is back in database restored_paper = db_manager.get_paper(metadata.paper_id) assert restored_paper is not None assert restored_paper["title"] == "Consistency Test Paper"