# Integration Guide This document describes how to integrate paperlib with higher-level tools and automation workflows. ## Overview paperlib is designed as a **library engine** that higher-level tools can build upon. It provides: - **Stable CLI interface** with machine-readable JSON output - **File-based storage** that external tools can read directly - **Python API** for programmatic access - **Event hooks** for workflow integration (future) ## CLI Integration ### Machine-Readable Output Most paperlib commands support `--json` output for automation: ```bash # Get library statistics paperlib status --json { "library_root": "/home/user/papers", "total_papers": 42, "by_status": {"converted": 38, "pending": 4}, "last_updated": "2024-01-15T10:30:00Z" } # List papers with metadata paperlib list --json { "papers": [ { "paper_id": "arxiv-2212_06340", "title": "Example Paper", "authors": ["Alice Smith", "Bob Jones"], "categories": ["cs.AI"], "conversion_status": "success", "summary_status": "pending", "imported_at": "2024-01-15T10:30:00Z" } ], "total": 1 } # Import with JSON response paperlib import --arxiv 2212.06340 --json { "success": true, "paper_id": "arxiv-2212_06340", "title": "Example Paper Title", "message": "Successfully imported arXiv paper" } ``` ### Exit Codes paperlib commands follow standard Unix exit code conventions: ```bash paperlib import --arxiv 2212.06340 echo $? # 0 for success, 1 for error # Check if paper exists before processing if paperlib show "$paper_id" --json >/dev/null 2>&1; then echo "Paper exists" else echo "Paper not found" fi ``` ### Scripting Examples #### Daily arXiv Import ```bash #!/bin/bash # daily-arxiv.sh - Import papers from daily arXiv feed LIBRARY="$HOME/research" ARXIV_FEED_URL="http://export.arxiv.org/rss/cs.AI" # Parse RSS feed and extract arXiv IDs curl -s "$ARXIV_FEED_URL" | \ grep -oP 'arxiv\.org/abs/\K[0-9]{4}\.[0-9]{4,5}' | \ while read arxiv_id; do echo "Importing $arxiv_id..." 
paperlib import --arxiv "$arxiv_id" --library "$LIBRARY" --json done # Convert newly imported papers paperlib convert --library "$LIBRARY" # Generate daily report paperlib list --library "$LIBRARY" --json | \ jq '.papers | map(select(.imported_at | startswith(now | strftime("%Y-%m-%d"))))' ``` #### Batch Processing ```bash #!/bin/bash # batch-process.sh - Process multiple papers from a list LIBRARY="$HOME/research" PAPER_LIST="papers.txt" while IFS= read -r pdf_path; do if [[ -f "$pdf_path" ]]; then echo "Importing $pdf_path..." result=$(paperlib import --pdf "$pdf_path" --library "$LIBRARY" --json) if [[ $? -eq 0 ]]; then paper_id=$(echo "$result" | jq -r '.paper_id') echo "Successfully imported as $paper_id" else echo "Failed to import $pdf_path" fi fi done < "$PAPER_LIST" # Convert all pending papers paperlib convert --library "$LIBRARY" ``` ## Python API ### Direct Library Access ```python from paperlib.config import LibraryPaths from paperlib.storage import PaperStorageManager from paperlib.index import DatabaseManager from paperlib.importer import ArxivImporter, LocalImporter # Initialize library components library_paths = LibraryPaths.from_root("/path/to/library") storage = PaperStorageManager(library_paths) database = DatabaseManager(library_paths) database.initialize_database() # Import paper programmatically arxiv_importer = ArxivImporter(storage) metadata = arxiv_importer.import_arxiv_paper("2212.06340") database.index_paper(metadata) # Search and retrieve results = list(database.search_papers("neural networks")) for result in results: paper = storage.load_paper_metadata(result["paper_id"], result["source_type"]) print(f"{paper.title} by {', '.join(paper.authors)}") # Get statistics stats = database.get_statistics() print(f"Total papers: {stats['total_papers']}") ``` ### Metadata Processing ```python import json from pathlib import Path from paperlib.models import PaperMetadata, PaperSummary # Process all papers in library papers_dir = 
Path("/home/user/papers/papers") for meta_file in papers_dir.rglob("meta.json"): # Load metadata metadata = PaperMetadata.load_from_file(meta_file) # Check for summary summary_path = meta_file.parent / "summary.json" if summary_path.exists(): summary = PaperSummary.load_from_file(summary_path) # Extract key information tags = summary.problem_tags + summary.technique_tags entities = summary.entities print(f"Paper: {metadata.title}") print(f"Tags: {', '.join(tags)}") print(f"Entities: {', '.join(entities)}") ``` ## File System Integration ### Direct File Access Since paperlib uses a documented file layout, tools can read data directly: ```python import json from pathlib import Path def scan_library(library_root: Path): """Scan library and extract metadata.""" papers = [] for meta_file in library_root.glob("papers/**/meta.json"): with meta_file.open() as f: metadata = json.load(f) papers.append(metadata) return papers def find_papers_by_category(library_root: Path, category: str): """Find papers in a specific category.""" matching_papers = [] for meta_file in library_root.glob("papers/**/meta.json"): with meta_file.open() as f: metadata = json.load(f) if category in metadata.get("categories", []): matching_papers.append(metadata) return matching_papers ``` ### Watch for Changes ```python import time from pathlib import Path from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler class PaperLibraryHandler(FileSystemEventHandler): def __init__(self, library_root): self.library_root = Path(library_root) def on_created(self, event): if event.src_path.endswith("meta.json"): print(f"New paper imported: {event.src_path}") # Trigger processing workflow self.process_new_paper(event.src_path) def on_modified(self, event): if event.src_path.endswith("summary.json"): print(f"Summary updated: {event.src_path}") # Update downstream systems def process_new_paper(self, meta_path): """Handle newly imported paper.""" # Load metadata with 
open(meta_path) as f: metadata = json.load(f) # Trigger downstream processing # - Send to processing queue # - Update knowledge base # - Generate notifications # Watch library for changes observer = Observer() handler = PaperLibraryHandler("/home/user/papers") observer.schedule(handler, "/home/user/papers/papers", recursive=True) observer.start() ``` ## Higher-Level Tool Examples ### Research Dashboard ```python """research_dashboard.py - Web dashboard for research library""" from flask import Flask, jsonify, render_template from paperlib.config import LibraryPaths from paperlib.storage import PaperStorageManager from paperlib.index import DatabaseManager app = Flask(__name__) # Initialize paperlib components library_paths = LibraryPaths.from_root("/home/user/research") storage = PaperStorageManager(library_paths) database = DatabaseManager(library_paths) @app.route('/api/papers') def list_papers(): """List all papers with metadata.""" papers = list(database.list_papers(limit=50)) return jsonify(papers) @app.route('/api/search/<query>') def search_papers(query): """Search papers by query.""" results = list(database.search_papers(query, limit=20)) return jsonify(results) @app.route('/api/stats') def library_stats(): """Get library statistics.""" stats = database.get_statistics() return jsonify(stats) @app.route('/') def dashboard(): """Main dashboard page.""" return render_template('dashboard.html') if __name__ == '__main__': app.run(debug=True) ``` ### Daily Digest Generator ```python """daily_digest.py - Generate daily research digest""" import json from datetime import datetime, timedelta from pathlib import Path from paperlib.config import LibraryPaths from paperlib.index import DatabaseManager def generate_daily_digest(library_root: str, output_file: str): """Generate digest of recently imported papers.""" # Initialize database library_paths = LibraryPaths.from_root(library_root) database = DatabaseManager(library_paths) # Get papers from last 24 hours yesterday = 
datetime.now() - timedelta(days=1) yesterday_iso = yesterday.isoformat() recent_papers = [] for paper in database.list_papers(): if paper["imported_at"] >= yesterday_iso: recent_papers.append(paper) if not recent_papers: print("No new papers imported yesterday.") return # Group by category by_category = {} for paper in recent_papers: categories = json.loads(paper["categories_json"]) for category in categories: if category not in by_category: by_category[category] = [] by_category[category].append(paper) # Generate HTML digest html_content = f""" <html><head><title>Daily Research Digest - {datetime.now().strftime('%Y-%m-%d')}</title></head><body>

<h1>Daily Research Digest</h1>

<p>Found {len(recent_papers)} new papers</p>

""" for category, papers in by_category.items(): html_content += f"<h2>{category}</h2>" html_content += "</body></html>" # Write output Path(output_file).write_text(html_content) print(f"Digest written to {output_file}") if __name__ == "__main__": generate_daily_digest("/home/user/research", "digest.html") ``` ### Literature Review Assistant ```python """review_assistant.py - AI-powered literature review helper""" from paperlib.config import LibraryPaths from paperlib.index import DatabaseManager from paperlib.models import PaperSummary class ReviewAssistant: def __init__(self, library_root: str): self.library_paths = LibraryPaths.from_root(library_root) self.database = DatabaseManager(self.library_paths) def find_related_papers(self, paper_id: str, max_results: int = 10): """Find papers related to the given paper.""" # Get source paper metadata source_paper = self.database.get_paper(paper_id) if not source_paper: return [] # Extract search terms from title and categories title_words = source_paper["title"].lower().split() categories = json.loads(source_paper["categories_json"]) # Search for papers with similar keywords search_terms = title_words + categories related_papers = [] for term in search_terms: results = list(self.database.search_papers(term, limit=5)) for result in results: if result["paper_id"] != paper_id: related_papers.append(result) # Remove duplicates and return top results seen_ids = set() unique_papers = [] for paper in related_papers: if paper["paper_id"] not in seen_ids: seen_ids.add(paper["paper_id"]) unique_papers.append(paper) if len(unique_papers) >= max_results: break return unique_papers def generate_topic_overview(self, topic: str): """Generate overview of papers on a specific topic.""" # Search for papers on topic papers = list(self.database.search_papers(topic, limit=50)) if not papers: return f"No papers found for topic: {topic}" # Analyze summaries if available key_entities = set() techniques = set() for paper in papers: summary_path = Path(paper["summary_json_path"]) if summary_path.exists(): summary = 
PaperSummary.load_from_file(summary_path) key_entities.update(summary.entities) techniques.update(summary.technique_tags) # Generate overview overview = f""" Topic: {topic} Papers found: {len(papers)} Key entities mentioned: {', '.join(sorted(key_entities)[:10])} Common techniques: {', '.join(sorted(techniques)[:10])} Recent papers: """ # Add recent papers recent_papers = sorted(papers, key=lambda x: x["imported_at"], reverse=True)[:5] for paper in recent_papers: overview += f"\n- {paper['title']} ({paper['paper_id']})" return overview # Usage assistant = ReviewAssistant("/home/user/research") overview = assistant.generate_topic_overview("transformer architecture") print(overview) ``` ## Integration Patterns ### Pipeline Processing ```bash # Multi-stage processing pipeline paperlib import --arxiv 2212.06340 --json > import_result.json paper_id=$(jq -r '.paper_id' import_result.json) # Convert to markdown paperlib convert --paper-id "$paper_id" # Generate summary (when available) # paperlib summarize --paper-id "$paper_id" # Update downstream systems curl -X POST "http://research-db/api/papers" \ -H "Content-Type: application/json" \ -d @import_result.json ``` ### Event-Driven Architecture ```python """event_handler.py - Process paperlib events""" import json from pathlib import Path import pika # RabbitMQ client class PaperLibraryEventHandler: def __init__(self, rabbitmq_url: str): self.connection = pika.BlockingConnection(pika.URLParameters(rabbitmq_url)) self.channel = self.connection.channel() def on_paper_imported(self, paper_metadata: dict): """Handle new paper import.""" message = { "event": "paper_imported", "paper_id": paper_metadata["paper_id"], "title": paper_metadata["title"], "categories": paper_metadata["categories"], "timestamp": paper_metadata["imported_at"] } # Send to processing queue self.channel.basic_publish( exchange='', routing_key='paper_processing', body=json.dumps(message) ) def on_summary_generated(self, paper_id: str, summary_path: Path): 
"""Handle summary generation.""" with summary_path.open() as f: summary = json.load(f) message = { "event": "summary_generated", "paper_id": paper_id, "tags": summary["problem_tags"] + summary["technique_tags"], "entities": summary["entities"] } # Send to indexing service self.channel.basic_publish( exchange='', routing_key='summary_indexing', body=json.dumps(message) ) ``` ## Best Practices ### Error Handling ```python import subprocess import json def safe_paperlib_command(command: list[str]) -> dict: """Execute paperlib command with proper error handling.""" try: result = subprocess.run( ["paperlib"] + command + ["--json"], capture_output=True, text=True, check=True ) return json.loads(result.stdout) except subprocess.CalledProcessError as e: return { "success": False, "error": e.stderr, "exit_code": e.returncode } except json.JSONDecodeError as e: return { "success": False, "error": f"Invalid JSON response: {e}", "raw_output": result.stdout } # Usage result = safe_paperlib_command(["import", "--arxiv", "2212.06340"]) if result.get("success", True): # Assume success if no "success" field print(f"Imported paper: {result['paper_id']}") else: print(f"Import failed: {result['error']}") ``` ### Performance Optimization ```python # Batch operations for better performance from paperlib.index import DatabaseManager def batch_index_papers(library_root: str, paper_ids: list[str]): """Index multiple papers efficiently.""" database = DatabaseManager(LibraryPaths.from_root(library_root)) storage = PaperStorageManager(LibraryPaths.from_root(library_root)) # Begin transaction for batch insert with database._get_connection() as conn: for paper_id in paper_ids: metadata = storage.load_paper_metadata(paper_id, source_type) if metadata: database.index_paper(metadata) # Automatic commit on context exit ``` ### Configuration Management ```python # config_manager.py - Centralized configuration import os from pathlib import Path class ConfigManager: def __init__(self): 
self.library_root = os.getenv("PAPERLIB_ROOT", Path.home() / "research") self.api_keys = { "openai": os.getenv("OPENAI_API_KEY"), "anthropic": os.getenv("ANTHROPIC_API_KEY") } def get_library_path(self, name: str = "default") -> str: """Get library path by name.""" if name == "default": return str(self.library_root) return str(Path.home() / f"research-{name}") def paperlib_command_base(self, library_name: str = "default") -> list[str]: """Get base command for paperlib with library.""" return ["paperlib", "--library", self.get_library_path(library_name)] config = ConfigManager() # Usage in scripts import subprocess cmd = config.paperlib_command_base("arxiv") + ["list", "--json"] result = subprocess.run(cmd, capture_output=True, text=True) ``` This integration guide provides the foundation for building sophisticated research workflows on top of paperlib's stable, local-first architecture.