Files
paperlib/docs/integration-guide.md
T
2026-04-17 16:54:30 -04:00

638 lines
18 KiB
Markdown

# Integration Guide
This document describes how to integrate paperlib with higher-level tools and automation workflows.
## Overview
paperlib is designed as a **library engine** that higher-level tools can build upon. It provides:
- **Stable CLI interface** with machine-readable JSON output
- **File-based storage** that external tools can read directly
- **Python API** for programmatic access
- **Event hooks** for workflow integration (future)
## CLI Integration
### Machine-Readable Output
Most paperlib commands support `--json` output for automation:
```bash
# Get library statistics
paperlib status --json
{
"library_root": "/home/user/papers",
"total_papers": 42,
"by_status": {"converted": 38, "pending": 4},
"last_updated": "2024-01-15T10:30:00Z"
}
# List papers with metadata
paperlib list --json
{
"papers": [
{
"paper_id": "arxiv-2212_06340",
"title": "Example Paper",
"authors": ["Alice Smith", "Bob Jones"],
"categories": ["cs.AI"],
"conversion_status": "success",
"summary_status": "pending",
"imported_at": "2024-01-15T10:30:00Z"
}
],
"total": 1
}
# Import with JSON response
paperlib import --arxiv 2212.06340 --json
{
"success": true,
"paper_id": "arxiv-2212_06340",
"title": "Example Paper Title",
"message": "Successfully imported arXiv paper"
}
```
### Exit Codes
paperlib commands follow standard Unix exit code conventions:
```bash
paperlib import --arxiv 2212.06340
echo $? # 0 for success, 1 for error

# Probe for a paper using only the exit status of `show`; its output is
# discarded, so nothing is printed besides the message below.
# ($paper_id must already be set by the surrounding script.)
if ! paperlib show "$paper_id" --json >/dev/null 2>&1; then
    echo "Paper not found"
else
    echo "Paper exists"
fi
```
### Scripting Examples
#### Daily arXiv Import
```bash
#!/bin/bash
# daily-arxiv.sh - Import papers from daily arXiv feed
LIBRARY="$HOME/research"
ARXIV_FEED_URL="http://export.arxiv.org/rss/cs.AI"

# Parse RSS feed and extract arXiv IDs.
#   curl -f  : fail on HTTP errors instead of feeding an error page to grep
#   curl -L  : follow redirects if the feed has moved
#   sort -u  : the same ID can occur more than once in a feed; import it once
curl -fsSL "$ARXIV_FEED_URL" | \
    grep -oP 'arxiv\.org/abs/\K[0-9]{4}\.[0-9]{4,5}' | \
    sort -u | \
    while IFS= read -r arxiv_id; do
        echo "Importing $arxiv_id..."
        # stdin comes from /dev/null so the import can never consume the
        # remaining IDs that the loop is reading from the pipe.
        paperlib import --arxiv "$arxiv_id" --library "$LIBRARY" --json < /dev/null
    done

# Convert newly imported papers
paperlib convert --library "$LIBRARY"

# Generate daily report: keep papers whose imported_at starts with today's
# date (jq's `now | strftime` yields the UTC date).
paperlib list --library "$LIBRARY" --json | \
    jq '(now | strftime("%Y-%m-%d")) as $today
        | .papers
        | map(select(.imported_at | startswith($today)))'
```
#### Batch Processing
```bash
#!/bin/bash
# batch-process.sh - Process multiple papers from a list
LIBRARY="$HOME/research"
PAPER_LIST="papers.txt"

# `|| [[ -n "$pdf_path" ]]` still processes the final entry when the list
# file does not end with a newline (plain `read` returns non-zero there).
while IFS= read -r pdf_path || [[ -n "$pdf_path" ]]; do
    if [[ -f "$pdf_path" ]]; then
        echo "Importing $pdf_path..."
        # Test the command itself rather than inspecting $? afterwards.
        # stdin is /dev/null so the import cannot swallow the rest of the list.
        if result=$(paperlib import --pdf "$pdf_path" --library "$LIBRARY" --json < /dev/null); then
            paper_id=$(jq -r '.paper_id' <<< "$result")
            echo "Successfully imported as $paper_id"
        else
            echo "Failed to import $pdf_path"
        fi
    fi
done < "$PAPER_LIST"

# Convert all pending papers
paperlib convert --library "$LIBRARY"
```
## Python API
### Direct Library Access
```python
from paperlib.config import LibraryPaths
from paperlib.storage import PaperStorageManager
from paperlib.index import DatabaseManager
from paperlib.importer import ArxivImporter, LocalImporter
# Initialize library components
library_paths = LibraryPaths.from_root("/path/to/library")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)
database.initialize_database()

# Import paper programmatically, then add it to the search index
arxiv_importer = ArxivImporter(storage)
metadata = arxiv_importer.import_arxiv_paper("2212.06340")
database.index_paper(metadata)

# Search and retrieve: each hit carries the paper_id/source_type pair that
# the storage manager needs to load the full metadata record.
results = list(database.search_papers("neural networks"))
for result in results:
    paper = storage.load_paper_metadata(result["paper_id"], result["source_type"])
    print(f"{paper.title} by {', '.join(paper.authors)}")

# Get statistics
stats = database.get_statistics()
print(f"Total papers: {stats['total_papers']}")
```
### Metadata Processing
```python
import json
from pathlib import Path
from paperlib.models import PaperMetadata, PaperSummary
# Process all papers in library
papers_dir = Path("/home/user/papers/papers")
for meta_file in papers_dir.rglob("meta.json"):
    # Load metadata
    metadata = PaperMetadata.load_from_file(meta_file)

    # Check for summary (stored next to meta.json in the same directory)
    summary_path = meta_file.parent / "summary.json"
    if summary_path.exists():
        summary = PaperSummary.load_from_file(summary_path)

        # Extract key information
        tags = summary.problem_tags + summary.technique_tags
        entities = summary.entities

        # Reported only for summarized papers: tags/entities do not exist
        # for a paper that has no summary.json yet.
        print(f"Paper: {metadata.title}")
        print(f"Tags: {', '.join(tags)}")
        print(f"Entities: {', '.join(entities)}")
```
## File System Integration
### Direct File Access
Since paperlib uses a documented file layout, tools can read data directly:
```python
import json
from pathlib import Path
def scan_library(library_root: Path):
    """Scan library and extract metadata.

    Args:
        library_root: Root directory of the library. Every ``meta.json``
            found at any depth below ``<library_root>/papers`` is read.

    Returns:
        A list with one decoded JSON document per ``meta.json``, ordered by
        file path.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    papers = []
    # sorted(): glob order depends on the filesystem; sorting keeps the
    # result stable between runs.
    for meta_file in sorted(library_root.glob("papers/**/meta.json")):
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with meta_file.open(encoding="utf-8") as f:
            papers.append(json.load(f))
    return papers
def find_papers_by_category(library_root: Path, category: str):
    """Find papers in a specific category.

    Args:
        library_root: Root directory of the library. Every ``meta.json``
            found at any depth below ``<library_root>/papers`` is read.
        category: Category string that must appear in a paper's
            ``"categories"`` list (exact match).

    Returns:
        The decoded metadata documents of all matching papers, ordered by
        file path.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    matching_papers = []
    # sorted(): glob order depends on the filesystem; sorting keeps the
    # result stable between runs.
    for meta_file in sorted(library_root.glob("papers/**/meta.json")):
        with meta_file.open(encoding="utf-8") as f:
            metadata = json.load(f)
        # `or []` also covers an explicit `"categories": null`, which
        # `.get(..., [])` alone would pass straight to the `in` test.
        if category in (metadata.get("categories") or []):
            matching_papers.append(metadata)
    return matching_papers
```
### Watch for Changes
```python
import time
from pathlib import Path
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
class PaperLibraryHandler(FileSystemEventHandler):
    """React to paperlib files being created or modified in a watched tree."""

    def __init__(self, library_root):
        """Remember the library root.

        Args:
            library_root: Path (str or Path) of the library being watched.
        """
        super().__init__()
        self.library_root = Path(library_root)

    def on_created(self, event):
        """Start processing when a new ``meta.json`` file appears."""
        if event.is_directory:
            return
        # Compare the file name itself; a plain suffix test would also match
        # names such as "old_meta.json".
        if Path(event.src_path).name == "meta.json":
            print(f"New paper imported: {event.src_path}")
            # Trigger processing workflow
            self.process_new_paper(event.src_path)

    def on_modified(self, event):
        """Report changes to a ``summary.json`` file."""
        if event.is_directory:
            return
        if Path(event.src_path).name == "summary.json":
            print(f"Summary updated: {event.src_path}")
            # Update downstream systems

    def process_new_paper(self, meta_path):
        """Handle newly imported paper.

        Args:
            meta_path: Path of the ``meta.json`` file that was created.
        """
        # Local import: the example's import list does not include json.
        import json

        # Load metadata. The "created" event can arrive before the writer
        # has finished, so an unreadable or incomplete file is reported
        # instead of crashing the observer thread.
        try:
            with open(meta_path, encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            print(f"Could not read {meta_path}: {exc}")
            return

        # Trigger downstream processing using `metadata`
        # - Send to processing queue
        # - Update knowledge base
        # - Generate notifications
# Watch library for changes
observer = Observer()
handler = PaperLibraryHandler("/home/user/papers")
observer.schedule(handler, "/home/user/papers/papers", recursive=True)
observer.start()

# start() returns immediately and events are delivered on the observer's own
# thread, so the main thread has to stay alive for the watcher to keep
# running. Ctrl+C ends the loop; the observer is then stopped and joined so
# the process exits cleanly.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    observer.stop()
    observer.join()
```
## Higher-Level Tool Examples
### Research Dashboard
```python
"""research_dashboard.py - Web dashboard for research library"""
from flask import Flask, jsonify, render_template
from paperlib.config import LibraryPaths
from paperlib.storage import PaperStorageManager
from paperlib.index import DatabaseManager
app = Flask(__name__)

# Initialize paperlib components once at import time; all routes share them.
library_paths = LibraryPaths.from_root("/home/user/research")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)


@app.route('/api/papers')
def list_papers():
    """List all papers with metadata (at most 50) as a JSON array."""
    papers = list(database.list_papers(limit=50))
    return jsonify(papers)


@app.route('/api/search/<query>')
def search_papers(query):
    """Search papers by the query taken from the URL path (at most 20 hits)."""
    results = list(database.search_papers(query, limit=20))
    return jsonify(results)


@app.route('/api/stats')
def library_stats():
    """Get library statistics as a JSON object."""
    stats = database.get_statistics()
    return jsonify(stats)


@app.route('/')
def dashboard():
    """Main dashboard page, rendered from the dashboard.html template."""
    return render_template('dashboard.html')


if __name__ == '__main__':
    # NOTE: debug=True turns on Flask's reloader and interactive debugger.
    # Use it for local development only, never on a reachable host.
    app.run(debug=True)
```
### Daily Digest Generator
```python
"""daily_digest.py - Generate daily research digest"""
import json
from datetime import datetime, timedelta
from pathlib import Path
from paperlib.config import LibraryPaths
from paperlib.index import DatabaseManager
def generate_daily_digest(library_root: str, output_file: str):
    """Generate digest of recently imported papers.

    Writes an HTML page listing every paper imported during the last 24
    hours, grouped by category. A paper with several categories is listed
    under each of them. Nothing is written when there are no recent papers.

    Args:
        library_root: Root directory of the paperlib library.
        output_file: Path of the HTML file to write (UTF-8).
    """
    # Local imports: the example's import list has neither of these.
    import html
    from datetime import timezone

    # Initialize database
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)

    # Get papers from last 24 hours. imported_at values are compared as
    # strings, so the cutoff uses the UTC "...Z" layout shown in the CLI's
    # JSON output; a naive local-time cutoff would be off by the UTC offset.
    # NOTE(review): confirm the index stores imported_at in this layout.
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    cutoff_iso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
    recent_papers = [
        paper for paper in database.list_papers()
        if paper["imported_at"] >= cutoff_iso
    ]
    if not recent_papers:
        print("No new papers imported yesterday.")
        return

    # Group by category
    by_category = {}
    for paper in recent_papers:
        # A paper without categories would be counted in the header but
        # never listed, so it goes into a catch-all group.
        categories = json.loads(paper["categories_json"]) or ["Uncategorized"]
        for category in categories:
            by_category.setdefault(category, []).append(paper)

    # Generate HTML digest. Titles, IDs and categories come from imported
    # metadata, so they are escaped before being placed in markup.
    today = datetime.now().strftime('%Y-%m-%d')
    parts = [
        "<html>",
        '<head><meta charset="utf-8">'
        f"<title>Daily Research Digest - {today}</title></head>",
        "<body>",
        "<h1>Daily Research Digest</h1>",
        f"<p>Found {len(recent_papers)} new papers</p>",
    ]
    for category, papers in by_category.items():
        parts.append(f"<h2>{html.escape(category)}</h2><ul>")
        for paper in papers:
            title = html.escape(paper["title"])
            paper_id = html.escape(paper["paper_id"])
            parts.append(f"<li><strong>{title}</strong> ({paper_id})</li>")
        parts.append("</ul>")
    parts.append("</body></html>")

    # Write output
    Path(output_file).write_text("\n".join(parts), encoding="utf-8")
    print(f"Digest written to {output_file}")


if __name__ == "__main__":
    generate_daily_digest("/home/user/research", "digest.html")
```
### Literature Review Assistant
```python
"""review_assistant.py - AI-powered literature review helper"""
import json
from pathlib import Path

from paperlib.config import LibraryPaths
from paperlib.index import DatabaseManager
from paperlib.models import PaperSummary
class ReviewAssistant:
    """Literature-review helper built on the paperlib search index."""

    def __init__(self, library_root: str):
        """Open the index of the library rooted at ``library_root``."""
        self.library_paths = LibraryPaths.from_root(library_root)
        self.database = DatabaseManager(self.library_paths)

    def find_related_papers(self, paper_id: str, max_results: int = 10):
        """Find papers related to the given paper.

        Each word of the source paper's title and each of its categories is
        used as a separate search term (up to 5 hits per term).

        Args:
            paper_id: ID of the paper to start from.
            max_results: Maximum number of related papers to return.

        Returns:
            Distinct search hits in the order found, never including the
            source paper. Empty when the paper is unknown or when
            ``max_results`` is not positive.
        """
        if max_results <= 0:
            return []

        # Get source paper metadata
        source_paper = self.database.get_paper(paper_id)
        if not source_paper:
            return []

        # Extract search terms from title and categories
        title_words = source_paper["title"].lower().split()
        categories = json.loads(source_paper["categories_json"])
        # dict.fromkeys drops repeated terms but keeps first-seen order, so
        # the same search is not issued twice.
        search_terms = list(dict.fromkeys(title_words + categories))

        # Seeding seen_ids with the source paper excludes it from the result.
        seen_ids = {paper_id}
        unique_papers = []
        for term in search_terms:
            for result in self.database.search_papers(term, limit=5):
                if result["paper_id"] in seen_ids:
                    continue
                seen_ids.add(result["paper_id"])
                unique_papers.append(result)
                # Stop searching as soon as enough papers were collected.
                if len(unique_papers) >= max_results:
                    return unique_papers
        return unique_papers

    def generate_topic_overview(self, topic: str):
        """Generate overview of papers on a specific topic.

        Args:
            topic: Search query describing the topic.

        Returns:
            A plain-text overview (paper count, up to 10 entities and 10
            techniques taken from the available summaries, and the 5 most
            recently imported papers), or a "No papers found" message when
            the search returns nothing.
        """
        # Search for papers on topic
        papers = list(self.database.search_papers(topic, limit=50))
        if not papers:
            return f"No papers found for topic: {topic}"

        # Analyze summaries if available
        key_entities = set()
        techniques = set()
        for paper in papers:
            raw_path = paper["summary_json_path"]
            if not raw_path:
                # No summary recorded for this paper; Path(None) would raise.
                continue
            summary_path = Path(raw_path)
            if summary_path.exists():
                summary = PaperSummary.load_from_file(summary_path)
                key_entities.update(summary.entities)
                techniques.update(summary.technique_tags)

        # Generate overview. The leading and trailing empty strings keep the
        # blank line before "Topic:" and the newline after "Recent papers:".
        overview = "\n".join([
            "",
            f"Topic: {topic}",
            f"Papers found: {len(papers)}",
            "Key entities mentioned:",
            ", ".join(sorted(key_entities)[:10]),
            "Common techniques:",
            ", ".join(sorted(techniques)[:10]),
            "Recent papers:",
            "",
        ])

        # Add recent papers, newest import first
        recent_papers = sorted(papers, key=lambda x: x["imported_at"], reverse=True)[:5]
        for paper in recent_papers:
            overview += f"\n- {paper['title']} ({paper['paper_id']})"
        return overview
# Usage
# Open the library rooted at /home/user/research
assistant = ReviewAssistant("/home/user/research")
# Returns plain text: either the overview or a "No papers found" message
overview = assistant.generate_topic_overview("transformer architecture")
print(overview)
```
## Integration Patterns
### Pipeline Processing
```bash
#!/bin/bash
# Multi-stage processing pipeline
# Stop at the first failing stage so a failed import is never converted or
# posted downstream.
set -euo pipefail

paperlib import --arxiv 2212.06340 --json > import_result.json
# -e: jq exits non-zero when .paper_id is missing or null, which aborts the
# pipeline instead of continuing with the literal string "null".
paper_id=$(jq -er '.paper_id' import_result.json)

# Convert to markdown
paperlib convert --paper-id "$paper_id"

# Generate summary (when available)
# paperlib summarize --paper-id "$paper_id"

# Update downstream systems (-f: treat HTTP error responses as failures)
curl -fsS -X POST "http://research-db/api/papers" \
    -H "Content-Type: application/json" \
    -d @import_result.json
```
### Event-Driven Architecture
```python
"""event_handler.py - Process paperlib events"""
import json
from pathlib import Path
import pika # RabbitMQ client
class PaperLibraryEventHandler:
    """Publish paperlib events as JSON messages to RabbitMQ.

    Messages go to the default exchange with the queue name as routing key.
    NOTE(review): nothing here declares the queues; presumably they are
    created elsewhere - confirm, otherwise messages may not be delivered.

    The handler can be used as a context manager so the connection is closed
    when the block exits.
    """

    def __init__(self, rabbitmq_url: str):
        """Connect to the broker at ``rabbitmq_url`` and open a channel."""
        self.connection = pika.BlockingConnection(pika.URLParameters(rabbitmq_url))
        self.channel = self.connection.channel()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the broker connection."""
        self.connection.close()

    def _publish(self, routing_key: str, message: dict):
        """Serialize ``message`` to JSON and publish it to ``routing_key``."""
        self.channel.basic_publish(
            exchange='',
            routing_key=routing_key,
            body=json.dumps(message)
        )

    def on_paper_imported(self, paper_metadata: dict):
        """Handle new paper import.

        Args:
            paper_metadata: Paper metadata; must contain ``paper_id``,
                ``title``, ``categories`` and ``imported_at``.
        """
        message = {
            "event": "paper_imported",
            "paper_id": paper_metadata["paper_id"],
            "title": paper_metadata["title"],
            "categories": paper_metadata["categories"],
            "timestamp": paper_metadata["imported_at"]
        }
        # Send to processing queue
        self._publish('paper_processing', message)

    def on_summary_generated(self, paper_id: str, summary_path: Path):
        """Handle summary generation.

        Args:
            paper_id: ID of the summarized paper.
            summary_path: Path of the summary JSON file; must contain
                ``problem_tags``, ``technique_tags`` and ``entities``.
        """
        with summary_path.open(encoding="utf-8") as f:
            summary = json.load(f)
        message = {
            "event": "summary_generated",
            "paper_id": paper_id,
            "tags": summary["problem_tags"] + summary["technique_tags"],
            "entities": summary["entities"]
        }
        # Send to indexing service
        self._publish('summary_indexing', message)
```
## Best Practices
### Error Handling
```python
import subprocess
import json
def safe_paperlib_command(command: list[str], *, executable: str = "paperlib") -> dict:
    """Execute paperlib command with proper error handling.

    ``--json`` is appended to the command and the output is parsed as JSON.
    Failures are returned as a dictionary instead of being raised.

    Args:
        command: Sub-command and arguments, e.g. ``["import", "--arxiv", ID]``.
        executable: Program to run; override when paperlib is not on PATH.

    Returns:
        The decoded JSON output on success. Otherwise a dictionary with
        ``"success": False`` and an ``"error"`` text, plus ``"exit_code"``
        (``None`` when the program could not be started) or ``"raw_output"``
        (when the output was not valid JSON).
    """
    try:
        result = subprocess.run(
            [executable, *command, "--json"],
            capture_output=True,
            text=True,
            check=True
        )
    except FileNotFoundError as e:
        # The executable is not installed / not on PATH: report it in the
        # same structured form as any other failure.
        return {
            "success": False,
            "error": f"Executable not found: {executable} ({e})",
            "exit_code": None
        }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": e.stderr,
            "exit_code": e.returncode
        }

    # Parsed outside the first try so `result` is always bound here.
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"Invalid JSON response: {e}",
            "raw_output": result.stdout
        }
# Usage
result = safe_paperlib_command(["import", "--arxiv", "2212.06340"])
# Error dictionaries always carry "success": False; a successful response is
# the command's own JSON, which may not have a "success" field at all.
if result.get("success", True):  # Assume success if no "success" field
    print(f"Imported paper: {result['paper_id']}")
else:
    print(f"Import failed: {result['error']}")
```
### Performance Optimization
```python
# Batch operations for better performance
from paperlib.index import DatabaseManager
def batch_index_papers(library_root: str, paper_ids: list[str], source_type: str = "arxiv"):
    """Index multiple papers efficiently.

    Args:
        library_root: Root directory of the paperlib library.
        paper_ids: IDs of the papers to (re-)index.
        source_type: Source type passed to the storage manager when loading
            each paper. NOTE(review): the default "arxiv" is an assumption
            based on the "arxiv-..." paper IDs - confirm the valid values.

    Papers whose metadata cannot be loaded are skipped.
    """
    # Local imports: the example only imports DatabaseManager.
    from paperlib.config import LibraryPaths
    from paperlib.storage import PaperStorageManager

    # Build the path configuration once and share it.
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)
    storage = PaperStorageManager(library_paths)

    # Begin transaction for batch insert.
    # NOTE(review): this relies on the private _get_connection() helper, and
    # index_paper() is not handed this connection; whether the inserts join
    # this transaction depends on DatabaseManager internals - confirm.
    with database._get_connection():
        for paper_id in paper_ids:
            metadata = storage.load_paper_metadata(paper_id, source_type)
            if metadata:
                database.index_paper(metadata)
    # Automatic commit on context exit
```
### Configuration Management
```python
# config_manager.py - Centralized configuration
import os
from pathlib import Path
class ConfigManager:
    """Centralized configuration read from environment variables."""

    def __init__(self):
        """Read the library root and API keys from the environment.

        ``PAPERLIB_ROOT`` selects the default library; when it is unset or
        empty, ``~/research`` is used. API keys are ``None`` when their
        variable is not set.
        """
        default_root = Path.home() / "research"
        # Always a Path (the raw getenv result would be a str), with a
        # leading "~" expanded; `or` makes an empty variable fall back too.
        self.library_root = Path(os.getenv("PAPERLIB_ROOT") or default_root).expanduser()
        self.api_keys = {
            "openai": os.getenv("OPENAI_API_KEY"),
            "anthropic": os.getenv("ANTHROPIC_API_KEY")
        }

    def get_library_path(self, name: str = "default") -> str:
        """Get library path by name.

        ``"default"`` maps to the configured library root; any other name
        maps to ``~/research-<name>``.
        """
        if name == "default":
            return str(self.library_root)
        return str(Path.home() / f"research-{name}")

    def paperlib_command_base(self, library_name: str = "default") -> list[str]:
        """Get base command for paperlib with library.

        Returns:
            ``["paperlib", "--library", <path>]``, ready to be extended with
            a sub-command and its arguments.
        """
        return ["paperlib", "--library", self.get_library_path(library_name)]
config = ConfigManager()
# Usage in scripts
import subprocess
# "arxiv" is not the default library, so the base command points at
# ~/research-arxiv; the sub-command and its flags are appended to it.
cmd = config.paperlib_command_base("arxiv") + ["list", "--json"]
# capture_output/text: stdout and stderr are collected as strings on `result`
result = subprocess.run(cmd, capture_output=True, text=True)
```
This integration guide provides the foundation for building sophisticated research workflows on top of paperlib's stable, local-first architecture.