# Integration Guide
|
|
|
|
This document describes how to integrate paperlib with higher-level tools and automation workflows.
|
|
|
|
## Overview
|
|
|
|
paperlib is designed as a **library engine** that higher-level tools can build upon. It provides:
|
|
|
|
- **Stable CLI interface** with machine-readable JSON output
|
|
- **File-based storage** that external tools can read directly
|
|
- **Python API** for programmatic access
|
|
- **Event hooks** for workflow integration (future)
|
|
|
|
## CLI Integration
|
|
|
|
### Machine-Readable Output
|
|
|
|
Most paperlib commands support `--json` output for automation:
|
|
|
|
```bash
|
|
# Get library configuration
|
|
paperlib status --json
|
|
{
|
|
"success": true,
|
|
"timestamp": "2024-01-15T10:30:00.000Z",
|
|
"library_root": "/home/user/papers",
|
|
"config_path": "/home/user/papers/config/config.toml",
|
|
"database_path": "/home/user/papers/db/paperlib.sqlite3",
|
|
"papers_dir": "/home/user/papers/papers",
|
|
"inbox_dir": "/home/user/papers/inbox",
|
|
"cache_dir": "/home/user/papers/cache"
|
|
}
|
|
|
|
# List papers with metadata
|
|
paperlib list --json
|
|
{
|
|
"success": true,
|
|
"timestamp": "2024-01-15T10:30:00.000Z",
|
|
"papers": [
|
|
{
|
|
"paper_id": "arxiv-2212_06340",
|
|
"source_type": "arxiv",
|
|
"source_id": "2212.06340",
|
|
"title": "Example Paper",
|
|
"authors": ["Alice Smith", "Bob Jones"],
|
|
"published_date": "2022-12-06T00:00:00.000Z",
|
|
"categories": ["cs.AI"],
|
|
"conversion_status": "success",
|
|
"summary_status": "pending",
|
|
"imported_at": "2024-01-15T10:30:00.000Z",
|
|
"tags": [],
|
|
"notes": ""
|
|
}
|
|
],
|
|
"total": 1
|
|
}
|
|
|
|
# Import with JSON response
|
|
paperlib import --arxiv 2212.06340 --json
|
|
{
|
|
"success": true,
|
|
"timestamp": "2024-01-15T10:30:00.000Z",
|
|
"paper_id": "arxiv-2212_06340",
|
|
"title": "Example Paper Title",
|
|
"source_type": "arxiv",
|
|
"source_id": "2212.06340",
|
|
"authors": ["Alice Smith", "Bob Jones"],
|
|
"message": "Successfully imported arXiv paper",
|
|
"paper": {
|
|
// Full paper metadata object
|
|
}
|
|
}
|
|
|
|
# Convert papers with JSON output
|
|
paperlib convert --json
|
|
{
|
|
"success": true,
|
|
"timestamp": "2024-01-15T10:30:00.000Z",
|
|
"action": "convert_pending",
|
|
"success_count": 5,
|
|
"failure_count": 1,
|
|
"total_attempted": 6
|
|
}
|
|
|
|
# Reindex with JSON output
|
|
paperlib reindex --json
|
|
{
|
|
"success": true,
|
|
"timestamp": "2024-01-15T10:30:00.000Z",
|
|
"reindex_complete": true,
|
|
"papers_indexed": 42,
|
|
"errors": 1,
|
|
"statistics": {
|
|
"total_papers": 42,
|
|
"by_source_type": {
|
|
"arxiv": 38,
|
|
"local": 4
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
### Exit Codes
|
|
|
|
paperlib commands follow standard Unix exit code conventions:
|
|
|
|
```bash
|
|
paperlib import --arxiv 2212.06340
|
|
echo $? # 0 for success, 1 for error
|
|
|
|
# Check if paper exists before processing
|
|
if paperlib show "$paper_id" --json >/dev/null 2>&1; then
|
|
echo "Paper exists"
|
|
else
|
|
echo "Paper not found"
|
|
fi
|
|
```
|
|
|
|
### Scripting Examples
|
|
|
|
#### Daily arXiv Import
|
|
|
|
```bash
|
|
#!/bin/bash
# daily-arxiv.sh - Import papers from daily arXiv feed

LIBRARY="$HOME/research"
ARXIV_FEED_URL="http://export.arxiv.org/rss/cs.AI"

# Parse RSS feed and extract arXiv IDs.
#   curl -f   : fail on HTTP errors instead of piping an error page into grep
#   sort -u   : an ID can appear several times in the feed; import it once
#   read -r   : do not treat backslashes in the input as escapes
curl -fsS "$ARXIV_FEED_URL" | \
    grep -oP 'arxiv\.org/abs/\K[0-9]{4}\.[0-9]{4,5}' | \
    sort -u | \
    while IFS= read -r arxiv_id; do
        echo "Importing $arxiv_id..."
        # One failed import must not hide the others; report it and continue.
        paperlib import --arxiv "$arxiv_id" --library "$LIBRARY" --json \
            || echo "Failed to import $arxiv_id" >&2
    done

# Convert newly imported papers with JSON output
paperlib convert --library "$LIBRARY" --json

# Generate daily report (papers whose imported_at starts with today's UTC date)
paperlib list --library "$LIBRARY" --json | \
    jq '.papers | map(select(.imported_at | startswith(now | strftime("%Y-%m-%d"))))'
|
|
```
|
|
|
|
#### Batch Processing
|
|
|
|
```bash
|
|
#!/bin/bash
# batch-process.sh - Process multiple papers from a list

LIBRARY="$HOME/research"
PAPER_LIST="papers.txt"

# `|| [[ -n ... ]]` keeps the last line even when the list file has no
# trailing newline.
while IFS= read -r pdf_path || [[ -n "$pdf_path" ]]; do
    # Ignore blank lines in the list.
    [[ -z "$pdf_path" ]] && continue

    if [[ -f "$pdf_path" ]]; then
        echo "Importing $pdf_path..."
        # </dev/null stops the command from consuming the rest of $PAPER_LIST,
        # which is this loop's stdin.
        if result=$(paperlib import --pdf "$pdf_path" --library "$LIBRARY" --json </dev/null); then
            paper_id=$(echo "$result" | jq -r '.paper_id')
            echo "Successfully imported as $paper_id"
        else
            echo "Failed to import $pdf_path"
        fi
    fi
done < "$PAPER_LIST"

# Convert all pending papers with JSON output
paperlib convert --library "$LIBRARY" --json
|
|
```
|
|
|
|
## Python API
|
|
|
|
### Direct Library Access
|
|
|
|
```python
|
|
from paperlib.config import LibraryPaths
|
|
from paperlib.storage import PaperStorageManager
|
|
from paperlib.index import DatabaseManager
|
|
from paperlib.importer import ArxivImporter, LocalImporter
|
|
|
|
# Initialize library components
library_paths = LibraryPaths.from_root("/path/to/library")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)
database.initialize_database()

# Import paper programmatically
arxiv_importer = ArxivImporter(storage)
metadata = arxiv_importer.import_arxiv_paper("2212.06340")
database.index_paper(metadata)

# Search and retrieve (iterate the results directly; no intermediate list)
for result in database.search_papers("neural networks"):
    paper = storage.load_paper_metadata(result["paper_id"], result["source_type"])
    # The batch-indexing example in this guide checks this call's result for
    # falsiness, so guard here too rather than assume metadata always loads.
    if not paper:
        continue
    print(f"{paper.title} by {', '.join(paper.authors)}")

# Get statistics
stats = database.get_statistics()
print(f"Total papers: {stats['total_papers']}")
|
|
```
|
|
|
|
### Metadata Processing
|
|
|
|
```python
|
|
import json
|
|
from pathlib import Path
|
|
from paperlib.models import PaperMetadata, PaperSummary
|
|
|
|
# Process all papers in library
papers_dir = Path("/home/user/papers/papers")

for meta_file in papers_dir.rglob("meta.json"):
    # Load metadata
    metadata = PaperMetadata.load_from_file(meta_file)

    # Check for summary; papers without one have no tags/entities to report
    summary_path = meta_file.parent / "summary.json"
    if not summary_path.exists():
        continue
    summary = PaperSummary.load_from_file(summary_path)

    # Extract key information
    tags = summary.problem_tags + summary.technique_tags
    entities = summary.entities

    print(f"Paper: {metadata.title}")
    print(f"Tags: {', '.join(tags)}")
    print(f"Entities: {', '.join(entities)}")
|
|
```
|
|
|
|
## File System Integration
|
|
|
|
### Direct File Access
|
|
|
|
Since paperlib uses a documented file layout, tools can read data directly:
|
|
|
|
```python
|
|
import json
|
|
from pathlib import Path
|
|
|
|
def scan_library(library_root: Path) -> list:
    """Scan library and extract metadata.

    Args:
        library_root: Root directory of the library (a ``Path`` or a string
            path). Every ``meta.json`` found anywhere below
            ``<library_root>/papers`` is read.

    Returns:
        The parsed JSON content of each ``meta.json``, ordered by file path
        so repeated scans return the same order.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    papers = []

    # sorted(): glob order is filesystem-dependent; sorting keeps it stable.
    for meta_file in sorted(Path(library_root).glob("papers/**/meta.json")):
        # Explicit UTF-8: the platform default encoding may not be UTF-8 and
        # would then mis-decode non-ASCII titles or author names.
        with meta_file.open(encoding="utf-8") as f:
            papers.append(json.load(f))

    return papers
|
|
|
|
def find_papers_by_category(library_root: Path, category: str) -> list:
    """Find papers in a specific category.

    Args:
        library_root: Root directory of the library (a ``Path`` or a string
            path). Every ``meta.json`` below ``<library_root>/papers`` is
            inspected.
        category: Category to look for in each paper's ``"categories"`` list
            (for example ``"cs.AI"``).

    Returns:
        The parsed metadata of every paper whose ``"categories"`` contains
        ``category``, ordered by file path.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    matching_papers = []

    for meta_file in sorted(Path(library_root).glob("papers/**/meta.json")):
        # Explicit UTF-8 so the result does not depend on the platform default.
        with meta_file.open(encoding="utf-8") as f:
            metadata = json.load(f)

        # `or []` also covers "categories": null, which would otherwise make
        # the membership test raise TypeError.
        if category in (metadata.get("categories") or []):
            matching_papers.append(metadata)

    return matching_papers
|
|
```
|
|
|
|
### Watch for Changes
|
|
|
|
```python
|
|
import json
import time
from pathlib import Path

from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


class PaperLibraryHandler(FileSystemEventHandler):
    """React to files appearing or changing inside a paperlib library."""

    def __init__(self, library_root):
        super().__init__()
        self.library_root = Path(library_root)

    def on_created(self, event):
        """Start the processing workflow when a new ``meta.json`` appears."""
        if event.src_path.endswith("meta.json"):
            print(f"New paper imported: {event.src_path}")
            # Trigger processing workflow
            self.process_new_paper(event.src_path)

    def on_modified(self, event):
        """Report updates to ``summary.json`` files."""
        if event.src_path.endswith("summary.json"):
            print(f"Summary updated: {event.src_path}")
            # Update downstream systems

    def process_new_paper(self, meta_path):
        """Handle newly imported paper."""
        # Load metadata
        # NOTE(review): the creation event may fire before the writer has
        # finished the file; add a retry if partial reads are seen in practice.
        with open(meta_path, encoding="utf-8") as f:
            metadata = json.load(f)

        # Trigger downstream processing
        # - Send to processing queue
        # - Update knowledge base
        # - Generate notifications


# Watch library for changes
observer = Observer()
handler = PaperLibraryHandler("/home/user/papers")
observer.schedule(handler, "/home/user/papers/papers", recursive=True)
observer.start()

# Keep the main thread alive; without this the script ends right after
# start() and no events are ever handled. Ctrl+C stops the observer cleanly.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()
observer.join()
|
|
```
|
|
|
|
## Higher-Level Tool Examples
|
|
|
|
### Research Dashboard
|
|
|
|
```python
|
|
"""research_dashboard.py - Web dashboard for research library"""
|
|
|
|
from flask import Flask, jsonify, render_template
|
|
from paperlib.config import LibraryPaths
|
|
from paperlib.storage import PaperStorageManager
|
|
from paperlib.index import DatabaseManager
|
|
|
|
app = Flask(__name__)

# Initialize paperlib components
library_paths = LibraryPaths.from_root("/home/user/research")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)


@app.route('/api/papers')
def list_papers():
    """List all papers with metadata."""
    papers = list(database.list_papers(limit=50))
    return jsonify(papers)


@app.route('/api/search/<query>')
def search_papers(query):
    """Search papers by query."""
    results = list(database.search_papers(query, limit=20))
    return jsonify(results)


@app.route('/api/stats')
def library_stats():
    """Get library statistics."""
    stats = database.get_statistics()
    return jsonify(stats)


@app.route('/')
def dashboard():
    """Main dashboard page."""
    return render_template('dashboard.html')


if __name__ == '__main__':
    # Debug mode enables an in-browser debugger that can execute arbitrary
    # code, so it stays off here. Use `flask --app research_dashboard run
    # --debug` for local development only.
    app.run(debug=False)
|
|
```
|
|
|
|
### Daily Digest Generator
|
|
|
|
```python
|
|
"""daily_digest.py - Generate daily research digest"""
|
|
|
|
import html
import json
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path

from paperlib.config import LibraryPaths
from paperlib.index import DatabaseManager


def _parse_timestamp(value: str) -> datetime:
    """Parse an ISO-8601 timestamp into an aware datetime.

    A trailing ``Z`` (as in the ``imported_at`` values shown in this guide) is
    rewritten to ``+00:00`` so older Python versions can parse it. A value
    without any offset is assumed to be UTC.
    """
    parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def generate_daily_digest(library_root: str, output_file: str):
    """Generate digest of recently imported papers.

    Args:
        library_root: Root directory of the paperlib library.
        output_file: Path of the HTML file to write.

    Writes an HTML page that groups the papers imported during the last
    24 hours by category. Nothing is written when there are no such papers.
    """
    # Initialize database
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)

    # Get papers from last 24 hours. Timestamps are compared as aware
    # datetimes: comparing a naive local-time string with a UTC "...Z" string
    # is off by the local UTC offset.
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=1)
    recent_papers = [
        paper
        for paper in database.list_papers()
        if _parse_timestamp(paper["imported_at"]) >= cutoff
    ]

    if not recent_papers:
        print("No new papers imported yesterday.")
        return

    # Group by category
    by_category = defaultdict(list)
    for paper in recent_papers:
        # Papers without categories are still counted above, so list them too.
        categories = json.loads(paper["categories_json"]) or ["Uncategorized"]
        for category in categories:
            by_category[category].append(paper)

    # Generate HTML digest. Titles, IDs and categories come from imported
    # metadata, so they are escaped before being placed into markup.
    title_date = now.astimezone().strftime("%Y-%m-%d")
    parts = [
        "\n<html>\n",
        '<head><meta charset="utf-8">'
        f"<title>Daily Research Digest - {title_date}</title></head>\n",
        "<body>\n",
        "<h1>Daily Research Digest</h1>\n",
        f"<p>Found {len(recent_papers)} new papers</p>\n",
    ]

    for category, papers in by_category.items():
        parts.append(f"<h2>{html.escape(category)}</h2><ul>")
        for paper in papers:
            title = html.escape(paper["title"])
            paper_id = html.escape(paper["paper_id"])
            parts.append(f"<li><strong>{title}</strong> ({paper_id})</li>")
        parts.append("</ul>")

    parts.append("</body></html>")

    # Write output (UTF-8 to match the declared charset)
    Path(output_file).write_text("".join(parts), encoding="utf-8")
    print(f"Digest written to {output_file}")


if __name__ == "__main__":
    generate_daily_digest("/home/user/research", "digest.html")
|
|
```
|
|
|
|
### Literature Review Assistant
|
|
|
|
```python
|
|
"""review_assistant.py - AI-powered literature review helper"""
|
|
|
|
import json
from pathlib import Path

from paperlib.config import LibraryPaths
from paperlib.index import DatabaseManager
from paperlib.models import PaperSummary


class ReviewAssistant:
    """Keyword-based helper for exploring a paperlib library."""

    def __init__(self, library_root: str):
        self.library_paths = LibraryPaths.from_root(library_root)
        self.database = DatabaseManager(self.library_paths)

    def find_related_papers(self, paper_id: str, max_results: int = 10):
        """Find papers related to the given paper.

        Each word of the source paper's title and each of its categories is
        used as a separate search term.

        Args:
            paper_id: ID of the paper to find neighbours for.
            max_results: Maximum number of papers to return.

        Returns:
            Up to ``max_results`` distinct search results, excluding the
            source paper, in the order they were first found. An empty list
            if the paper is unknown or ``max_results`` is not positive.
        """
        if max_results <= 0:
            return []

        # Get source paper metadata
        source_paper = self.database.get_paper(paper_id)
        if not source_paper:
            return []

        # Extract search terms from title and categories
        title_words = source_paper["title"].lower().split()
        categories = json.loads(source_paper["categories_json"])

        # dict.fromkeys() removes repeated terms but keeps their order, so
        # the same query is never run twice.
        search_terms = dict.fromkeys(title_words + categories)

        # Seeding with the source ID excludes the source paper itself.
        seen_ids = {paper_id}
        unique_papers = []

        for term in search_terms:
            for result in self.database.search_papers(term, limit=5):
                if result["paper_id"] in seen_ids:
                    continue
                seen_ids.add(result["paper_id"])
                unique_papers.append(result)
                # Stop as soon as enough papers are collected.
                if len(unique_papers) >= max_results:
                    return unique_papers

        return unique_papers

    def generate_topic_overview(self, topic: str):
        """Generate overview of papers on a specific topic.

        Args:
            topic: Search query describing the topic.

        Returns:
            A plain-text report with the number of matching papers, up to ten
            entities and techniques collected from available summaries, and
            the five most recently imported papers. A "no papers found"
            message if the search returns nothing.
        """
        # Search for papers on topic
        papers = list(self.database.search_papers(topic, limit=50))

        if not papers:
            return f"No papers found for topic: {topic}"

        # Analyze summaries if available
        key_entities = set()
        techniques = set()

        for paper in papers:
            raw_summary_path = paper["summary_json_path"]
            # NOTE(review): presumably empty/None when no summary was
            # generated -- confirm against the database schema.
            if not raw_summary_path:
                continue
            summary_path = Path(raw_summary_path)
            if summary_path.exists():
                summary = PaperSummary.load_from_file(summary_path)
                key_entities.update(summary.entities)
                techniques.update(summary.technique_tags)

        # Generate overview
        lines = [
            "",
            f"Topic: {topic}",
            "",
            f"Papers found: {len(papers)}",
            "",
            "Key entities mentioned:",
            ", ".join(sorted(key_entities)[:10]),
            "",
            "Common techniques:",
            ", ".join(sorted(techniques)[:10]),
            "",
            "Recent papers:",
            "",
        ]

        # Add recent papers
        recent_papers = sorted(papers, key=lambda x: x["imported_at"], reverse=True)[:5]
        lines.extend(
            f"- {paper['title']} ({paper['paper_id']})" for paper in recent_papers
        )

        return "\n".join(lines)


# Usage
assistant = ReviewAssistant("/home/user/research")
overview = assistant.generate_topic_overview("transformer architecture")
print(overview)
|
|
```
|
|
|
|
## Integration Patterns
|
|
|
|
### Pipeline Processing
|
|
|
|
```bash
|
|
# Multi-stage processing pipeline
if ! paperlib import --arxiv 2212.06340 --json > import_result.json; then
    echo "Import failed; see import_result.json" >&2
    exit 1
fi

# `// empty` turns a missing/null paper_id into an empty string instead of
# the literal text "null", so the check below can catch it.
paper_id=$(jq -r '.paper_id // empty' import_result.json)
if [[ -z "$paper_id" ]]; then
    echo "Import response contains no paper_id" >&2
    exit 1
fi

# Convert to markdown
paperlib convert --paper-id "$paper_id"

# Generate summary (when available)
# paperlib summarize --paper-id "$paper_id"

# Update downstream systems
curl -X POST "http://research-db/api/papers" \
    -H "Content-Type: application/json" \
    -d @import_result.json
|
|
```
|
|
|
|
### Event-Driven Architecture
|
|
|
|
```python
|
|
"""event_handler.py - Process paperlib events"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
import pika # RabbitMQ client
|
|
|
|
class PaperLibraryEventHandler:
    """Publish paperlib events as JSON messages to RabbitMQ.

    Messages go to the default exchange with the queue name as routing key.
    NOTE(review): this assumes the ``paper_processing`` and
    ``summary_indexing`` queues already exist on the broker -- confirm, or
    declare them during deployment.

    The handler owns its connection. Call ``close()`` when done, or use the
    instance as a context manager.
    """

    def __init__(self, rabbitmq_url: str):
        self.connection = pika.BlockingConnection(pika.URLParameters(rabbitmq_url))
        self.channel = self.connection.channel()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the RabbitMQ connection if it is still open."""
        if self.connection.is_open:
            self.connection.close()

    def _publish(self, routing_key: str, message: dict):
        """Serialize ``message`` to JSON and publish it to ``routing_key``."""
        self.channel.basic_publish(
            exchange='',
            routing_key=routing_key,
            body=json.dumps(message)
        )

    def on_paper_imported(self, paper_metadata: dict):
        """Handle new paper import.

        Args:
            paper_metadata: Paper metadata; must contain ``paper_id``,
                ``title``, ``categories`` and ``imported_at``.
        """
        message = {
            "event": "paper_imported",
            "paper_id": paper_metadata["paper_id"],
            "title": paper_metadata["title"],
            "categories": paper_metadata["categories"],
            "timestamp": paper_metadata["imported_at"]
        }

        # Send to processing queue
        self._publish('paper_processing', message)

    def on_summary_generated(self, paper_id: str, summary_path: Path):
        """Handle summary generation.

        Args:
            paper_id: ID of the paper the summary belongs to.
            summary_path: Path to the summary JSON file; must contain
                ``problem_tags``, ``technique_tags`` and ``entities``.
        """
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with summary_path.open(encoding="utf-8") as f:
            summary = json.load(f)

        message = {
            "event": "summary_generated",
            "paper_id": paper_id,
            "tags": summary["problem_tags"] + summary["technique_tags"],
            "entities": summary["entities"]
        }

        # Send to indexing service
        self._publish('summary_indexing', message)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
### Error Handling
|
|
|
|
```python
|
|
import subprocess
|
|
import json
|
|
|
|
def safe_paperlib_command(command: list[str]) -> dict:
    """Execute paperlib command with proper error handling.

    Runs ``paperlib <command...> --json`` and never raises for the failures
    handled below; they are reported in the returned dictionary instead.

    Args:
        command: Sub-command and arguments, without the leading ``paperlib``
            and without ``--json`` (for example ``["import", "--arxiv", ID]``).

    Returns:
        The parsed JSON document printed by paperlib on success. On failure,
        a dict with ``"success": False`` and an ``"error"`` message, plus:

        * ``"exit_code"`` when the process exited non-zero, or ``127`` when
          the ``paperlib`` executable could not be found;
        * ``"raw_output"`` when the process succeeded but printed invalid JSON.
    """
    try:
        result = subprocess.run(
            ["paperlib", *command, "--json"],
            capture_output=True,
            text=True,
            check=True
        )
    except FileNotFoundError as e:
        # paperlib is not installed or not on PATH. 127 is the shell's
        # conventional "command not found" status.
        return {
            "success": False,
            "error": f"paperlib executable not found: {e}",
            "exit_code": 127
        }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": e.stderr,
            "exit_code": e.returncode
        }

    # Parsing sits outside the first try so each handler covers only the
    # step that can raise it.
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"Invalid JSON response: {e}",
            "raw_output": result.stdout
        }
|
|
|
|
# Usage
|
|
result = safe_paperlib_command(["import", "--arxiv", "2212.06340"])
|
|
if result.get("success", True): # Assume success if no "success" field
|
|
print(f"Imported paper: {result['paper_id']}")
|
|
else:
|
|
print(f"Import failed: {result['error']}")
|
|
```
|
|
|
|
### Performance Optimization
|
|
|
|
```python
|
|
# Batch operations for better performance
|
|
from paperlib.config import LibraryPaths
from paperlib.index import DatabaseManager
from paperlib.storage import PaperStorageManager


def batch_index_papers(library_root: str, paper_ids: list[str], source_type=None):
    """Index multiple papers efficiently.

    Args:
        library_root: Root directory of the paperlib library.
        paper_ids: IDs of the papers to index.
        source_type: Source type passed to ``load_paper_metadata`` for every
            paper. When omitted, it is taken from the part of each ID before
            the first ``-``.

    Returns:
        The number of papers whose metadata was found and indexed.
    """
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)
    storage = PaperStorageManager(library_paths)

    indexed = 0

    # NOTE(review): _get_connection() is a private method, and this function
    # never passes the connection to index_paper(). Whether the calls below
    # actually share one transaction cannot be confirmed from this guide.
    with database._get_connection():
        for paper_id in paper_ids:
            # Examples in this guide pair the ID "arxiv-2212_06340" with
            # source type "arxiv"; presumably the prefix rule holds for other
            # source types as well -- TODO confirm.
            paper_source = source_type or paper_id.partition("-")[0]
            metadata = storage.load_paper_metadata(paper_id, paper_source)
            if metadata:
                database.index_paper(metadata)
                indexed += 1

    return indexed
|
|
# Automatic commit on context exit
|
|
```
|
|
|
|
### Configuration Management
|
|
|
|
```python
|
|
# config_manager.py - Centralized configuration
|
|
import os
|
|
from pathlib import Path
|
|
|
|
class ConfigManager:
    """Centralized configuration for scripts that drive paperlib.

    Attributes are read from the environment once, at construction time.
    """

    def __init__(self):
        # Always a Path: os.getenv() would return a str when the variable is
        # set and the Path default otherwise. `or` also covers a variable
        # that is set but empty.
        self.library_root = Path(
            os.getenv("PAPERLIB_ROOT") or Path.home() / "research"
        )
        # Values are None when the corresponding variable is not set.
        self.api_keys = {
            "openai": os.getenv("OPENAI_API_KEY"),
            "anthropic": os.getenv("ANTHROPIC_API_KEY")
        }

    def get_library_path(self, name: str = "default") -> str:
        """Get library path by name.

        ``"default"`` resolves to ``library_root``. Any other name resolves
        to ``~/research-<name>``.
        """
        if name == "default":
            return str(self.library_root)
        return str(Path.home() / f"research-{name}")

    def paperlib_command_base(self, library_name: str = "default") -> list[str]:
        """Get base command for paperlib with library.

        Returns ``["paperlib", "--library", <path>]``. Callers append the
        sub-command and its arguments.
        """
        return ["paperlib", "--library", self.get_library_path(library_name)]
|
|
|
|
config = ConfigManager()
|
|
|
|
# Usage in scripts
|
|
import subprocess
|
|
cmd = config.paperlib_command_base("arxiv") + ["list", "--json"]
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
```
|
|
|
|
This integration guide provides the foundation for building sophisticated research workflows on top of paperlib's stable, local-first architecture.