docs: add docs
This commit is contained in:
@@ -0,0 +1,638 @@
|
||||
# Integration Guide
|
||||
|
||||
This document describes how to integrate paperlib with higher-level tools and automation workflows.
|
||||
|
||||
## Overview
|
||||
|
||||
paperlib is designed as a **library engine** that higher-level tools can build upon. It provides:
|
||||
|
||||
- **Stable CLI interface** with machine-readable JSON output
|
||||
- **File-based storage** that external tools can read directly
|
||||
- **Python API** for programmatic access
|
||||
- **Event hooks** for workflow integration (planned; not yet available)
|
||||
|
||||
## CLI Integration
|
||||
|
||||
### Machine-Readable Output
|
||||
|
||||
Most paperlib commands support `--json` output for automation:
|
||||
|
||||
```bash
|
||||
# Get library statistics
|
||||
paperlib status --json
|
||||
{
|
||||
"library_root": "/home/user/papers",
|
||||
"total_papers": 42,
|
||||
"by_status": {"converted": 38, "pending": 4},
|
||||
"last_updated": "2024-01-15T10:30:00Z"
|
||||
}
|
||||
|
||||
# List papers with metadata
|
||||
paperlib list --json
|
||||
{
|
||||
"papers": [
|
||||
{
|
||||
"paper_id": "arxiv-2212_06340",
|
||||
"title": "Example Paper",
|
||||
"authors": ["Alice Smith", "Bob Jones"],
|
||||
"categories": ["cs.AI"],
|
||||
"conversion_status": "success",
|
||||
"summary_status": "pending",
|
||||
"imported_at": "2024-01-15T10:30:00Z"
|
||||
}
|
||||
],
|
||||
"total": 1
|
||||
}
|
||||
|
||||
# Import with JSON response
|
||||
paperlib import --arxiv 2212.06340 --json
|
||||
{
|
||||
"success": true,
|
||||
"paper_id": "arxiv-2212_06340",
|
||||
"title": "Example Paper Title",
|
||||
"message": "Successfully imported arXiv paper"
|
||||
}
|
||||
```
|
||||
|
||||
### Exit Codes
|
||||
|
||||
paperlib commands follow standard Unix exit code conventions (0 on success, non-zero on failure):
|
||||
|
||||
```bash
|
||||
# Every command reports its outcome through the exit status.
paperlib import --arxiv 2212.06340
echo "$?"  # 0 for success, 1 for error

# Check if paper exists before processing; only the exit status matters
# here, so stdout and stderr are both discarded.
if ! paperlib show "$paper_id" --json >/dev/null 2>&1; then
    echo "Paper not found"
else
    echo "Paper exists"
fi
|
||||
```
|
||||
|
||||
### Scripting Examples
|
||||
|
||||
#### Daily arXiv Import
|
||||
|
||||
```bash
|
||||
#!/bin/bash
# daily-arxiv.sh - Import papers from daily arXiv feed
#
# Fetches the cs.AI RSS feed, imports every arXiv ID found in it, converts
# the papers that are still pending and prints today's imports as JSON.

LIBRARY="$HOME/research"
ARXIV_FEED_URL="https://export.arxiv.org/rss/cs.AI"

# Parse RSS feed and extract arXiv IDs.
#   curl -f  : fail on HTTP errors instead of piping an error page to grep
#   curl -L  : follow redirects
#   sort -u  : import each ID once even if the feed mentions it repeatedly
curl -fsSL "$ARXIV_FEED_URL" | \
    grep -oP 'arxiv\.org/abs/\K[0-9]{4}\.[0-9]{4,5}' | \
    sort -u | \
    while IFS= read -r arxiv_id; do
        echo "Importing $arxiv_id..."
        # stdin is redirected so the import cannot swallow the remaining IDs;
        # a failed import is reported but does not stop the batch.
        paperlib import --arxiv "$arxiv_id" --library "$LIBRARY" --json </dev/null ||
            echo "Failed to import $arxiv_id" >&2
    done

# Convert newly imported papers
paperlib convert --library "$LIBRARY"

# Generate daily report: today's date is computed once and bound to $today
# instead of being re-evaluated for every paper.
paperlib list --library "$LIBRARY" --json | \
    jq '(now | strftime("%Y-%m-%d")) as $today
        | .papers | map(select(.imported_at | startswith($today)))'
|
||||
```
|
||||
|
||||
#### Batch Processing
|
||||
|
||||
```bash
|
||||
#!/bin/bash
# batch-process.sh - Process multiple papers from a list
#
# Reads one PDF path per line from $PAPER_LIST, imports every file that
# exists, then converts everything that is still pending.

LIBRARY="$HOME/research"
PAPER_LIST="papers.txt"

# `|| [[ -n "$pdf_path" ]]` keeps a last line that has no trailing newline.
while IFS= read -r pdf_path || [[ -n "$pdf_path" ]]; do
    # Ignore blank lines in the list.
    [[ -z "$pdf_path" ]] && continue

    if [[ ! -f "$pdf_path" ]]; then
        echo "Skipping missing file: $pdf_path" >&2
        continue
    fi

    echo "Importing $pdf_path..."
    # Test the command directly: a separate `$?` check breaks as soon as any
    # statement is inserted between the command and the test. stdin is
    # redirected so the import cannot consume the rest of the list.
    if result=$(paperlib import --pdf "$pdf_path" --library "$LIBRARY" --json </dev/null); then
        paper_id=$(echo "$result" | jq -r '.paper_id')
        echo "Successfully imported as $paper_id"
    else
        echo "Failed to import $pdf_path"
    fi
done < "$PAPER_LIST"

# Convert all pending papers
paperlib convert --library "$LIBRARY"
|
||||
```
|
||||
|
||||
## Python API
|
||||
|
||||
### Direct Library Access
|
||||
|
||||
```python
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.storage import PaperStorageManager
|
||||
from paperlib.index import DatabaseManager
|
||||
from paperlib.importer import ArxivImporter, LocalImporter
|
||||
|
||||
# Wire up the library components for one library root
library_paths = LibraryPaths.from_root("/path/to/library")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)
database.initialize_database()

# Fetch a single arXiv paper and register it in the index
arxiv_importer = ArxivImporter(storage)
metadata = arxiv_importer.import_arxiv_paper("2212.06340")
database.index_paper(metadata)

# Run a search, then load the stored metadata for each hit
results = list(database.search_papers("neural networks"))
for result in results:
    paper = storage.load_paper_metadata(
        result["paper_id"],
        result["source_type"],
    )
    print(f"{paper.title} by {', '.join(paper.authors)}")

# Report library-wide statistics
stats = database.get_statistics()
print(f"Total papers: {stats['total_papers']}")
|
||||
```
|
||||
|
||||
### Metadata Processing
|
||||
|
||||
```python
|
||||
import json
|
||||
from pathlib import Path
|
||||
from paperlib.models import PaperMetadata, PaperSummary
|
||||
|
||||
# Walk every paper directory below the library's papers/ folder
papers_dir = Path("/home/user/papers/papers")

for meta_file in papers_dir.rglob("meta.json"):
    metadata = PaperMetadata.load_from_file(meta_file)

    # Only papers with a summary.json next to meta.json are reported
    summary_path = meta_file.parent / "summary.json"
    if not summary_path.exists():
        continue

    summary = PaperSummary.load_from_file(summary_path)

    # Merge both tag lists and pick up the extracted entities
    tags = summary.problem_tags + summary.technique_tags
    entities = summary.entities

    print(f"Paper: {metadata.title}")
    print(f"Tags: {', '.join(tags)}")
    print(f"Entities: {', '.join(entities)}")
|
||||
```
|
||||
|
||||
## File System Integration
|
||||
|
||||
### Direct File Access
|
||||
|
||||
Since paperlib uses a documented file layout, tools can read data directly:
|
||||
|
||||
```python
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def scan_library(library_root: Path):
    """Scan library and extract metadata.

    Recursively collects every ``meta.json`` below ``<library_root>/papers``
    and returns the decoded JSON documents.

    Args:
        library_root: Root directory of the library.

    Returns:
        A list with one decoded JSON value per ``meta.json`` file, ordered by
        file path. Empty when no metadata file is found.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    papers = []

    # sorted() makes the result order reproducible; glob order is arbitrary.
    for meta_file in sorted(library_root.glob("papers/**/meta.json")):
        # Read as UTF-8 explicitly: the platform default encoding can garble
        # non-ASCII titles and author names.
        with meta_file.open(encoding="utf-8") as f:
            papers.append(json.load(f))

    return papers
|
||||
|
||||
def find_papers_by_category(library_root: Path, category: str):
    """Find papers in a specific category.

    Args:
        library_root: Root directory of the library.
        category: Category label to look for, e.g. ``"cs.AI"``.

    Returns:
        The decoded ``meta.json`` documents whose ``"categories"`` list
        contains ``category``, ordered by file path.

    Raises:
        json.JSONDecodeError: If a ``meta.json`` file is not valid JSON.
    """
    matching_papers = []

    for meta_file in sorted(library_root.glob("papers/**/meta.json")):
        # Read as UTF-8 explicitly rather than with the platform default.
        with meta_file.open(encoding="utf-8") as f:
            metadata = json.load(f)

        # `or []` also covers an explicit `"categories": null`, which
        # `.get("categories", [])` would hand to `in` as None and crash on.
        if category in (metadata.get("categories") or []):
            matching_papers.append(metadata)

    return matching_papers
|
||||
```
|
||||
|
||||
### Watch for Changes
|
||||
|
||||
```python
|
||||
import time
|
||||
from pathlib import Path
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
|
||||
class PaperLibraryHandler(FileSystemEventHandler):
    """React to metadata and summary files appearing in a paperlib library."""

    def __init__(self, library_root):
        super().__init__()
        self.library_root = Path(library_root)

    def on_created(self, event):
        """Start the processing workflow when a new ``meta.json`` appears."""
        # Compare the file name exactly: endswith() would also fire for names
        # such as "old_meta.json". Directory events never qualify.
        if event.is_directory or Path(event.src_path).name != "meta.json":
            return

        print(f"New paper imported: {event.src_path}")
        # Trigger processing workflow
        self.process_new_paper(event.src_path)

    def on_modified(self, event):
        """Report changes to a paper's ``summary.json``."""
        if event.is_directory or Path(event.src_path).name != "summary.json":
            return

        print(f"Summary updated: {event.src_path}")
        # Update downstream systems

    def process_new_paper(self, meta_path):
        """Handle newly imported paper.

        Returns:
            The decoded metadata, or ``None`` when the file could not be read
            or parsed.
        """
        # Imported here because the example's import block does not bring
        # `json` into scope.
        import json

        # Load metadata. The "created" event can arrive before the writer has
        # finished, so an unreadable or incomplete file is reported instead
        # of raising inside the observer thread.
        try:
            with open(meta_path, encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            print(f"Could not read metadata from {meta_path}: {exc}")
            return None

        # Trigger downstream processing
        # - Send to processing queue
        # - Update knowledge base
        # - Generate notifications
        return metadata
|
||||
|
||||
# Watch library for changes
observer = Observer()
handler = PaperLibraryHandler("/home/user/papers")
observer.schedule(handler, "/home/user/papers/papers", recursive=True)
observer.start()

# start() returns immediately, so keep the main thread alive until the user
# interrupts it, then shut the observer down and wait for it to finish.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    observer.stop()
    observer.join()
|
||||
```
|
||||
|
||||
## Higher-Level Tool Examples
|
||||
|
||||
### Research Dashboard
|
||||
|
||||
```python
|
||||
"""research_dashboard.py - Web dashboard for research library"""
|
||||
|
||||
from flask import Flask, jsonify, render_template
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.storage import PaperStorageManager
|
||||
from paperlib.index import DatabaseManager
|
||||
|
||||
app = Flask(__name__)

# Initialize paperlib components
library_paths = LibraryPaths.from_root("/home/user/research")
storage = PaperStorageManager(library_paths)
database = DatabaseManager(library_paths)


@app.route('/api/papers')
def list_papers():
    """List all papers with metadata (at most 50)."""
    papers = list(database.list_papers(limit=50))
    return jsonify(papers)


# The `path` converter lets queries that contain "/" reach the view; the
# default string converter answers such URLs with 404.
@app.route('/api/search/<path:query>')
def search_papers(query):
    """Search papers by query (at most 20 results)."""
    results = list(database.search_papers(query, limit=20))
    return jsonify(results)


@app.route('/api/stats')
def library_stats():
    """Get library statistics."""
    stats = database.get_statistics()
    return jsonify(stats)


@app.route('/')
def dashboard():
    """Main dashboard page."""
    return render_template('dashboard.html')


if __name__ == '__main__':
    import os

    # Debug mode exposes an interactive debugger that can execute arbitrary
    # code, so it is enabled only on explicit request (FLASK_DEBUG=1) rather
    # than unconditionally.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")
|
||||
```
|
||||
|
||||
### Daily Digest Generator
|
||||
|
||||
```python
|
||||
"""daily_digest.py - Generate daily research digest"""
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.index import DatabaseManager
|
||||
|
||||
def generate_daily_digest(library_root: str, output_file: str):
    """Generate digest of recently imported papers.

    Collects the papers whose ``imported_at`` timestamp lies within the last
    24 hours, groups them by category and writes a small HTML page.

    Args:
        library_root: Root directory of the paperlib library.
        output_file: Path of the HTML file to write (UTF-8).

    Returns:
        None. Nothing is written when no paper was imported in the window.
    """
    # Imported here because the example's import block does not provide them.
    from datetime import timezone
    from html import escape

    def parse_timestamp(value):
        """Parse an ISO-8601 string into a timezone-aware datetime."""
        # fromisoformat() accepts a trailing "Z" only on newer Python
        # versions, so it is rewritten as an explicit UTC offset.
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # NOTE(review): naive timestamps are assumed to be UTC, matching
            # the "...Z" values shown elsewhere in this guide — confirm.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # Initialize database
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)

    # Get papers from last 24 hours. Timestamps are compared as aware
    # datetimes: comparing a naive local-time string with a UTC "...Z" string
    # is off by the local UTC offset.
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    recent_papers = [
        paper
        for paper in database.list_papers()
        if parse_timestamp(paper["imported_at"]) >= cutoff
    ]

    if not recent_papers:
        print("No new papers imported in the last 24 hours.")
        return

    # Group by category. Papers without any category get their own heading so
    # the listing matches the "Found N new papers" count.
    by_category = {}
    for paper in recent_papers:
        categories = json.loads(paper["categories_json"]) or ["Uncategorized"]
        for category in categories:
            by_category.setdefault(category, []).append(paper)

    # Generate HTML digest. Every value taken from paper metadata is escaped
    # so that characters such as "<" or "&" in a title cannot break the page.
    today = datetime.now().strftime('%Y-%m-%d')
    parts = [
        "<html>",
        f'<head><meta charset="utf-8"><title>Daily Research Digest - {today}</title></head>',
        "<body>",
        "<h1>Daily Research Digest</h1>",
        f"<p>Found {len(recent_papers)} new papers</p>",
    ]
    for category, papers in by_category.items():
        parts.append(f"<h2>{escape(category)}</h2><ul>")
        parts.extend(
            f'<li><strong>{escape(paper["title"])}</strong> ({escape(paper["paper_id"])})</li>'
            for paper in papers
        )
        parts.append("</ul>")
    parts.append("</body></html>")

    # Write output
    Path(output_file).write_text("\n".join(parts), encoding="utf-8")
    print(f"Digest written to {output_file}")


if __name__ == "__main__":
    generate_daily_digest("/home/user/research", "digest.html")
|
||||
```
|
||||
|
||||
### Literature Review Assistant
|
||||
|
||||
```python
|
||||
"""review_assistant.py - AI-powered literature review helper"""
|
||||
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.index import DatabaseManager
|
||||
from paperlib.models import PaperSummary
|
||||
|
||||
class ReviewAssistant:
    """Keyword-based helper for exploring related papers in a library."""

    def __init__(self, library_root: str):
        self.library_paths = LibraryPaths.from_root(library_root)
        self.database = DatabaseManager(self.library_paths)

    def find_related_papers(self, paper_id: str, max_results: int = 10):
        """Find papers related to the given paper.

        Each word of the source paper's title and each of its categories is
        used as a search term; hits are returned in first-seen order.

        Args:
            paper_id: ID of the paper to start from.
            max_results: Maximum number of related papers to return.

        Returns:
            Up to ``max_results`` search results, without duplicates and
            without the source paper. Empty when the paper is unknown or
            ``max_results`` is not positive.
        """
        # Imported here because the example's import block does not bring
        # `json` into scope.
        import json

        # Get source paper metadata
        source_paper = self.database.get_paper(paper_id)
        if not source_paper or max_results <= 0:
            return []

        # Extract search terms from title and categories
        title_words = source_paper["title"].lower().split()
        categories = json.loads(source_paper["categories_json"])

        # dict.fromkeys() drops repeated terms while keeping their order, so
        # the same query is not sent twice.
        search_terms = list(dict.fromkeys(title_words + categories))

        # Seeding the seen set with the source paper excludes it from results.
        seen_ids = {paper_id}
        unique_papers = []
        for term in search_terms:
            for result in list(self.database.search_papers(term, limit=5)):
                if result["paper_id"] in seen_ids:
                    continue
                seen_ids.add(result["paper_id"])
                unique_papers.append(result)
                # Stop as soon as enough papers are collected.
                if len(unique_papers) >= max_results:
                    return unique_papers

        return unique_papers

    def generate_topic_overview(self, topic: str):
        """Generate overview of papers on a specific topic.

        Args:
            topic: Search query describing the topic.

        Returns:
            A plain-text overview listing entities, techniques and the five
            most recently imported matches, or a "no papers found" message.
        """
        # Imported here because the example's import block does not bring
        # `Path` into scope.
        from pathlib import Path

        # Search for papers on topic
        papers = list(self.database.search_papers(topic, limit=50))

        if not papers:
            return f"No papers found for topic: {topic}"

        # Analyze summaries if available
        key_entities = set()
        techniques = set()

        for paper in papers:
            # An empty or null path (presumably a paper without a summary)
            # would make Path() raise, so such papers are skipped.
            raw_path = paper["summary_json_path"]
            if not raw_path:
                continue
            summary_path = Path(raw_path)
            if summary_path.exists():
                summary = PaperSummary.load_from_file(summary_path)
                key_entities.update(summary.entities)
                techniques.update(summary.technique_tags)

        # Generate overview
        overview = "\n".join([
            "",
            f"Topic: {topic}",
            "",
            f"Papers found: {len(papers)}",
            "",
            "Key entities mentioned:",
            ', '.join(sorted(key_entities)[:10]),
            "",
            "Common techniques:",
            ', '.join(sorted(techniques)[:10]),
            "",
            "Recent papers:",
            "",
        ])

        # Add recent papers
        recent_papers = sorted(papers, key=lambda x: x["imported_at"], reverse=True)[:5]
        for paper in recent_papers:
            overview += f"\n- {paper['title']} ({paper['paper_id']})"

        return overview


# Usage
assistant = ReviewAssistant("/home/user/research")
overview = assistant.generate_topic_overview("transformer architecture")
print(overview)
|
||||
```
|
||||
|
||||
## Integration Patterns
|
||||
|
||||
### Pipeline Processing
|
||||
|
||||
```bash
|
||||
# Multi-stage processing pipeline
# Stop at the first failing stage instead of feeding bad data downstream.
set -euo pipefail

paperlib import --arxiv 2212.06340 --json > import_result.json
# -e makes jq exit non-zero when .paper_id is missing or null, so a failed
# import never reaches the later stages as the literal string "null".
paper_id=$(jq -er '.paper_id' import_result.json)

# Convert to markdown
paperlib convert --paper-id "$paper_id"

# Generate summary (when available)
# paperlib summarize --paper-id "$paper_id"

# Update downstream systems (-f: treat HTTP error responses as failures)
curl -fsS -X POST "http://research-db/api/papers" \
    -H "Content-Type: application/json" \
    -d @import_result.json
|
||||
```
|
||||
|
||||
### Event-Driven Architecture
|
||||
|
||||
```python
|
||||
"""event_handler.py - Process paperlib events"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
import pika # RabbitMQ client
|
||||
|
||||
class PaperLibraryEventHandler:
    """Publish paperlib events to RabbitMQ queues."""

    # Queues this handler publishes to.
    IMPORT_QUEUE = 'paper_processing'
    SUMMARY_QUEUE = 'summary_indexing'

    def __init__(self, rabbitmq_url: str, *, durable_queues: bool = True):
        """Connect to RabbitMQ and make sure both target queues exist.

        Args:
            rabbitmq_url: AMQP URL of the broker.
            durable_queues: Whether the queues are declared durable and the
                messages marked persistent. Must match the settings of queues
                that already exist on the broker.
        """
        self.connection = pika.BlockingConnection(pika.URLParameters(rabbitmq_url))
        self.channel = self.connection.channel()
        self._durable = durable_queues

        # A message published through the default exchange to a queue that
        # does not exist is dropped, so declare both queues up front.
        for queue in (self.IMPORT_QUEUE, self.SUMMARY_QUEUE):
            self.channel.queue_declare(queue=queue, durable=durable_queues)

    def close(self):
        """Close the broker connection if it is still open."""
        if self.connection.is_open:
            self.connection.close()

    def _publish(self, queue: str, message: dict):
        """Serialize ``message`` as JSON and send it to ``queue``."""
        # delivery_mode=2 marks the message as persistent.
        properties = pika.BasicProperties(
            content_type='application/json',
            delivery_mode=2 if self._durable else 1,
        )
        self.channel.basic_publish(
            exchange='',
            routing_key=queue,
            body=json.dumps(message),
            properties=properties,
        )

    def on_paper_imported(self, paper_metadata: dict):
        """Handle new paper import."""
        message = {
            "event": "paper_imported",
            "paper_id": paper_metadata["paper_id"],
            "title": paper_metadata["title"],
            "categories": paper_metadata["categories"],
            "timestamp": paper_metadata["imported_at"]
        }

        # Send to processing queue
        self._publish(self.IMPORT_QUEUE, message)

    def on_summary_generated(self, paper_id: str, summary_path: Path):
        """Handle summary generation."""
        # Read as UTF-8 explicitly rather than with the platform default.
        with summary_path.open(encoding="utf-8") as f:
            summary = json.load(f)

        message = {
            "event": "summary_generated",
            "paper_id": paper_id,
            "tags": summary["problem_tags"] + summary["technique_tags"],
            "entities": summary["entities"]
        }

        # Send to indexing service
        self._publish(self.SUMMARY_QUEUE, message)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
def safe_paperlib_command(command: list[str]) -> dict:
    """Execute paperlib command with proper error handling.

    Runs ``paperlib <command...> --json`` and decodes its standard output.

    Args:
        command: Sub-command and arguments, without the ``paperlib`` prefix
            and without ``--json``.

    Returns:
        The decoded JSON response on success. On failure, a dict with
        ``"success": False`` and an ``"error"`` description, plus
        ``"exit_code"`` when the process could not be run or exited non-zero,
        or ``"raw_output"`` when its output was not valid JSON.
    """
    try:
        result = subprocess.run(
            ["paperlib", *command, "--json"],
            capture_output=True,
            text=True,
            check=True,
        )
    except OSError as e:
        # The executable is missing or not runnable; without this branch the
        # exception escapes despite the function's error-handling contract.
        return {
            "success": False,
            "error": f"Could not run paperlib: {e}",
            "exit_code": None,
        }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": e.stderr,
            "exit_code": e.returncode,
        }

    # Decoding is kept outside the first try so `result` is always bound here.
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"Invalid JSON response: {e}",
            "raw_output": result.stdout,
        }
|
||||
|
||||
# Usage
result = safe_paperlib_command(["import", "--arxiv", "2212.06340"])
# A response without a "success" field is treated as a success.
if not result.get("success", True):
    print(f"Import failed: {result['error']}")
else:
    print(f"Imported paper: {result['paper_id']}")
|
||||
```
|
||||
|
||||
### Performance Optimization
|
||||
|
||||
```python
|
||||
# Batch operations for better performance
|
||||
from paperlib.index import DatabaseManager
|
||||
|
||||
def batch_index_papers(library_root: str, paper_ids: list[str], source_type: str = "arxiv"):
    """Index multiple papers efficiently.

    Args:
        library_root: Root directory of the paperlib library.
        paper_ids: IDs of the papers to (re-)index.
        source_type: Source type passed to ``load_paper_metadata`` for every
            paper. The original example used this name without defining it.
            NOTE(review): the default "arxiv" is an assumption — confirm the
            valid values against the storage layer.
    """
    # Imported here because the example only imports DatabaseManager.
    from paperlib.config import LibraryPaths
    from paperlib.storage import PaperStorageManager

    # Resolve the library layout once and share it between both components.
    library_paths = LibraryPaths.from_root(library_root)
    database = DatabaseManager(library_paths)
    storage = PaperStorageManager(library_paths)

    # Begin transaction for batch insert.
    # NOTE(review): _get_connection() is a private API and index_paper() is
    # not handed this connection; whether the inserts really share one
    # transaction depends on DatabaseManager internals — confirm.
    with database._get_connection():
        for paper_id in paper_ids:
            metadata = storage.load_paper_metadata(paper_id, source_type)
            # Papers whose metadata cannot be loaded are skipped.
            if metadata:
                database.index_paper(metadata)
|
||||
```
|
||||
|
||||
### Configuration Management
|
||||
|
||||
```python
|
||||
# config_manager.py - Centralized configuration
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class ConfigManager:
    """Central lookup for library locations and API keys."""

    def __init__(self):
        # os.getenv() returns a str while the fallback is a Path; normalize
        # to Path so the attribute has one type either way. `or` also treats
        # an empty PAPERLIB_ROOT as unset.
        self.library_root = Path(
            os.getenv("PAPERLIB_ROOT") or Path.home() / "research"
        ).expanduser()
        # Keys are None when the corresponding variable is not set.
        self.api_keys = {
            "openai": os.getenv("OPENAI_API_KEY"),
            "anthropic": os.getenv("ANTHROPIC_API_KEY")
        }

    def get_library_path(self, name: str = "default") -> str:
        """Get library path by name.

        ``"default"`` maps to ``library_root``; any other name maps to
        ``~/research-<name>``.
        """
        if name == "default":
            return str(self.library_root)
        return str(Path.home() / f"research-{name}")

    def paperlib_command_base(self, library_name: str = "default") -> list[str]:
        """Get base command for paperlib with library.

        NOTE(review): this places ``--library`` before the sub-command, while
        the shell examples in this guide pass it after the sub-command —
        confirm that the CLI accepts both positions.
        """
        return ["paperlib", "--library", self.get_library_path(library_name)]
|
||||
|
||||
config = ConfigManager()

# Usage in scripts: extend the base command with the sub-command to run
import subprocess

cmd = [*config.paperlib_command_base("arxiv"), "list", "--json"]
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
```
|
||||
|
||||
This integration guide provides the foundation for building sophisticated research workflows on top of paperlib's stable, local-first architecture.
|
||||
Reference in New Issue
Block a user