Command Reference¶
Quick reference for common PBI operations and useful commands.
Pipeline Execution¶
Basic Execution¶
# Run full pipeline (first run: use 2-4 cores due to I/O)
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda --printshellcmds
# Run with caching (recommended for development)
snakemake --directory workflow --snakefile workflow/Snakefile \
--cache --cores 4 --use-conda --printshellcmds
# Keep temporary files (useful for debugging)
snakemake --directory workflow --snakefile workflow/Snakefile \
--notemp --cores 4 --use-conda --printshellcmds
# Dry run (see what would execute)
snakemake --directory workflow --snakefile workflow/Snakefile -n
# Use all available cores
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores all --use-conda
Specific Targets¶
# Create database only
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda \
../data/databases/phage_database.duckdb
# Create optimized database
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda \
../data/databases/phage_database_optimized.duckdb
# Generate validation reports only
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda \
reports/database_validation.html
Workflow Visualization¶
# Generate DAG (Directed Acyclic Graph) visualization
cd workflow
snakemake --dag | dot -Tsvg > dag/workflow.svg
# Generate rulegraph (simplified)
snakemake --rulegraph | dot -Tsvg > dag/rulegraph.svg
# View in browser
xdg-open dag/workflow.svg # Linux
open dag/workflow.svg # macOS
Cleanup¶
# Remove temporary files after execution
snakemake --directory workflow --snakefile workflow/Snakefile --delete-temp-output
# Clean all generated files (CAREFUL!)
snakemake --directory workflow --snakefile workflow/Snakefile --delete-all-output
# Remove conda environments
rm -rf workflow/.snakemake/conda/
# Remove Snakemake metadata
rm -rf workflow/.snakemake/
Docker Commands¶
Pipeline¶
# Build pipeline image
docker compose build pipeline
# Run pipeline
docker compose run --rm pipeline
# View pipeline logs
docker compose logs pipeline
# Run pipeline with custom cores
docker compose run --rm pipeline snakemake --cores 2 --use-conda
API¶
# Build API image
docker compose build api
# Start API (detached)
docker compose up -d api
# Start API (foreground with logs)
docker compose up api
# View API logs
docker compose logs -f api
# Stop API
docker compose down
# Restart API
docker compose restart api
Data Access¶
# List database files
docker run --rm -v pbi-data:/data alpine ls -lh /data/processed/databases/
# List reports
docker run --rm -v pbi-data:/data alpine ls -lh /data/processed/reports/
# Copy database to host
docker run --rm -v pbi-data:/data -v "$(pwd)":/backup alpine \
cp /data/processed/databases/phage_database_optimized.duckdb /backup/
# Copy reports to host
docker run --rm -v pbi-data:/data -v "$(pwd)":/backup alpine \
cp -r /data/processed/reports /backup/
# Copy specific report
docker run --rm -v pbi-data:/data -v "$(pwd)":/backup alpine \
cp /data/processed/reports/database_validation.html /backup/
Volume Management¶
# List volumes
docker volume ls
# Inspect volume
docker volume inspect pbi-data
# Remove cache volume (keeps data)
docker compose down
docker volume rm pbi-cache
# Remove all volumes (CAREFUL - deletes all data!)
docker compose down -v
# Clean up using provided script
./cleanup_cache.sh
Troubleshooting¶
# Check running containers
docker ps
# Check all containers (including stopped)
docker ps -a
# View container resource usage
docker stats
# Clean up Docker system (CAREFUL - removes all unused images, containers, and volumes!)
docker system prune -a --volumes
# Rebuild without cache
docker compose build --no-cache pipeline
docker compose build --no-cache api
Database Operations¶
DuckDB CLI¶
# Connect to database
duckdb data/databases/phage_database_optimized.duckdb
# Inside DuckDB:
# Show all tables
.tables
# Describe table schema
DESCRIBE fact_phages;
# Show table row count
SELECT COUNT(*) FROM fact_phages;
# Exit
.quit
Python/DuckDB¶
import duckdb
# Connect to database
conn = duckdb.connect('data/databases/phage_database_optimized.duckdb')
# Simple query
result = conn.execute("SELECT COUNT(*) FROM fact_phages").fetchone()
print(f"Total phages: {result[0]:,}")
# Query to DataFrame
df = conn.execute("""
SELECT Source_DB, COUNT(*) as count
FROM fact_phages
GROUP BY Source_DB
""").df()
print(df)
# Close connection
conn.close()
Common Queries¶
-- Count phages by source
SELECT Source_DB, COUNT(*) as count
FROM fact_phages
GROUP BY Source_DB
ORDER BY count DESC;
-- Get phages by host
SELECT Phage_ID, Host, Length, GC_content
FROM fact_phages
WHERE Host LIKE '%Escherichia%'
LIMIT 10;
-- Find large phages
SELECT Phage_ID, Length, Host, Lifestyle
FROM fact_phages
WHERE Length > 200000
ORDER BY Length DESC
LIMIT 20;
-- Phage with protein count
SELECT
f.Phage_ID,
f.Length,
f.Host,
COUNT(p.Protein_ID) as protein_count
FROM fact_phages f
LEFT JOIN dim_proteins p ON f.Phage_ID = p.Phage_ID
GROUP BY f.Phage_ID, f.Length, f.Host
ORDER BY protein_count DESC
LIMIT 10;
-- tRNA type distribution
SELECT
trna_type,
COUNT(*) as count
FROM dim_trna_tmrna
WHERE trna_type IS NOT NULL
GROUP BY trna_type
ORDER BY count DESC;
Environment Setup¶
Conda¶
# Create environment
conda create -n pbi python=3.10
# Activate environment
conda activate pbi
# Deactivate environment
conda deactivate
# Remove environment
conda env remove -n pbi
# Export environment
conda env export > environment.yml
# Create from file
conda env create -f environment.yml
PBI Package¶
# Install in development mode
pip install -e .
# Install with specific extras
pip install -e ".[dev]"
# Verify installation
python -c "import pbi; print(pbi.__version__)"
# Uninstall
pip uninstall pbi
Snakemake Cache¶
# Create cache directory
mkdir -p /mnt/snakemake-cache
# Set cache location (add to ~/.bashrc for persistence)
export SNAKEMAKE_OUTPUT_CACHE=/mnt/snakemake-cache/
# Verify cache is set
echo $SNAKEMAKE_OUTPUT_CACHE
API Operations¶
Starting API¶
# Local (development with auto-reload)
cd api
uvicorn app:app --reload --host 0.0.0.0 --port 8000
# Local (production)
cd api
uvicorn app:app --host 0.0.0.0 --port 8000 --workers 4
# Docker
docker compose up -d api
Testing API¶
# Health check
curl http://localhost:8000/health
# Get statistics
curl http://localhost:8000/stats
# Custom query
curl -X POST http://localhost:8000/query \
-H "Content-Type: application/json" \
-d '{"query": "SELECT COUNT(*) FROM fact_phages"}'
# Get phages
curl -X POST http://localhost:8000/phages \
-H "Content-Type: application/json" \
-d '{"phage_ids": ["NC_000866"]}'
# Export to FASTA
curl -X POST http://localhost:8000/phages/fasta \
-H "Content-Type: application/json" \
-d '{"query": "SELECT Phage_ID FROM fact_phages LIMIT 5"}' \
> phages.fasta
File Operations¶
Check File Sizes¶
# Database size
du -h data/databases/phage_database_optimized.duckdb
# All databases
du -sh data/databases/*
# Sequence files
du -sh data/sequences/*
# Total data directory size
du -sh data/
Find Files¶
# Find all FASTA files
find data/ -name "*.fasta"
# Find all databases
find data/ -name "*.duckdb"
# Find all reports
find workflow/reports/ -name "*.html"
# Find large files (>1GB)
find data/ -type f -size +1G
Git Operations¶
# Check what would be committed
git status
# View changes
git diff
# View file history
git log --oneline -- path/to/file
# Discard changes to specific file
git checkout -- path/to/file
# Pull latest changes
git pull origin main
# Create feature branch
git checkout -b feature/my-feature
Jupyter¶
# Start Jupyter Lab
jupyter lab
# Start on specific port
jupyter lab --port 8889
# Start without browser
jupyter lab --no-browser
# List running servers
jupyter server list
# Stop server running on port 8888
jupyter server stop 8888
Process Management¶
# Find process by port
lsof -i :8000
# Kill process by PID
kill <PID>
# Kill by name (use with caution)
pkill -f "uvicorn"
# Monitor system resources
htop
# Check disk space
df -h
# Check memory usage
free -h
Debugging¶
Snakemake Debug Mode¶
# Verbose output
snakemake --directory workflow --snakefile workflow/Snakefile \
-v --printshellcmds --cores 4 --use-conda
# Keep going despite errors
snakemake --directory workflow --snakefile workflow/Snakefile \
--keep-going --cores 4 --use-conda
# Print execution reason
snakemake --directory workflow --snakefile workflow/Snakefile \
-p -r --cores 4 --use-conda
Python Debugging¶
# Enable debugging
import logging
logging.basicConfig(level=logging.DEBUG)
# In code
import pdb; pdb.set_trace() # Set breakpoint
# Better debugging with ipdb
import ipdb; ipdb.set_trace()
Docker Debugging¶
# Interactive shell in container
docker compose run --rm pipeline /bin/bash
# Check container logs
docker compose logs --tail 100 api
# Inspect running container
docker exec -it <container_id> /bin/bash
# View container processes
docker top <container_id>
Performance Monitoring¶
# Monitor pipeline execution
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda --printshellcmds \
--benchmark-repeats 3
# Time command execution
time snakemake --directory workflow --snakefile workflow/Snakefile --cores 4
# Profile Python script
python -m cProfile -o output.prof script.py
# View profile
python -m pstats output.prof
Quick Workflows¶
Complete Fresh Installation¶
# 1. Clone repository
git clone https://github.com/ThibaultSchowing/PBI.git
cd PBI
# 2. Create conda environment
conda create -n pbi python=3.10
conda activate pbi
# 3. Install PBI package
pip install -e .
# 4. Run pipeline
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda --printshellcmds
# 5. Verify
ls -lh data/databases/
ls -lh workflow/reports/
Docker Fresh Start¶
# 1. Build images
docker compose build
# 2. Run pipeline
docker compose run --rm pipeline
# 3. Start API
docker compose up -d api
# 4. Test
curl http://localhost:8000/health
Update Existing Installation¶
# 1. Pull latest code
git pull origin main
# 2. Update package
pip install -e . --upgrade
# 3. Re-run pipeline
snakemake --directory workflow --snakefile workflow/Snakefile \
--cores 4 --use-conda
# 4. Verify
curl http://localhost:8000/stats
Tip: Bookmark this page for quick reference when working with PBI!