move to pyproject #210

Merged · 2 commits · Jul 15, 2025

1 change: 1 addition & 0 deletions .gitignore
@@ -169,3 +169,4 @@ cython_debug/
.vscode/

scripts/results/
results/
133 changes: 133 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,133 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

OptILLM is an OpenAI API-compatible optimizing inference proxy that implements state-of-the-art techniques to improve the accuracy and performance of LLMs. It focuses on reasoning improvements for coding, logical, and mathematical queries through inference-time compute optimization.

## Core Architecture

### Main Components

1. **Entry Points**:
   - `optillm.py` - Main Flask server with inference routing
   - `optillm/inference.py` - Local inference engine with transformer models
   - Packaging via `pyproject.toml` with console script `optillm = optillm:main`

2. **Optimization Techniques** (`optillm/`):
   - **Reasoning**: `cot_reflection.py`, `plansearch.py`, `leap.py`, `reread.py`
   - **Sampling**: `bon.py` (Best of N), `moa.py` (Mixture of Agents), `self_consistency.py`
   - **Search**: `mcts.py` (Monte Carlo Tree Search), `rstar.py` (R* Algorithm)
   - **Verification**: `pvg.py` (Prover-Verifier Game), `z3_solver.py`
   - **Advanced**: `cepo/` (Cerebras Planning & Optimization), `rto.py` (Round Trip Optimization)

3. **Decoding Techniques**:
   - `cot_decoding.py` - Chain-of-thought decoding without explicit prompting
   - `entropy_decoding.py` - Adaptive sampling based on token uncertainty
   - `thinkdeeper.py` - Reasoning effort scaling
   - `autothink/` - Query complexity classification with steering vectors

4. **Plugin System** (`optillm/plugins/`):
   - `spl/` - System Prompt Learning (a third paradigm of learning)
   - `deepthink/` - Gemini-like deep thinking with inference-time scaling
   - `longcepo/` - Long-context processing with divide-and-conquer
   - `mcp_plugin.py` - Model Context Protocol client
   - `memory_plugin.py` - Short-term memory for unbounded context
   - `privacy_plugin.py` - PII anonymization/deanonymization
   - `executecode_plugin.py` - Code interpreter integration
   - `json_plugin.py` - Structured outputs with the outlines library

## Development Commands

### Installation & Setup
```bash
# Development setup
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

# Package installation
pip install optillm
```

### Running the Server
```bash
# Basic server (auto approach detection)
python optillm.py

# With specific approach
python optillm.py --approach moa --model gpt-4o-mini

# With external endpoint
python optillm.py --base_url http://localhost:8080/v1

# Docker
docker compose up -d
```
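
With the server running, any OpenAI-compatible client can point at it. A minimal sketch, assuming the proxy listens on its default local port (8000 here; adjust to your setup) and that upstream credentials are configured in the proxy's environment:

```python
from openai import OpenAI

# Assumes the proxy is reachable at localhost:8000; adjust if configured otherwise.
client = OpenAI(api_key="dummy-key", base_url="http://localhost:8000/v1")

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
)
print(response.choices[0].message.content)
```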

### Testing
```bash
# Run all approach tests
python test.py

# Test specific approaches
python test.py --approaches moa bon mcts

# Test with specific model/endpoint
python test.py --model gpt-4o-mini --base-url http://localhost:8080/v1

# Single test case
python test.py --single-test "specific_test_name"
```

### Evaluation Scripts
```bash
# Math benchmark evaluation
python scripts/eval_math500_benchmark.py

# AIME benchmark
python scripts/eval_aime_benchmark.py

# Arena Hard Auto evaluation
python scripts/eval_arena_hard_auto_rtc.py

# FRAMES benchmark
python scripts/eval_frames_benchmark.py

# OptILLM benchmark generation/evaluation
python scripts/gen_optillmbench.py
python scripts/eval_optillmbench.py
```

## Usage Patterns

### Approach Selection (Priority Order)
1. **Model prefix**: `moa-gpt-4o-mini` (approach slug + model name)
2. **extra_body field**: `{"optillm_approach": "bon|moa|mcts"}`
3. **Prompt tags**: `<optillm_approach>re2</optillm_approach>` in the system or user prompt (the first two mechanisms are sketched below)
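
A minimal sketch of the first two mechanisms via the OpenAI Python client, assuming a proxy at localhost:8000 as above:

```python
from openai import OpenAI

client = OpenAI(api_key="dummy-key", base_url="http://localhost:8000/v1")

# 1. Approach slug prefixed to the model name
response = client.chat.completions.create(
    model="moa-gpt-4o-mini",
    messages=[{"role": "user", "content": "Solve: 2x + 3 = 11"}],
)

# 2. Approach passed through the request body instead
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Solve: 2x + 3 = 11"}],
    extra_body={"optillm_approach": "bon"},
)
```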

### Approach Combinations
- **Pipeline** (`&`): `cot_reflection&moa` - sequential processing, each approach feeding the next
- **Parallel** (`|`): `bon|moa|mcts` - multiple responses returned as a list (both are sketched below)
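
Combinations ride the same selection mechanisms; a sketch using `extra_body`, assuming the parallel results are surfaced through the response's `choices`:

```python
from openai import OpenAI

client = OpenAI(api_key="dummy-key", base_url="http://localhost:8000/v1")

# Pipeline: cot_reflection runs first, moa refines its output
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
    extra_body={"optillm_approach": "cot_reflection&moa"},
)

# Parallel: bon, moa, and mcts each answer independently
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
    extra_body={"optillm_approach": "bon|moa|mcts"},
)
for choice in response.choices:  # one entry per approach, per the note above
    print(choice.message.content)
```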

### Local Inference
- Set `OPTILLM_API_KEY=optillm` to enable built-in transformer inference
- Supports HuggingFace models with LoRA adapters: `model+lora1+lora2`
- Advanced decoding: `{"decoding": "cot_decoding", "k": 10}` (see the sketch below)
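
A sketch of local inference with an advanced decoding request, assuming `OPTILLM_API_KEY=optillm` was exported before starting the server and that the HuggingFace model named below is available (any other model id works the same way):

```python
from openai import OpenAI

# The "optillm" key routes the request to the built-in transformer engine
client = OpenAI(api_key="optillm", base_url="http://localhost:8000/v1")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",  # LoRA adapters compose as "model+lora1+lora2"
    messages=[{"role": "user", "content": "What is 13 * 17?"}],
    extra_body={"decoding": "cot_decoding", "k": 10},
)
print(response.choices[0].message.content)
```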

### Plugin Configuration
- MCP: `~/.optillm/mcp_config.json` for Model Context Protocol servers
- SPL: Built-in system prompt learning for solving strategies
- Memory: Automatic unbounded context via chunking and retrieval

## Key Concepts

### Inference Optimization
The proxy intercepts OpenAI API calls and applies optimization techniques before forwarding to LLM providers (OpenAI, Cerebras, Azure, LiteLLM). Each technique implements specific reasoning or sampling improvements.

### Plugin Architecture
Plugins extend functionality via standardized interfaces. They can modify requests, process responses, add tools, or provide entirely new capabilities like code execution or structured outputs.

### Multi-Provider Support
Automatically detects and routes to appropriate LLM provider based on environment variables (`OPENAI_API_KEY`, `CEREBRAS_API_KEY`, etc.) with fallback to LiteLLM for broader model support.
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -1 +1,3 @@
include optillm/plugins/*.py
include optillm/cepo/*.py
include optillm/cepo/configs/*.yaml
2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -2,7 +2,7 @@
import os

# Version information
__version__ = "0.1.20"
__version__ = "0.1.21"

# Get the path to the root optillm.py
spec = util.spec_from_file_location(
71 changes: 71 additions & 0 deletions pyproject.toml
@@ -0,0 +1,71 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "optillm"
version = "0.1.21"
description = "An optimizing inference proxy for LLMs."
readme = "README.md"
license = "Apache-2.0"
authors = [
    {name = "codelion", email = "[email protected]"}
]
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]
dependencies = [
    "numpy",
    "networkx",
    "openai",
    "z3-solver",
    "aiohttp",
    "flask",
    "torch",
    "transformers",
    "azure-identity",
    "tiktoken",
    "scikit-learn",
    "litellm",
    "requests",
    "beautifulsoup4",
    "lxml",
    "presidio_analyzer",
    "presidio_anonymizer",
    "nbconvert",
    "nbformat",
    "ipython",
    "ipykernel",
    "peft",
    "bitsandbytes",
    "gradio<5.16.0",
    # Constrain spacy version to avoid blis build issues on ARM64
    "spacy<3.8.0",
    "cerebras_cloud_sdk",
    "outlines[transformers]",
    "sentencepiece",
    "mcp",
    "adaptive-classifier",
    # MLX support for Apple Silicon optimization
    'mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"',
]

[project.urls]
Homepage = "https://github.com/codelion/optillm"
Repository = "https://github.com/codelion/optillm"
Issues = "https://github.com/codelion/optillm/issues"

[project.scripts]
optillm = "optillm:main"

[tool.setuptools.packages.find]
include = ["optillm*"]

[tool.setuptools.package-data]
optillm = [
    "plugins/*.py",
    "cepo/*.py",
    "cepo/configs/*.yaml",
]
69 changes: 0 additions & 69 deletions setup.py

This file was deleted.