Add majority voting plugin for candidate selection #208

Merged
merged 6 commits on Jul 10, 2025
2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -2,7 +2,7 @@
import os

# Version information
__version__ = "0.1.19"
__version__ = "0.1.20"

# Get the path to the root optillm.py
spec = util.spec_from_file_location(
49 changes: 39 additions & 10 deletions optillm/bon.py
@@ -10,16 +10,45 @@ def best_of_n_sampling(system_prompt: str, initial_query: str, client, model: st

completions = []

response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
n=n,
temperature=1
)
completions = [choice.message.content for choice in response.choices]
logger.info(f"Generated {len(completions)} initial completions. Tokens used: {response.usage.completion_tokens}")
bon_completion_tokens += response.usage.completion_tokens
try:
# Try to generate n completions in a single API call using n parameter
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
n=n,
temperature=1
)
completions = [choice.message.content for choice in response.choices]
logger.info(f"Generated {len(completions)} initial completions using n parameter. Tokens used: {response.usage.completion_tokens}")
bon_completion_tokens += response.usage.completion_tokens

except Exception as e:
logger.warning(f"n parameter not supported by provider: {str(e)}")
logger.info(f"Falling back to generating {n} completions one by one")

# Fallback: Generate completions one by one in a loop
for i in range(n):
try:
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
temperature=1
)
completions.append(response.choices[0].message.content)
bon_completion_tokens += response.usage.completion_tokens
logger.debug(f"Generated completion {i+1}/{n}")

except Exception as fallback_error:
logger.error(f"Error generating completion {i+1}: {str(fallback_error)}")
continue

if not completions:
logger.error("Failed to generate any completions")
return "Error: Could not generate any completions", 0

logger.info(f"Generated {len(completions)} completions using fallback method. Total tokens used: {bon_completion_tokens}")

# Rate the completions
rating_messages = messages.copy()
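Note on the optillm/bon.py change above: the single request that relied on the provider honoring the n parameter is now wrapped in a try/except, with a per-completion fallback loop for providers that reject n. A minimal, self-contained sketch of the same pattern, assuming only an OpenAI-compatible client object (the helper name and defaults below are illustrative, not part of the PR):

from typing import List

def generate_candidates(client, model: str, messages: list, n: int, max_tokens: int = 4096) -> List[str]:
    """Ask for n choices in one call; fall back to n single calls if the provider rejects n."""
    try:
        # Preferred path: one request that returns n choices.
        response = client.chat.completions.create(
            model=model, messages=messages, max_tokens=max_tokens, n=n, temperature=1
        )
        return [choice.message.content for choice in response.choices]
    except Exception:
        # Fallback path: some providers ignore or reject `n`, so issue n independent requests.
        candidates: List[str] = []
        for _ in range(n):
            try:
                response = client.chat.completions.create(
                    model=model, messages=messages, max_tokens=max_tokens, temperature=1
                )
                candidates.append(response.choices[0].message.content)
            except Exception:
                continue  # skip a failed attempt; the caller decides what to do with fewer candidates
        return candidates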
262 changes: 129 additions & 133 deletions optillm/inference.py
@@ -22,6 +22,7 @@
from optillm.cot_decoding import cot_decode
from optillm.entropy_decoding import entropy_decode
from optillm.thinkdeeper import thinkdeeper_decode
from optillm.thinkdeeper_mlx import thinkdeeper_decode_mlx
from optillm.autothink import autothink_decode

# Configure logging
@@ -33,6 +34,7 @@
import mlx.core as mx
from mlx_lm import load as mlx_load, generate as mlx_generate
from mlx_lm.tokenizer_utils import TokenizerWrapper
from mlx_lm.sample_utils import make_sampler
MLX_AVAILABLE = True
logger.info("MLX framework available")
except ImportError:
@@ -349,85 +351,46 @@ def generate(
return responses, token_counts, logprobs_results

def _robust_mlx_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, repetition_penalty: float) -> str:
"""Robust MLX generation with multiple parameter combinations"""

# Try different parameter combinations based on MLX-LM version
parameter_combinations = [
# Version 1: Current style with positional args and temp
{
"style": "positional_temp",
"args": (self.model, self.tokenizer, prompt),
"kwargs": {
"max_tokens": max_tokens,
"temp": temperature,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"verbose": False
}
},
# Version 2: All keyword arguments with temp
{
"style": "keyword_temp",
"args": (),
"kwargs": {
"model": self.model,
"tokenizer": self.tokenizer,
"prompt": prompt,
"max_tokens": max_tokens,
"temp": temperature,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"verbose": False
}
},
# Version 3: Using temperature instead of temp
{
"style": "positional_temperature",
"args": (self.model, self.tokenizer, prompt),
"kwargs": {
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"verbose": False
}
},
# Version 4: Minimal parameters only
{
"style": "minimal",
"args": (self.model, self.tokenizer, prompt),
"kwargs": {
"max_tokens": max_tokens,
"temp": temperature,
"verbose": False
}
},
# Version 5: Just essential parameters
{
"style": "essential",
"args": (self.model, self.tokenizer, prompt),
"kwargs": {
"max_tokens": max_tokens
}
}
]
"""Robust MLX generation using sampler approach"""

last_error = None

for combo in parameter_combinations:
try:
# Create sampler with generation parameters
sampler = make_sampler(
temp=temperature,
top_p=top_p,
min_p=0.0, # Default min_p
min_tokens_to_keep=1 # Default min_tokens_to_keep
)

# Generate using the sampler
response = mlx_generate(
self.model,
self.tokenizer,
prompt,
max_tokens=max_tokens,
sampler=sampler,
verbose=False
)

return response

except Exception as e:
logger.error(f"MLX generation with sampler failed: {str(e)}")

# Fallback: Try minimal parameters without sampler
try:
logger.debug(f"Trying MLX generation with style: {combo['style']}")
response = mlx_generate(*combo["args"], **combo["kwargs"])
logger.debug(f"Successfully generated with style: {combo['style']}")
logger.debug("Attempting MLX generation without sampler")
response = mlx_generate(
self.model,
self.tokenizer,
prompt,
max_tokens=max_tokens,
verbose=False
)
return response

except Exception as e:
last_error = e
logger.debug(f"Failed with style {combo['style']}: {str(e)}")
continue

# If all combinations failed, raise the last error
raise RuntimeError(f"All MLX generation methods failed. Last error: {str(last_error)}")
except Exception as fallback_e:
logger.error(f"MLX fallback generation also failed: {str(fallback_e)}")
raise

def format_chat_prompt(self, system_prompt: str, user_prompt: str) -> str:
"""Format the prompt according to model's chat template"""
@@ -1691,37 +1654,47 @@ def create(
if decoding:
logger.info(f"Using specialized decoding approach: {decoding}")

# Ensure model is in eval mode and on correct device
pipeline.current_model.eval()
device = pipeline.current_model.device
# Check if this decoding approach is supported for MLX
mlx_unsupported_decodings = ["cot_decoding", "entropy_decoding", "autothink"]
if isinstance(pipeline, MLXInferencePipeline) and decoding in mlx_unsupported_decodings:
logger.warning(f"{decoding} is not supported for MLX models. Falling back to standard generation.")
decoding = None

if decoding:
# For PyTorch pipelines, ensure model is in eval mode and get device
# MLX pipelines handle this differently
if not isinstance(pipeline, MLXInferencePipeline):
pipeline.current_model.eval()
device = pipeline.current_model.device
else:
device = None # MLX doesn't use torch devices

if decoding == "cot_decoding":
# Use directly available parameters for CoT
cot_params = {
"k": k,
"num_beams": num_beams,
"max_new_tokens": max_tokens if max_tokens is not None else 512,
"temperature": temperature,
"top_p": top_p,
"repetition_penalty": 1.0,
"length_penalty": length_penalty,
"no_repeat_ngram_size": no_repeat_ngram_size,
"early_stopping": early_stopping,
"aggregate_paths": aggregate_paths,
}

result, confidence = cot_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
**cot_params
)
responses = [result]
logprobs_results = [{"confidence_score": confidence} if confidence is not None else None]
completion_tokens = len(pipeline.tokenizer.encode(result))
cot_params = {
"k": k,
"num_beams": num_beams,
"max_new_tokens": max_tokens if max_tokens is not None else 512,
"temperature": temperature,
"top_p": top_p,
"repetition_penalty": 1.0,
"length_penalty": length_penalty,
"no_repeat_ngram_size": no_repeat_ngram_size,
"early_stopping": early_stopping,
"aggregate_paths": aggregate_paths,
}
result, confidence = cot_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
**cot_params
)
responses = [result]
logprobs_results = [{"confidence_score": confidence} if confidence is not None else None]
completion_tokens = len(pipeline.tokenizer.encode(result))

elif decoding == "entropy_decoding":

# Ensure model is using full precision
original_dtype = pipeline.current_model.dtype
pipeline.current_model = pipeline.current_model.to(torch.float32)
@@ -1778,43 +1751,66 @@ def create(
}
thinkdeeper_config.update(custom_config)

result = thinkdeeper_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
thinkdeeper_config
# Check if we're using MLX pipeline
if isinstance(pipeline, MLXInferencePipeline):
logger.info("Using MLX ThinkDeeper implementation")

# Ensure we have enough tokens for thinking + response
user_max_tokens = max_tokens if max_tokens is not None else 512
total_tokens_needed = max_thinking_tokens + 512 # thinking + response buffer
adjusted_max_tokens = max(user_max_tokens, total_tokens_needed)

# Add max_tokens to thinkdeeper config
thinkdeeper_config_with_tokens = thinkdeeper_config.copy()
thinkdeeper_config_with_tokens["max_tokens"] = adjusted_max_tokens

logger.debug(f"ThinkDeeper tokens: user={user_max_tokens}, thinking={max_thinking_tokens}, adjusted={adjusted_max_tokens}")

result = thinkdeeper_decode_mlx(
pipeline.model,
pipeline.tokenizer,
messages,
thinkdeeper_config_with_tokens
)
else:
logger.info("Using PyTorch ThinkDeeper implementation")
result = thinkdeeper_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
thinkdeeper_config
)
responses = [result]
logprobs_results = [None]
completion_tokens = len(pipeline.tokenizer.encode(result))
elif decoding == "autothink":
# Get steering dataset configuration
steering_dataset = kwargs.get("steering_dataset", "codelion/Qwen3-0.6B-pts-steering-vectors")
target_layer = kwargs.get("target_layer", 19)

# Prepare AutoThink configuration
autothink_config = {
"steering_dataset": steering_dataset,
"target_layer": target_layer,
"pattern_strengths": kwargs.get("pattern_strengths", {
"depth_and_thoroughness": 2.5,
"numerical_accuracy": 2.0,
"self_correction": 3.0,
"exploration": 2.0,
"organization": 1.5
})
}

# Process with AutoThink
result = autothink_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
autothink_config
)
responses = [result]
logprobs_results = [None]
completion_tokens = len(pipeline.tokenizer.encode(result))
steering_dataset = kwargs.get("steering_dataset", "codelion/Qwen3-0.6B-pts-steering-vectors")
target_layer = kwargs.get("target_layer", 19)
# Prepare AutoThink configuration
autothink_config = {
"steering_dataset": steering_dataset,
"target_layer": target_layer,
"pattern_strengths": kwargs.get("pattern_strengths", {
"depth_and_thoroughness": 2.5,
"numerical_accuracy": 2.0,
"self_correction": 3.0,
"exploration": 2.0,
"organization": 1.5
})
}
# Process with AutoThink
result = autothink_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
autothink_config
)
responses = [result]
logprobs_results = [None]
completion_tokens = len(pipeline.tokenizer.encode(result))
else:
raise ValueError(f"Unknown specialized decoding approach: {decoding}")

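Two behaviors in the create() changes above are worth calling out for reviewers: specialized decoders with no MLX implementation (cot_decoding, entropy_decoding, autothink) now fall back to standard generation, and the MLX ThinkDeeper path pads the token budget so the thinking phase cannot starve the final response. A compact sketch of that budget rule with illustrative numbers (the function below is local to this note, not part of optillm):

from typing import Optional

def adjusted_budget(user_max_tokens: Optional[int], max_thinking_tokens: int, response_buffer: int = 512) -> int:
    """Never hand ThinkDeeper fewer tokens than the thinking budget plus a response buffer."""
    requested = user_max_tokens if user_max_tokens is not None else 512
    return max(requested, max_thinking_tokens + response_buffer)

# A 512-token request with a 6144-token thinking budget is raised to 6656 tokens,
# while a 16000-token request already covers the thinking budget and is left unchanged.
assert adjusted_budget(512, 6144) == 6656
assert adjusted_budget(16000, 6144) == 16000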