Models API Reference

Complete API documentation for the MARSYS model system, providing unified interfaces for local and API-based language models.

Model Selection Guide

For guidance on choosing models and when to use VLM, see the Models Concept Guide.

ModelConfig

Configuration schema for all model types using Pydantic validation.

Class Definition

from pydantic import BaseModel, Field
from typing import Literal, Optional, Dict, Any
class ModelConfig(BaseModel):
"""Unified configuration for all model types."""
# Core settings
type: Literal["local", "api"] = Field(
description="Model type - local or API-based"
)
name: str = Field(
description="Model identifier or HuggingFace path"
)
# API settings
provider: Optional[str] = Field(
default=None,
description="API provider (openai, anthropic, google, openrouter, xai, openai-oauth, anthropic-oauth)"
)
base_url: Optional[str] = Field(
default=None,
description="Custom API endpoint URL"
)
api_key: Optional[str] = Field(
default=None,
description="API key (auto-loaded from env if None)"
)
oauth_profile: Optional[str] = Field(
default=None,
description="OAuth profile name for openai-oauth / anthropic-oauth"
)
# Generation parameters
max_tokens: int = Field(default=8192, description="Maximum output tokens")
temperature: float = Field(default=0.7, ge=0.0, le=2.0, description="Sampling temperature")
top_p: float = Field(default=1.0, ge=0.0, le=1.0, description="Nucleus sampling parameter")
frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0, description="Frequency penalty")
presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0, description="Presence penalty")
# Reasoning parameters
thinking_budget: Optional[int] = Field(
default=1024,
description="Token budget for extended thinking (models with thinking support)"
)
reasoning_effort: Optional[str] = Field(
default="low",
description="Reasoning effort level (low, medium, high)"
)
# Local model settings
model_class: Optional[Literal["llm", "vlm"]] = Field(
default=None,
description="Local model class (required for type='local')"
)
backend: Optional[Literal["huggingface", "vllm"]] = Field(
default="huggingface",
description="Backend: 'huggingface' (dev) or 'vllm' (production)"
)
torch_dtype: str = Field(default="auto", description="PyTorch dtype")
device_map: str = Field(default="auto", description="Device mapping strategy (HuggingFace only)")
# vLLM-specific settings
tensor_parallel_size: Optional[int] = Field(default=1, description="Number of GPUs for tensor parallelism")
gpu_memory_utilization: Optional[float] = Field(default=0.9, description="GPU memory utilization fraction 0-1")
quantization: Optional[Literal["awq", "gptq", "fp8"]] = Field(default=None, description="Quantization method")
# Additional parameters
parameters: Dict[str, Any] = Field(default_factory=dict, description="Provider-specific parameters")

Usage Examples

from marsys.models import ModelConfig
# OpenAI GPT-5 Codex
gpt5_config = ModelConfig(
type="api",
provider="openrouter",
name="openai/gpt-5-codex",
temperature=0.7,
max_tokens=12000
)
# Anthropic Claude Opus 4.6
claude_config = ModelConfig(
type="api",
provider="openrouter",
name="anthropic/claude-opus-4.6",
temperature=0.5,
max_tokens=12000
)
# Local LLM (HuggingFace backend)
llm_config = ModelConfig(
type="local",
name="Qwen/Qwen3-4B-Instruct-2507",
model_class="llm",
backend="huggingface",
torch_dtype="bfloat16",
device_map="auto",
max_tokens=4096
)
# Local VLM (vLLM backend for production)
vlm_config = ModelConfig(
type="local",
name="Qwen/Qwen3-VL-8B-Instruct",
model_class="vlm",
backend="vllm",
tensor_parallel_size=2,
gpu_memory_utilization=0.9,
quantization="fp8",
max_tokens=4096
)
# Custom API endpoint
custom_config = ModelConfig(
type="api",
name="custom-model",
base_url="https://api.mycompany.com/v1",
api_key="custom-key",
parameters={"custom_param": "value"}
)

OAuth Providers (No API Keys)

MARSYS supports OAuth-backed providers that use local CLI credentials instead of API keys:

  • openai-oauth: ChatGPT subscription via Codex CLI (codex login)
  • anthropic-oauth: Claude Max subscription via Claude CLI (claude login)

Credentials are read from local files and can be overridden with environment variables:

  • OpenAI OAuth: ~/.codex/auth.json (override with CODEX_AUTH_PATH)
  • Anthropic OAuth: ~/.claude/.credentials.json (override with CLAUDE_AUTH_PATH)
# OpenAI ChatGPT OAuth (Codex CLI)
openai_oauth = ModelConfig(
type="api",
provider="openai-oauth",
name="gpt-5.3-codex",
credentials_path="~/.codex/auth.json" # Optional override (equivalent to setting CODEX_AUTH_PATH; not part of the ModelConfig schema above — verify support)
)
# Anthropic Claude OAuth (Claude CLI)
anthropic_oauth = ModelConfig(
type="api",
provider="anthropic-oauth",
name="claude-opus-4-6",
credentials_path="~/.claude/.credentials.json" # Optional override (equivalent to setting CLAUDE_AUTH_PATH; not part of the ModelConfig schema above — verify support)
)

Use At Your Own Risk (Anthropic OAuth)

anthropic-oauth relies on a non-official integration path and may violate provider Terms of Service. Use at your own risk.

OpenAI OAuth Compliance

MARSYS does not make a legal determination about OpenAI ToS coverage for this OAuth path. Review OpenAI terms for your use case.

Model Classes

Local Model Architecture

MARSYS uses an adapter pattern for local models, supporting two backends:

┌──────────────────────────────┐
│ BaseLocalModel │
│ (Unified Interface) │
└────────────┬─────────────────┘
┌────────────┴─────────────────┐
│ LocalAdapterFactory │
└────────────┬─────────────────┘
┌───────────────────────┼───────────────────────┐
▼ ▼ ▼
┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐
│ HuggingFaceLLM │ │ HuggingFaceVLM │ │ VLLMAdapter │
│ Adapter │ │ Adapter │ │ (LLM & VLM) │
└──────────────────┘ └──────────────────┘ └──────────────────┘

BaseLocalModel

Unified interface for local models. Recommended for most use cases.

from marsys.models import BaseLocalModel
class BaseLocalModel:
"""Base class for local models using adapter pattern."""
def __init__(
self,
model_name: str,
model_class: str = "llm",
backend: str = "huggingface",
max_tokens: int = 1024,
thinking_budget: Optional[int] = None,
**kwargs
):
"""
Args:
model_name: HuggingFace model identifier
model_class: "llm" or "vlm"
backend: "huggingface" or "vllm"
max_tokens: Maximum generation tokens
thinking_budget: Token budget for thinking models
**kwargs: Backend-specific parameters
"""

run(messages, **kwargs) -> Dict[str, Any]

Execute the model synchronously.

| Parameter | Type | Description |
| --- | --- | --- |
| messages | List[Dict] | Conversation messages |
| json_mode | bool | Enable JSON output mode |
| max_tokens | Optional[int] | Override max tokens |
| tools | Optional[List[Dict]] | Tool definitions |
| images | Optional[List] | Images for VLM |

Returns:

{
"role": "assistant",
"content": "Generated response text",
"thinking": "Optional thinking content for thinking models",
"tool_calls": []
}

arun(messages, **kwargs) -> HarmonizedResponse

Execute the model asynchronously.

Example

from marsys.models import BaseLocalModel
# HuggingFace backend (development)
model = BaseLocalModel(
model_name="Qwen/Qwen3-4B-Instruct-2507",
model_class="llm",
backend="huggingface",
torch_dtype="bfloat16",
device_map="auto",
max_tokens=4096
)
response = model.run(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum computing"}
]
)
print(response["content"])
# vLLM backend (production)
vlm_model = BaseLocalModel(
model_name="Qwen/Qwen3-VL-8B-Instruct",
model_class="vlm",
backend="vllm",
tensor_parallel_size=2,
gpu_memory_utilization=0.9,
max_tokens=4096
)

LocalProviderAdapter

Abstract base class for local model adapters. Used internally by BaseLocalModel.

class LocalProviderAdapter(ABC):
"""Abstract base class for local model provider adapters."""
# Training access (HuggingFace only)
model: Any = None # Raw PyTorch model
tokenizer: Any = None # HuggingFace tokenizer
@property
def supports_training(self) -> bool:
"""True for HuggingFace adapters, False for vLLM."""
@property
def backend(self) -> str:
"""Backend name: 'huggingface' or 'vllm'."""

HuggingFaceLLMAdapter

Adapter for text-only language models using HuggingFace transformers.

from marsys.models import HuggingFaceLLMAdapter
adapter = HuggingFaceLLMAdapter(
model_name="Qwen/Qwen3-4B-Instruct-2507",
max_tokens=4096,
torch_dtype="bfloat16",
device_map="auto",
thinking_budget=256,
trust_remote_code=True
)
# Access for training
pytorch_model = adapter.model # AutoModelForCausalLM
tokenizer = adapter.tokenizer # AutoTokenizer

HuggingFaceVLMAdapter

Adapter for vision-language models using HuggingFace transformers.

from marsys.models import HuggingFaceVLMAdapter
adapter = HuggingFaceVLMAdapter(
model_name="Qwen/Qwen3-VL-8B-Instruct",
max_tokens=4096,
torch_dtype="bfloat16",
device_map="auto",
thinking_budget=256
)
# Process images in messages
response = adapter.run(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "path/to/image.jpg"}}
]
}
]
)

VLLMAdapter

Adapter for high-throughput production inference using vLLM.

from marsys.models import VLLMAdapter
adapter = VLLMAdapter(
model_name="Qwen/Qwen3-VL-8B-Instruct",
model_class="vlm",
max_tokens=4096,
tensor_parallel_size=2, # Multi-GPU
gpu_memory_utilization=0.9, # Memory fraction
quantization="fp8", # awq, gptq, fp8
trust_remote_code=True
)
# Note: vLLM doesn't support training
assert not adapter.supports_training

LocalAdapterFactory

Factory to create the appropriate adapter.

from marsys.models import LocalAdapterFactory
# Create HuggingFace LLM adapter
adapter = LocalAdapterFactory.create_adapter(
backend="huggingface",
model_name="Qwen/Qwen3-4B-Instruct-2507",
model_class="llm",
torch_dtype="bfloat16",
device_map="auto"
)
# Create vLLM VLM adapter
adapter = LocalAdapterFactory.create_adapter(
backend="vllm",
model_name="Qwen/Qwen3-VL-8B-Instruct",
model_class="vlm",
tensor_parallel_size=2
)

BaseAPIModel

Base class for API-based models.

class BaseAPIModel:
"""API model wrapper."""
def __init__(
self,
provider: str,
model_name: str,
api_key: Optional[str] = None,
base_url: Optional[str] = None,
max_tokens: int = 1024,
**kwargs
):
"""
Args:
provider: API provider name
model_name: Model identifier
api_key: API key (auto-loaded from env if None)
base_url: Custom endpoint URL
max_tokens: Maximum tokens
**kwargs: Provider-specific parameters
"""

Supported Providers

| Provider | Models | Environment Variable |
| --- | --- | --- |
| openrouter | All major models | OPENROUTER_API_KEY |
| openai | gpt-5-codex, etc. | OPENAI_API_KEY |
| openai-oauth | gpt-5.3-codex | codex login (~/.codex/auth.json) |
| anthropic | claude-opus-4-6, claude-opus-4.6 (alias), etc. | ANTHROPIC_API_KEY |
| anthropic-oauth | claude-opus-4-6 | claude login (~/.claude/.credentials.json) |
| google | gemini-3-flash-preview, gemini-3-pro-preview, etc. | GOOGLE_API_KEY |
| xai | grok-4, grok-4-fast, grok-3, etc. | XAI_API_KEY |

run(messages, **kwargs) -> Dict[str, Any]

Execute API model.

| Parameter | Type | Description |
| --- | --- | --- |
| messages | List[Dict] | Conversation messages |
| json_mode | bool | Force JSON response (non-schema mode) |
| response_schema | Optional[Dict] | Strict JSON schema for structured output |
| tools | Optional[List[Dict]] | Function definitions |
| tool_choice | Optional[str] | Tool selection strategy |
from marsys.models import BaseAPIModel
model = BaseAPIModel(
provider="openrouter",
model_name="anthropic/claude-opus-4.6",
temperature=0.7,
max_tokens=12000
)
response = await model.run(
messages=[
{"role": "user", "content": "Hello!"}
],
tools=[{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string"}
},
"required": ["location"]
}
}
}]
)
if response.get("tool_calls"):
for tool_call in response["tool_calls"]:
print(f"Tool: {tool_call['function']['name']}")
print(f"Args: {tool_call['function']['arguments']}")

Model Factory

API Models

Use BaseAPIModel.from_config():

from marsys.models import BaseAPIModel, ModelConfig
config = ModelConfig(
type="api",
provider="openrouter",
name="anthropic/claude-opus-4.6",
max_tokens=12000
)
model = BaseAPIModel.from_config(config)
response = await model.arun(messages=[{"role": "user", "content": "Hello!"}])

Local Models

Use BaseLocalModel:

from marsys.models import BaseLocalModel, ModelConfig
config = ModelConfig(
type="local",
model_class="llm",
name="Qwen/Qwen3-4B-Instruct-2507",
backend="huggingface",
torch_dtype="bfloat16",
device_map="auto"
)
model = BaseLocalModel(
model_name=config.name,
model_class=config.model_class,
backend=config.backend,
torch_dtype=config.torch_dtype,
device_map=config.device_map,
max_tokens=config.max_tokens
)
response = model.run(messages=[{"role": "user", "content": "Hello!"}])

Advanced Features

Tool Calling

Models support OpenAI-compatible function calling:

tools = [
{
"type": "function",
"function": {
"name": "search_web",
"description": "Search the web for information",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search query"
},
"max_results": {
"type": "integer",
"description": "Maximum results",
"default": 5
}
},
"required": ["query"]
}
}
}
]
response = await model.run(
messages=[
{"role": "user", "content": "Find information about Mars rovers"}
],
tools=tools,
tool_choice="auto"
)
# Handle tool calls
if response.get("tool_calls"):
for call in response["tool_calls"]:
if call["function"]["name"] == "search_web":
args = json.loads(call["function"]["arguments"])
results = search_web(args["query"], args.get("max_results", 5))
# Add tool result to conversation
messages.append({
"role": "tool",
"content": json.dumps(results),
"tool_call_id": call["id"]
})

JSON Mode

Force structured JSON output:

response = await model.run(
messages=[
{
"role": "system",
"content": "Always respond with JSON: {\"answer\": str, \"confidence\": float}"
},
{
"role": "user",
"content": "What is 2+2?"
}
],
json_mode=True
)
data = json.loads(response["content"])
print(f"Answer: {data['answer']} (Confidence: {data['confidence']})")

Structured Output (response_schema)

Use response_schema for strict schema-constrained JSON:

schema = {
"type": "object",
"properties": {
"answer": {"type": "string"},
"confidence": {"type": "number"},
},
"required": ["answer", "confidence"],
}
response = await model.run(
messages=[{"role": "user", "content": "What is 2+2?"}],
response_schema=schema,
)

Provider behavior:

  • OpenAI / OpenRouter / OpenAI OAuth: native JSON schema mode
  • Google: responseSchema in generation config
  • Anthropic / Anthropic OAuth: native output_config.format JSON schema
  • response_schema takes precedence over json_mode

Streaming Responses

async for chunk in model.stream(
messages=[{"role": "user", "content": "Write a story"}]
):
print(chunk["content"], end="", flush=True)

Error Handling

Automatic Retry for Server Errors

Built-in Resilience

API adapters automatically retry transient server errors with exponential backoff. No manual retry needed!

Configuration: Max Retries: 3 (total 4 attempts), Backoff: 1s, 2s, 4s (exponential)

Retryable Status Codes:

  • 500 - Internal Server Error
  • 502 - Bad Gateway
  • 503 - Service Unavailable
  • 504 - Gateway Timeout
  • 529 - Overloaded (Anthropic)
  • 408 - Request Timeout (OpenRouter)
  • 429 - Rate Limit (respects retry-after header)

Provider-Specific Retry Behavior

| Provider | Retryable Errors | Non-Retryable Errors |
| --- | --- | --- |
| OpenRouter | 408, 429, 502, 503, 500+ | 400, 401, 402, 403 |
| OpenAI | 429, 500, 502, 503 | 400, 401, 404 |
| Anthropic | 429, 500, 529 | 400, 401, 403, 413 |
| Google | 429, 500, 503, 504 | 400, 403, 404 |

Manual Error Handling

For errors that aren't automatically retried (client errors, quota issues, etc.):

from marsys.agents.exceptions import (
ModelError,
ModelAPIError,
ModelTimeoutError,
ModelRateLimitError,
ModelTokenLimitError
)
try:
response = await model.run(messages)
except ModelRateLimitError as e:
logger.error("Rate limit exceeded after retries")
if e.retry_after:
logger.info(f"Retry after {e.retry_after}s")
except ModelTokenLimitError as e:
logger.warning(f"Token limit exceeded: {e.message}")
messages = truncate_messages(messages, e.limit)
response = await model.run(messages)
except ModelAPIError as e:
if e.status_code and e.status_code >= 500:
logger.error(f"Server error persisted after retries: {e.message}")
else:
logger.error(f"Client error: {e.status_code} - {e.message}")

Error Classification

All ModelAPIError instances include classification:

except ModelAPIError as e:
print(f"Error Code: {e.error_code}")
print(f"Classification: {e.classification}")
print(f"Is Retryable: {e.is_retryable}")
print(f"Retry After: {e.retry_after}s")
print(f"Suggested Action: {e.suggested_action}")

Best Practices

Configuration Management

# GOOD - Environment-based config
import os
from marsys.models import ModelConfig
config = ModelConfig(
type="api",
provider="openrouter",
name=os.getenv("MODEL_NAME", "anthropic/claude-opus-4.6"),
temperature=float(os.getenv("MODEL_TEMPERATURE", "0.7")),
max_tokens=int(os.getenv("MAX_TOKENS", "12000"))
)
# BAD - Hardcoded values
config = ModelConfig(
type="api",
provider="openrouter",
name="anthropic/claude-opus-4.6",
api_key="sk-..." # Never hardcode!
)

Error Recovery

# GOOD - Graceful degradation
async def robust_model_call(messages, fallback_model=None):
try:
return await primary_model.run(messages)
except ModelError as e:
if fallback_model:
logger.warning(f"Primary failed, using fallback: {e}")
return await fallback_model.run(messages)
raise
# BAD - No error handling
response = await model.run(messages) # Can fail!

Related Documentation