Python SDK
Every Keystone Python class, method, and option — what each does, when to use it, what it returns.
polarity-keystone — the Python SDK. Pure standard library; no dependencies beyond urllib, json, and threading. Compatible with Python 3.9+.
Install
pip install polarity-keystone
# or
uv add polarity-keystone
# or
poetry add polarity-keystone

Keystone — the client
from polarity_keystone import Keystone
ks = Keystone(
api_key="ks_live_...", # optional — falls back to KEYSTONE_API_KEY
base_url="https://keystone.polarity.so", # default
timeout=30, # seconds
)

After construction, the nine services hang off the client: ks.sandboxes, ks.specs, ks.experiments, ks.alerts, ks.agents, ks.datasets, ks.scoring, ks.export, ks.prompts.
ks.wrap(client, sandbox_id=None, *, tracing=True, auto_instrument=True) — wrap an LLM client
from anthropic import Anthropic
ks = Keystone()
anthropic = ks.wrap(Anthropic())
# Every anthropic.messages.create() now auto-reports.

Three actions, controllable via flags:
| Flag | Default | Effect |
|---|---|---|
tracing | True | Initialize @traced span reporting on the current thread |
auto_instrument | True | Patch every importable LLM provider in the process — OpenAI, Anthropic, Mistral, Google GenAI, LiteLLM, LangChain, Claude Agent SDK, DSPy |
sandbox_id | None | Falls back to KEYSTONE_SANDBOX_ID env var; if absent, agent mode (events scoped by API key) |
# Wrap-only — no global tracing, no auto_instrument
ks.wrap(Anthropic(), tracing=False, auto_instrument=False)
# Pin sandbox id explicitly
ks.wrap(Anthropic(), sandbox_id="sb-explicit")

ks.init_tracing(sandbox_id=None) — set up @traced only
ks.init_tracing() # picks up KEYSTONE_SANDBOX_ID
ks.init_tracing("sb-explicit")

No-op without sandbox id and API key. Use when you don't have an LLM client to wrap.
ks.observe(*, clients=None, tracing=True, auto_instrument=True, sandbox_id=None) — one-call observability
applied = ks.observe(clients=[Anthropic(), OpenAI()])
# applied: ['anthropic-client', 'openai-client', 'tracing', 'openai', 'anthropic', ...]

Wraps every client, initializes @traced, auto-instruments every importable framework. Returns instrumentation labels.
Keystone.from_sandbox() — inside the sandbox
ks, sb = Keystone.from_sandbox()
db = sb.services["db"] # ServiceInfo(host="db", port=5432, ready=True)

Reads KEYSTONE_SANDBOX_ID, KEYSTONE_API_KEY, KEYSTONE_BASE_URL from the env. Raises KeystoneError if KEYSTONE_SANDBOX_ID isn't set.
ks.health()
status = ks.health() # {"status": "ok"}

GET /health. Quick liveness check.
SandboxService
ks.sandboxes.create(spec_id, timeout=None, metadata=None, *, secrets=None, spec_path=None)
sb = ks.sandboxes.create(
spec_id="fix-failing-test",
timeout="10m",
metadata={"run": "ci-7821"},
secrets={"ANTHROPIC_API_KEY": "..."},
spec_path="./specs/fix-failing-test.yaml", # auto-forwards declared secrets
)
# Returns Sandbox dataclass

When spec_path is provided, the SDK reads the spec's secrets: block, calls collect_declared_secrets_from_file(), and merges into the request body. Explicit secrets win on collision.
Other sandbox methods
ks.sandboxes.get("sb-abc")
ks.sandboxes.list() # list[Sandbox]
ks.sandboxes.destroy("sb-abc")
ks.sandboxes.run_command("sb-abc", "npm test", timeout="2m") # CommandResult
ks.sandboxes.read_file("sb-abc", "src/main.ts") # bytes
ks.sandboxes.write_file("sb-abc", "config.json", '{"x": 1}')
ks.sandboxes.delete_file("sb-abc", "tmp/cache.bin")
ks.sandboxes.state("sb-abc") # StateSnapshot
ks.sandboxes.diff("sb-abc") # StateDiff
ks.sandboxes.ingest_trace("sb-abc", events) # POST /v1/sandboxes/:id/trace
ks.sandboxes.get_trace("sb-abc") # dict

read_file returns bytes (not str) — decode if you know it's text.
SpecService
ks.specs.create(yaml_content) # POST /v1/specs (raw YAML)
ks.specs.get("fix-failing-test")
ks.specs.list()
ks.specs.delete("fix-failing-test")

ExperimentService
ks.experiments.create(name, spec_id, *, secrets=None, spec_path=None)
exp = ks.experiments.create(
name="baseline-v1",
spec_id="fix-failing-test",
spec_path="./specs/fix-failing-test.yaml", # auto-forwards declared secrets
secrets={"ANTHROPIC_API_KEY": "..."}, # explicit wins
)

Other experiment methods
ks.experiments.run(exp.id) # async — returns immediately
results = ks.experiments.run_and_wait(
exp.id,
poll_interval=1.0, # seconds
timeout=600, # seconds
scores=[Factuality(), AnswerRelevancy()], # client-side scorers
)
# Returns RunResults with full per-scenario detail
ks.experiments.get(exp.id) # current results (partial if running)
ks.experiments.list()
ks.experiments.compare("exp-baseline", "exp-new") # Comparison
ks.experiments.metrics(exp.id) # ExperimentMetrics

AlertService
ks.alerts.create({
"name": "pass-rate-drop",
"condition": "pass_rate < 0.8",
"notify": "slack",
"slack_channel": "#agent-alerts",
})
ks.alerts.list()
ks.alerts.delete("alert_abc")

AgentService
ks.agents.upload(name, path, entrypoint, *, runtime=None, auth=None, tag=None)
snap = ks.agents.upload(
name="email-agent",
path="./dist/email-agent", # dir, file, or .tar.gz
entrypoint=["python", "main.py"],
runtime="python3.12",
tag="v2.1",
auth={
"required_env": ["ANTHROPIC_API_KEY"],
"config_files": [{"path": ".env", "template": "..."}],
},
)

The Python SDK auto-detects the path type:
- .tar.gz → uploaded as-is
- Directory → tarred recursively (skipping common junk)
- Single file → wrapped in a one-file tarball
Returns AgentSnapshot with the auto-assigned version.
Other agent methods
ks.agents.get("email-agent") # latest
ks.agents.get("email-agent", tag="v2.1")
ks.agents.get("email-agent", version=3)
ks.agents.get_by_id("snap_abc...") # by content hash
items, next_cursor = ks.agents.list(limit=50, cursor=None)
items, next_cursor = ks.agents.list_versions("email-agent", limit=20)
ks.agents.delete(snapshot) # pass full AgentSnapshot

DatasetService
ds = ks.datasets.create("customer-emails", "Renewal scenarios")
ks.datasets.list()
ks.datasets.get("ds_abc")
ks.datasets.delete("ds_abc")
ks.datasets.add_records("ds_abc", [
{"input": {"id": "alice"}, "expected": {"subject": "Renewal"}, "tags": ["pro"]},
])
ks.datasets.get_records("ds_abc", version=3, tags=["pro"])

ScoringService
rule = ks.scoring.create_rule("factuality", "llm_as_judge", {
"model": "paragon-fast",
"rubric": {"pass": "...", "fail": "..."},
})
ks.scoring.list_rules()
ks.scoring.delete_rule(rule.id)
ks.scoring.score_experiment("exp-old", [rule.id])
scores = ks.scoring.get_scores("exp-old")

ExportService
Each method returns an iterator that pages through cursors automatically:
for event in ks.export.traces(experiment_id="exp-abc"):
print(event)
for span in ks.export.spans(root_span_id="span_xyz"):
print(span)
for scenario in ks.export.scenarios(experiment_id="exp-abc", status="failed"):
print(scenario)
for score in ks.export.scores(experiment_id="exp-abc"):
print(score)
# One-shot bundle
bundle = ks.export.experiment("exp-abc", format="json")
ndjson = ks.export.experiment("exp-abc", format="ndjson")
# Single trace
trace = ks.export.trace("trace-abc")

Pagination is transparent — keep iterating, the SDK fetches subsequent pages on demand.
@traced decorator
from polarity_keystone import traced
@traced
def write_config(cfg):
with open("config.json", "w") as f:
json.dump(cfg, f)
@traced(name="custom-name")
async def fetch_data(url):
async with aiohttp.ClientSession() as s:
return await s.get(url)

Four forms:
# 1. Decorator (no parens): @traced
@traced
def fn(): ...
# 2. Decorator with name: @traced(name="...")
@traced(name="custom-name")
def fn(): ...
# 3. Context manager: with traced(name="..."):
with traced(name="step") as span:
result = do_work()
span.set_output(result)
# 4. Direct call (TS-parity): traced("name", fn)
result = traced("step", lambda: do_work())
result = await traced("step", async_fn)

Works for sync and async functions. Errors propagate; the span is closed with status="error".
OTel bridge
from polarity_keystone import register_otel_flush, flush_otel
from opentelemetry import trace as otel_trace
tracer = otel_trace.get_tracer("my-app")
ks.init_tracing(otel_tracer=tracer)
# Every @traced span is also an OTel span with gen_ai.* attributes.
register_otel_flush(lambda: tracer_provider.shutdown())
flush_otel() # at process exit

auto_instrument()
from polarity_keystone import auto_instrument
applied = auto_instrument(sandbox_id="sb-xxx")
# applied: ['openai', 'anthropic', 'mistral', 'litellm', 'langchain', ...]

Detects every importable LLM provider in the process and patches each one. Idempotent. Requires a sandbox id (or KEYSTONE_SANDBOX_ID env var); raises RuntimeError otherwise.
Scorers
from polarity_keystone import (
# heuristic (7)
Contains, ExactMatch, Levenshtein, NumericDiff, JSONDiff, JSONValidity, SemanticListContains,
# llm-judge (9)
Factuality, Battle, ClosedQA, Humor, Moderation, Summarization,
SQLJudge, Translation, Security,
# rag (8)
ContextPrecision, ContextRecall, ContextRelevancy, ContextEntityRecall,
Faithfulness, AnswerRelevancy, AnswerSimilarity, AnswerCorrectness,
# embedding (1)
EmbeddingSimilarity, openai_embedder,
# sandbox invariants (5)
FileExists, FileContains, CommandExits, SQLEquals, LLMJudge,
)
from polarity_keystone.scorers import presets

Custom scorer:
from polarity_keystone import Scorer, Score
@Scorer
def word_count_under_100(scenario) -> Score:
words = len((scenario.agent_output or "").split())
return Score(
name="word_count_under_100",
score=1.0 if words < 100 else 0.0,
passed=words < 100,
message=f"{words} words",
)
ks.experiments.run_and_wait(exp.id, scores=[word_count_under_100])

Eval primitive
from polarity_keystone import Eval, Factuality, AnswerRelevancy
result = Eval(
"email-eval",
data=[{"input": ..., "expected": ...}, ...],
task=lambda input: my_agent.run(input),
scores=[Factuality(), AnswerRelevancy()],
max_concurrency=4,
)
print(result.summary)
# {factuality: EvalSummary(mean, p50, p95, count), answer_relevancy: ...}

Run the task per row in parallel, score each, aggregate per scorer. If KEYSTONE_API_KEY is set, also reports to the dashboard.
Pricing
from polarity_keystone import estimate_cost, pricing_table
cost = estimate_cost("claude-sonnet-4-5", 4200, 1800, 1500)
pricing_table["my-custom-model"] = {"input": 1.0, "output": 5.0}

Error handling
from polarity_keystone.types import KeystoneError
try:
ks.sandboxes.create(spec_id="missing")
except KeystoneError as err:
print(err.status_code, err.message)

Dataclasses
Every response shape is a dataclass with a .from_dict() classmethod:
from polarity_keystone.types import (
Sandbox, CommandResult, StateSnapshot, StateDiff,
Experiment, RunResults, ScenarioResult, RunMetrics,
AgentSnapshot, AgentAuth, ConfigFile,
Comparison, ExperimentMetrics, AlertRule,
TraceEvent, CostInfo, InvariantResult, ForbiddenCheckResult,
)

Use them for typing in your own code:
from polarity_keystone.types import RunResults
def process_results(results: RunResults) -> None:
print(f"{results.passed}/{results.total_scenarios} passed")
for scenario in results.scenarios:
if scenario.status == "fail":
print(f"FAIL: {scenario.scenario_id}")
for inv in scenario.invariants:
if not inv.passed:
print(f" {inv.name}: {inv.message}")