Go SDK
Every Keystone Go function, type, and option — what each does, when to use it, what it returns.
github.com/Polarityinc/keystone-sdk-go — the Go SDK. Idiomatic Go: context.Context for cancellation, options pattern for configuration, no reflection, zero deps beyond stdlib.
Install
go get github.com/Polarityinc/keystone-sdk-go
Import:
import keystone "github.com/Polarityinc/keystone-sdk-go"
NewClient(cfg) — the client
client := keystone.NewClient(keystone.Config{
APIKey: "ks_live_...", // optional — falls back to KEYSTONE_API_KEY
BaseURL: "https://keystone.polarity.so", // optional — default
Timeout: 30 * time.Second, // optional — default 30s
})
After construction, the nine services hang off the client struct: client.Sandboxes, client.Specs, client.Experiments, client.Alerts, client.Agents, client.Datasets, client.Scoring, client.Export, client.Prompts.
client.Wrap(sandboxID, base) — wrap an HTTP transport for LLM tracing
Go wraps the HTTP transport instead of the LLM client directly:
import (
"net/http"
"github.com/anthropics/anthropic-sdk-go"
"github.com/anthropics/anthropic-sdk-go/option"
)
transport, tc := client.Wrap("", http.DefaultTransport)
// `transport` intercepts /messages and /chat/completions calls and reports traces.
// `tc` is a *TracingContext for tc.Traced() custom spans.
anthropicClient := anthropic.NewClient(option.WithHTTPClient(&http.Client{
Transport: transport,
}))
Wrap resolves its mode in one of three ways:
- Sandbox mode — explicit sandboxID arg or KEYSTONE_SANDBOX_ID env var → events post to /v1/sandboxes/:id/trace.
- Agent mode — empty sandbox id but the client has an API key → events post to /v1/traces, scoped by API key.
- Neither — returns base unchanged (silent no-op for local dev / CI).
For OpenAI:
import "github.com/openai/openai-go"
openaiClient := openai.NewClient(option.WithHTTPClient(&http.Client{
Transport: transport,
}))
Any OpenAI-compatible provider (Groq, xAI, Together) works the same way — point the SDK at the right base URL, then put the wrapped transport in the HTTP client.
client.WrapTransport(sandboxID, base) — transport-only
If you don't want the *TracingContext, just the wrapped transport:
transport := keystone.WrapTransport(client, "", http.DefaultTransport)
client.InitTracing(sandboxID) — set up Traced() only
tc := client.InitTracing("") // picks up KEYSTONE_SANDBOX_ID or runs in agent mode
Returns a *TracingContext for tc.Traced(ctx, name, fn) calls.
keystone.FromSandbox(ctx) — inside the sandbox
import keystone "github.com/Polarityinc/keystone-sdk-go"
client, sandbox, err := keystone.FromSandbox(ctx)
if err != nil { return err }
dbInfo := sandbox.Services["db"] // {Host: "db", Port: 5432, Ready: true}
Reads KEYSTONE_SANDBOX_ID, KEYSTONE_API_KEY, and KEYSTONE_BASE_URL from the environment. Returns the client and the current sandbox.
SandboxService
client.Sandboxes.Create(ctx, req)
sb, err := client.Sandboxes.Create(ctx, keystone.CreateSandboxRequest{
SpecID: "fix-failing-test",
Timeout: 600, // seconds
Metadata: map[string]string{"run": "ci-7821"},
Secrets: map[string]string{"ANTHROPIC_API_KEY": "..."},
})
Returns *Sandbox:
type Sandbox struct {
ID string
SpecID string
State string // "creating" | "ready" | "running" | "stopped" | "error"
Path string
URL string
CreatedAt time.Time
Metadata map[string]string
Services map[string]ServiceInfo
}
Other sandbox methods
client.Sandboxes.Get(ctx, "sb-abc") // *Sandbox
client.Sandboxes.List(ctx) // []*Sandbox
client.Sandboxes.Destroy(ctx, "sb-abc")
client.Sandboxes.RunCommand(ctx, "sb-abc", keystone.CommandRequest{
Command: "npm test",
Timeout: 120, // seconds
}) // *CommandResult
client.Sandboxes.ReadFile(ctx, "sb-abc", "src/main.ts") // []byte
client.Sandboxes.WriteFile(ctx, "sb-abc", "config.json", []byte(`{"x":1}`))
client.Sandboxes.DeleteFile(ctx, "sb-abc", "tmp/cache.bin")
client.Sandboxes.State(ctx, "sb-abc") // *StateSnapshot
client.Sandboxes.Diff(ctx, "sb-abc") // *StateDiff
client.Sandboxes.IngestTrace(ctx, "sb-abc", events)
client.Sandboxes.GetTrace(ctx, "sb-abc") // *TraceResponse
SpecService
spec, err := client.Specs.Create(ctx, yamlBytes) // POST /v1/specs (raw YAML bytes)
client.Specs.Get(ctx, "fix-failing-test")
client.Specs.List(ctx)
client.Specs.Delete(ctx, "fix-failing-test")
ExperimentService
client.Experiments.Create(ctx, req)
exp, err := client.Experiments.Create(ctx, keystone.CreateExperimentRequest{
Name: "baseline-v1",
SpecID: "fix-failing-test",
Secrets: secrets, // see CollectDeclaredSecretsFromFile below
})
Auto-forwarding declared secrets:
secrets, err := keystone.CollectDeclaredSecretsFromFile("./specs/fix-failing-test.yaml")
exp, err := client.Experiments.Create(ctx, keystone.CreateExperimentRequest{
Name: "baseline-v1",
SpecID: "fix-failing-test",
Secrets: secrets,
})
Other experiment methods
client.Experiments.Run(ctx, exp.ID) // async — returns immediately
results, err := client.Experiments.RunAndWait(ctx, exp.ID, keystone.RunAndWaitOpts{
PollInterval: 2 * time.Second,
Timeout: 5 * time.Minute,
Scores: []keystone.Scorer{ /* client-side scorers */ },
})
client.Experiments.Get(ctx, exp.ID) // *RunResults
client.Experiments.List(ctx) // []*Experiment
client.Experiments.Compare(ctx, "exp-baseline", "exp-new") // *Comparison
client.Experiments.Metrics(ctx, exp.ID) // *ExperimentMetrics
RunAndWait retries 5xx responses transparently and aborts on 4xx. With Scores, it runs each client-side scorer over each completed scenario and merges the results into Invariants before returning.
AlertService
alert, err := client.Alerts.Create(ctx, keystone.CreateAlertRequest{
Name: "pass-rate-drop",
EvalID: "fix-failing-test",
Condition: "pass_rate < 0.8",
Notify: "slack",
SlackChannel: "#agent-alerts",
})
client.Alerts.Get(ctx, "alert_abc")
client.Alerts.List(ctx)
client.Alerts.Delete(ctx, "alert_abc")
AgentService
client.Agents.Upload(ctx, req, bundle)
import "os"
bundle, err := os.Open("dist/email-agent.tar.gz")
if err != nil { return err }
defer bundle.Close()
snap, err := client.Agents.Upload(ctx, keystone.UploadSnapshotRequest{
Name: "email-agent",
Tag: "v2.1",
Runtime: "python3.12",
Entrypoint: []string{"python", "main.py"},
Auth: &keystone.AgentAuth{
RequiredEnv: []string{"ANTHROPIC_API_KEY"},
},
}, bundle)
Returns an *AgentSnapshot with an auto-assigned version.
Other agent methods
// Functional options pattern
client.Agents.Get(ctx, "email-agent") // latest
client.Agents.Get(ctx, "email-agent", keystone.WithTag("v2.1"))
client.Agents.Get(ctx, "email-agent", keystone.WithVersion(3))
client.Agents.GetByID(ctx, "snap_abc...") // by content hash
page, err := client.Agents.List(ctx, keystone.WithLimit(50)) // *AgentPage
page, err := client.Agents.List(ctx, keystone.WithCursor(prev.NextCursor))
page, err := client.Agents.ListVersions(ctx, "email-agent", keystone.WithLimit(20))
client.Agents.Delete(ctx, snap) // pass full *AgentSnapshot
DatasetService
ds, err := client.Datasets.Create(ctx, "customer-emails", "Renewal scenarios")
client.Datasets.List(ctx)
client.Datasets.Get(ctx, "ds_abc")
client.Datasets.Delete(ctx, "ds_abc")
client.Datasets.AddRecords(ctx, "ds_abc", []keystone.DatasetRecord{
{Input: map[string]any{"id": "alice"}, Expected: map[string]any{"subject": "Renewal"}, Tags: []string{"pro"}},
})
records, err := client.Datasets.GetRecords(ctx, "ds_abc",
keystone.WithRecordVersion(3),
keystone.WithRecordTags("pro"),
)
ExportService
Each method returns a channel for streaming pagination:
ch := client.Export.Traces(ctx, keystone.TraceFilter{
ExperimentID: "exp-abc",
Tool: "write_file",
}, 100)
for event := range ch {
fmt.Println(event)
}
ch := client.Export.Spans(ctx, keystone.SpanFilter{RootSpanID: "span_xyz"}, 100)
ch := client.Export.Scenarios(ctx, keystone.ScenarioFilter{ExperimentID: "exp-abc", Status: "failed"}, 100)
ch := client.Export.Scores(ctx, keystone.ScoreFilter{ExperimentID: "exp-abc"}, 100)
// One-shot bundle:
data, err := client.Export.Experiment(ctx, "exp-abc", keystone.ExportJSON)
// or:
data, err := client.Export.Experiment(ctx, "exp-abc", keystone.ExportNDJSON)
// Single trace:
trace, err := client.Export.Trace(ctx, "trace-abc")
Canceling ctx closes the channel, so a consumer ranging over it stops cleanly.
TracingContext — custom spans
tc := client.InitTracing("") // sandbox or agent mode
err := tc.Traced(ctx, "write_file", func() error {
return os.WriteFile(path, content, 0644)
})
// With a return value:
result, err := keystone.TracedValue(tc, ctx, "compute", func() (Result, error) {
return doWork()
})
Nested Traced calls form a parent-child span tree (state held in tc.currentSpanID with a mutex for goroutine safety).
OTel bridge
import (
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/sdk/trace"
)
tp := trace.NewTracerProvider(/* ... */)
otel.SetTracerProvider(tp)
keystone.RegisterOtelFlush(func(ctx context.Context) error {
return tp.Shutdown(ctx)
})
// At shutdown:
keystone.FlushOtel(ctx)
Scorers
Each scorer implements keystone.Scorer:
type Scorer interface {
Name() string
Score(scenario ScenarioResult) *Score
ToInvariant() Invariant
}
Built-ins:
// heuristic
keystone.Contains{Text: "renewal"}
keystone.ExactMatch{Expected: "yes"}
keystone.Levenshtein{Expected: "expected", Threshold: 0.85}
keystone.NumericDiff{Expected: 42, Tolerance: 0.01}
keystone.JSONDiff{Expected: ..., Threshold: 0.9}
keystone.JSONValidity{}
// llm-judge
keystone.Factuality{Model: "paragon-fast"}
keystone.Moderation{}
keystone.Summarization{}
// sandbox
keystone.FileExists{Path: "output.json"}
keystone.FileContains{Path: "src/main.ts", Contains: "TODO"}
keystone.CommandExits{Command: "npm test", ExitCode: 0}
keystone.SQLEquals{Service: "db", Query: "SELECT count(*)", Equals: 5}
keystone.LLMJudge{Criteria: "...", Model: "paragon-fast"}
Custom scorer:
type WordCount struct{}
func (s WordCount) Name() string { return "word_count_under_100" }
func (s WordCount) Score(scenario keystone.ScenarioResult) *keystone.Score {
words := len(strings.Fields(scenario.AgentOutput))
return &keystone.Score{
Name: s.Name(),
Score: boolToFloat(words < 100),
Passed: words < 100,
Message: fmt.Sprintf("%d words", words),
}
}
results, err := client.Experiments.RunAndWait(ctx, exp.ID, keystone.RunAndWaitOpts{
Scores: []keystone.Scorer{WordCount{}},
})
Eval primitive
result, err := keystone.Eval(ctx, "email-eval", keystone.EvalConfig{
Data: []keystone.EvalRow{
{Input: ..., Expected: ...},
},
Task: func(ctx context.Context, input any) (any, error) {
return myAgent.Run(ctx, input)
},
Scores: []keystone.Scorer{keystone.Factuality{}, keystone.AnswerRelevancy{}},
MaxConcurrency: 4,
Keystone: client,
})
fmt.Println(result.Summary)
// map[factuality:{Mean:0.93 P50:1 P95:1 Count:50} answer_relevancy:{...}]
Runs Task per row with bounded concurrency, scores each result, and aggregates per scorer. If KEYSTONE_API_KEY is set, it also reports results to the dashboard.
Pricing
cost := keystone.EstimateCost("claude-sonnet-4-5", 4200, 1800, 1500)
// 0.0405 USD
Error handling
import "errors"
_, err := client.Sandboxes.Create(ctx, ...)
if err != nil {
var apiErr *keystone.APIError
if errors.As(err, &apiErr) {
fmt.Println(apiErr.StatusCode, apiErr.Message)
}
}
Types
Every shape lives in types.go. The most useful:
type Sandbox struct { ID, SpecID, State, Path, URL string; CreatedAt time.Time; Metadata map[string]string; Services map[string]ServiceInfo }
type Experiment struct { ID, Name, SpecID, Status string; CreatedAt time.Time }
type RunResults struct { TotalScenarios, Passed, Failed, Errors int; Metrics RunMetrics; Scenarios []ScenarioResult }
type ScenarioResult struct { ScenarioID, SandboxID, Status string; CompositeScore float64; Invariants []InvariantResult; ... }
type AgentSnapshot struct { ID, Name string; Version int; Tag, Digest, StoragePath, Runtime string; Entrypoint []string; Auth *AgentAuth }
type TraceEvent struct { Timestamp time.Time; EventType, ToolName, Phase, Status string; DurationMs int64; SpanID, ParentSpanID string; Cost *CostInfo }
Context cancellation
Every method takes context.Context and respects cancellation. Cancel a long-running RunAndWait by canceling its context:
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
results, err := client.Experiments.RunAndWait(ctx, exp.ID, keystone.RunAndWaitOpts{})
if errors.Is(err, context.DeadlineExceeded) {
log.Println("experiment took too long")
}