Kubernetes Operator for OSSA Agent Lifecycle Management
Objective
Build a Kubernetes Operator that enables declarative management of OSSA agents as Custom Resources, handling deployment, scaling, and lifecycle management.
Scope
Create a Kubernetes operator that:
- Defines an Agent CRD based on the OSSA 1.0 schema
- Watches for Agent resource changes
- Deploys agents as Pods/Deployments
- Manages agent dependencies and ordering
- Handles health checks and restarts
- Supports horizontal pod autoscaling
Technical Approach
Custom Resource Definition
# config/crd/ossa.ai_agents.yaml
#
# CustomResourceDefinition for OSSA agents. Each Agent object declares one
# agent (identity, role, runtime and capabilities) that the operator turns
# into a Deployment and a Service.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: agents.ossa.ai
spec:
  group: ossa.ai
  versions:
    - name: v1
      served: true
      storage: true
      # The reconciler writes status through the /status endpoint
      # (r.Status().Update), which only exists when this subresource is enabled.
      subresources:
        status: {}
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - ossaVersion
                - agent
              properties:
                ossaVersion:
                  type: string
                  pattern: '^1\.0$'
                agent:
                  type: object
                  required:
                    - id
                    - name
                    - version
                    - role
                    - runtime
                    - capabilities
                  properties:
                    id:
                      type: string
                      pattern: '^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
                      # The id is reused as a label value and as the container
                      # name, both of which are limited to 63 characters.
                      maxLength: 63
                    name:
                      type: string
                    version:
                      type: string
                    role:
                      type: string
                      enum:
                        - compliance
                        - chat
                        - orchestration
                        - audit
                        - workflow
                        - monitoring
                    runtime:
                      type: object
                      properties:
                        type:
                          type: string
                          enum: [docker, k8s, local, serverless]
                        image:
                          type: string
                        resources:
                          type: object
                          properties:
                            cpu:
                              type: string
                            memory:
                              type: string
                        # Desired pod count. Without a schema entry the field
                        # would be pruned and the operator would always fall
                        # back to its default of one replica.
                        replicas:
                          type: integer
                          format: int32
                          minimum: 0
                    capabilities:
                      type: array
                      items:
                        type: object
                        # Capability entries carry free-form content (name,
                        # description, input/output JSON schemas); keep it
                        # instead of letting the API server prune it.
                        x-kubernetes-preserve-unknown-fields: true
            # Fields written by the operator; they must be declared or they
            # are pruned on write.
            status:
              type: object
              properties:
                phase:
                  type: string
                replicas:
                  type: integer
                  format: int32
  scope: Namespaced
  names:
    plural: agents
    singular: agent
    kind: Agent
    shortNames:
      - agt
Operator Implementation
// controllers/agent_controller.go
package controllers
import (
	"context"
	"fmt"

	ossav1 "github.com/ossa-ai/operator/api/v1"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/intstr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)
// AgentReconciler reconciles Agent custom resources.
//
// It embeds a controller-runtime client, so the client's methods (Get,
// Status, ...) are callable directly on the reconciler, and it carries the
// runtime scheme alongside it.
type AgentReconciler struct {
	// Client is the embedded Kubernetes API client used for all reads and writes.
	client.Client

	// Scheme is the runtime scheme associated with this reconciler.
	Scheme *runtime.Scheme
}
// Reconcile brings the cluster in line with a single Agent resource.
//
// For the Agent named by req it builds the desired Deployment and Service,
// marks both as controlled by the Agent, applies them through createOrUpdate,
// and finally records the observed readiness in the Agent's status.
//
// The controller owner reference is what makes the Owns() watches registered
// in SetupWithManager route Deployment/Service events back to this Agent, and
// what lets Kubernetes garbage-collect the children when the Agent is deleted.
//
// It returns an empty Result and a nil error when the Agent no longer exists
// or when reconciliation succeeded. Every other failure is returned wrapped
// with context; it is not logged here because the error is handed back to the
// caller, which is the single place it should be reported.
func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Fetch the Agent. A missing object means it was deleted after the
	// request was queued, so there is nothing left to do.
	var agent ossav1.Agent
	if err := r.Get(ctx, req.NamespacedName, &agent); err != nil {
		if errors.IsNotFound(err) {
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, fmt.Errorf("fetching Agent %s: %w", req.NamespacedName, err)
	}

	// Create or update the Deployment.
	deployment := r.buildDeployment(&agent)
	if err := ctrl.SetControllerReference(&agent, deployment, r.Scheme); err != nil {
		return ctrl.Result{}, fmt.Errorf("setting owner reference on Deployment: %w", err)
	}
	if err := r.createOrUpdate(ctx, deployment); err != nil {
		return ctrl.Result{}, fmt.Errorf("creating or updating Deployment: %w", err)
	}

	// Create or update the Service.
	service := r.buildService(&agent)
	if err := ctrl.SetControllerReference(&agent, service, r.Scheme); err != nil {
		return ctrl.Result{}, fmt.Errorf("setting owner reference on Service: %w", err)
	}
	if err := r.createOrUpdate(ctx, service); err != nil {
		return ctrl.Result{}, fmt.Errorf("creating or updating Service: %w", err)
	}

	// Read the Deployment back so the status reflects what is actually
	// running rather than the object that was just built locally.
	var ready int32
	var observed appsv1.Deployment
	key := client.ObjectKey{Namespace: deployment.Namespace, Name: deployment.Name}
	if err := r.Get(ctx, key, &observed); err != nil {
		// A Deployment created a moment ago may not be readable yet; treat
		// that as "nothing ready" instead of failing the reconcile.
		if !errors.IsNotFound(err) {
			return ctrl.Result{}, fmt.Errorf("reading Deployment status: %w", err)
		}
	} else {
		ready = observed.Status.ReadyReplicas
	}

	desired := int32(1)
	if deployment.Spec.Replicas != nil {
		desired = *deployment.Spec.Replicas
	}

	// Update status. The Agent only counts as running once every requested
	// replica reports ready.
	// NOTE(review): "Pending" is a new phase value; confirm it is acceptable
	// for Status.Phase in api/v1/agent_types.go.
	agent.Status.Replicas = ready
	if ready >= desired {
		agent.Status.Phase = "Running"
	} else {
		agent.Status.Phase = "Pending"
	}
	if err := r.Status().Update(ctx, &agent); err != nil {
		return ctrl.Result{}, fmt.Errorf("updating Agent status: %w", err)
	}

	return ctrl.Result{}, nil
}
// buildDeployment renders the desired Deployment for agent. It only builds
// the object in memory; nothing is sent to the API server here.
//
// The Deployment takes the Agent's name and namespace and runs a single
// container, named after the agent id, from the runtime image. Replicas
// default to 1 when the runtime does not specify a count. The container gets
// the OSSA_AGENT_ID and OSSA_AGENT_VERSION environment variables and an HTTP
// liveness probe on /health, port 8080.
//
// Labels are split in two sets on purpose:
//   - the selector uses only the id-based labels, because a Deployment's
//     selector is immutable and must therefore not contain values such as the
//     version or role that are expected to change over the Agent's life;
//   - the Deployment and its pods additionally carry the role and version
//     labels, so anything selecting on those keeps working.
func (r *AgentReconciler) buildDeployment(agent *ossav1.Agent) *appsv1.Deployment {
	selector := map[string]string{
		"app":           agent.Spec.Agent.ID,
		"ossa.ai/agent": agent.Spec.Agent.ID,
	}

	// allLabels returns a fresh map each call so the Deployment metadata and
	// the pod template never share (and accidentally co-mutate) one instance.
	allLabels := func() map[string]string {
		labels := make(map[string]string, len(selector)+2)
		for k, v := range selector {
			labels[k] = v
		}
		labels["ossa.ai/role"] = agent.Spec.Agent.Role
		labels["ossa.ai/version"] = agent.Spec.Agent.Version
		return labels
	}

	replicas := int32(1)
	if agent.Spec.Agent.Runtime.Replicas != nil {
		replicas = *agent.Spec.Agent.Runtime.Replicas
	}

	// cpu and memory are optional in the CRD; only request what was set so an
	// empty string is never handed to parseQuantity.
	requests := corev1.ResourceList{}
	if cpu := agent.Spec.Agent.Runtime.Resources.CPU; cpu != "" {
		requests[corev1.ResourceCPU] = parseQuantity(cpu)
	}
	if memory := agent.Spec.Agent.Runtime.Resources.Memory; memory != "" {
		requests[corev1.ResourceMemory] = parseQuantity(memory)
	}

	container := corev1.Container{
		Name:  agent.Spec.Agent.ID,
		Image: agent.Spec.Agent.Runtime.Image,
		Resources: corev1.ResourceRequirements{
			Requests: requests,
		},
		Env: []corev1.EnvVar{
			{
				Name:  "OSSA_AGENT_ID",
				Value: agent.Spec.Agent.ID,
			},
			{
				Name:  "OSSA_AGENT_VERSION",
				Value: agent.Spec.Agent.Version,
			},
		},
		LivenessProbe: &corev1.Probe{
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/health",
					Port: intstr.FromInt(8080),
				},
			},
		},
	}

	return &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      agent.Name,
			Namespace: agent.Namespace,
			Labels:    allLabels(),
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: selector,
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: allLabels(),
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{container},
				},
			},
		},
	}
}
// SetupWithManager registers the reconciler with mgr as the controller for
// Agent resources, declaring Deployments and Services as owned types. It
// returns whatever error completing the controller builder reports.
func (r *AgentReconciler) SetupWithManager(mgr ctrl.Manager) error {
	// Primary resource: the Agent custom resource itself.
	agentController := ctrl.NewControllerManagedBy(mgr).For(&ossav1.Agent{})

	// Secondary resources owned by an Agent.
	agentController = agentController.Owns(&appsv1.Deployment{})
	agentController = agentController.Owns(&corev1.Service{})

	return agentController.Complete(r)
}
Usage Example
# config/samples/ossa_v1_agent.yaml
#
# Example Agent: a compliance scanner run as three replicas.
apiVersion: ossa.ai/v1
kind: Agent
metadata:
  name: compliance-scanner
  namespace: agents
spec:
  ossaVersion: "1.0"
  agent:
    id: compliance-scanner
    name: Compliance Scanner Agent
    version: "1.2.0"
    role: compliance
    runtime:
      type: k8s
      image: registry.example.com/compliance-scanner:1.2.0
      resources:
        cpu: 500m
        memory: 512Mi
      # replicas belongs to runtime (not resources): the reconciler reads it
      # from spec.agent.runtime.replicas.
      replicas: 3
    capabilities:
      - name: scan_codebase
        description: Scan codebase for compliance violations
        input_schema:
          type: object
          properties:
            repository:
              type: string
        output_schema:
          type: object
          properties:
            violations:
              type: array
Acceptance Criteria
- CRD matches OSSA 1.0 schema structure
- Operator reconciles Agent resources into Deployments
- Service creation for agent endpoints
- ConfigMap generation from capabilities
- Dependency management (wait for dependent agents)
- Status updates reflect actual agent state
- Support for HPA based on agent metrics
- Helm chart for operator installation
- RBAC configuration
- Comprehensive operator tests
Files to Create
- config/crd/ossa.ai_agents.yaml - CRD definition
- controllers/agent_controller.go - Reconciler
- api/v1/agent_types.go - Go types
- config/samples/ - Example manifests
- helm/ossa-operator/ - Helm chart
Similar Projects
- Prometheus Operator
- Cert Manager
- Argo CD ApplicationSet Controller