Skip to content

ZeptoDB Kubernetes Operations Guide

Last updated: 2026-03-24


  1. Architecture Overview
  2. Initial Deployment
  3. Day-2 Operations
  4. Monitoring & Alerting
  5. Scaling
  6. Backup & Recovery
  7. Upgrades & Rollback
  8. Security
  9. Cluster Mode
  10. Troubleshooting
  11. Runbooks

See also: Failure Scenarios & Recovery Guide — Automatic/manual recovery procedures for 8 failure scenarios


┌─────────────────────────────────────────────────────────────┐
│ Kubernetes Cluster │
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Namespace: zeptodb │ │
│ │ │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │ Pod-0 │ │ Pod-1 │ │ Pod-2 │ ← Deployment│ │
│ │ │ ZeptoDB │ │ ZeptoDB │ │ ZeptoDB │ (3 replicas)│ │
│ │ │ :8123 │ │ :8123 │ │ :8123 │ │ │
│ │ └────┬────┘ └────┬────┘ └────┬────┘ │ │
│ │ │ │ │ │ │
│ │ ┌────┴─────────────┴────────────┴────┐ │ │
│ │ │ Service (LoadBalancer :8123) │ │ │
│ │ │ + Headless Service (pod discovery) │ │ │
│ │ └────────────────────────────────────┘ │ │
│ │ │ │
│ │ ConfigMap │ PVC (gp3 500Gi) │ PDB │ HPA │ ServiceMon│ │
│ └──────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────┐ ┌─────────────────────────────┐ │
│ │ Prometheus │ │ Grafana │ │
│ │ ServiceMonitor 15s │ │ Dashboard + 9 Alert Rules │ │
│ └──────────────────────┘ └─────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
| Resource       | Template            | Purpose                       |
|----------------|---------------------|-------------------------------|
| Deployment     | deployment.yaml     | ZeptoDB pods (rolling update) |
| Service        | service.yaml        | LoadBalancer + Headless       |
| ConfigMap      | configmap.yaml      | zeptodb.conf                  |
| PVC            | pvc.yaml            | gp3 500Gi persistent storage  |
| HPA            | hpa.yaml            | Auto-scaling (3–10 replicas)  |
| PDB            | pdb.yaml            | minAvailable: 2               |
| ServiceMonitor | servicemonitor.yaml | Prometheus scrape config      |

Terminal window
# Required
kubectl version --client # 1.26+
helm version # 3.x
# Verify cluster access
kubectl cluster-info
kubectl get nodes
Terminal window
# Create namespace
kubectl create namespace zeptodb
# Install
helm install zeptodb ./deploy/helm/zeptodb \
-n zeptodb \
--set image.repository=your-registry/zeptodb \
--set image.tag=1.0.0
# Verify
kubectl get all -n zeptodb

values-prod.yaml:

# values-prod.yaml — production overrides for the ZeptoDB Helm chart.
replicaCount: 3

image:
  repository: your-registry/zeptodb
  tag: "1.0.0"

resources:
  requests:
    cpu: "4"
    memory: "16Gi"
  limits:
    cpu: "8"
    memory: "32Gi"

persistence:
  storageClass: gp3
  size: 500Gi

config:
  workerThreads: 8
  parallelThreshold: 100000

autoscaling:
  enabled: true
  minReplicas: 3
  maxReplicas: 10

podDisruptionBudget:
  enabled: true
  minAvailable: 2

# Graviton (ARM) nodes
nodeSelector:
  kubernetes.io/arch: arm64
  # or for x86:
  # kubernetes.io/arch: amd64
Terminal window
helm install zeptodb ./deploy/helm/zeptodb \
-n zeptodb \
-f values-prod.yaml \
--wait --timeout 5m
Terminal window
# All pods running
kubectl get pods -n zeptodb -o wide
# Health check
export LB=$(kubectl get svc zeptodb -n zeptodb \
-o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
curl -s http://$LB:8123/health
curl -s http://$LB:8123/ready
# Test query
curl -X POST http://$LB:8123/ -d 'SELECT 1'

#!/bin/bash
# daily-check.sh — run from cron or manually.
# Prints pod/HPA/PVC status, health-check results, and recent events
# for the ZeptoDB deployment. Requires kubectl context access to the cluster.
set -u  # fail fast on unset variables

NS=zeptodb

# LoadBalancer hostname of the zeptodb Service; empty until the LB is provisioned.
LB=$(kubectl get svc zeptodb -n "$NS" \
  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
if [ -z "$LB" ]; then
  echo "WARNING: LoadBalancer hostname not found — health checks will fail" >&2
fi

echo "=== Pod Status ==="
kubectl get pods -n "$NS" -o wide
echo "=== Health ==="
curl -sf "http://${LB}:8123/health" && echo " OK" || echo " FAIL"
echo "=== Readiness ==="
curl -sf "http://${LB}:8123/ready" && echo " OK" || echo " FAIL"
echo "=== HPA ==="
kubectl get hpa -n "$NS"
echo "=== PVC ==="
kubectl get pvc -n "$NS"
echo "=== Recent Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -10

When a ConfigMap is changed, the checksum/config annotation automatically triggers a rollout.

Terminal window
# Change worker threads
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set config.workerThreads=16 \
--wait
# Change multiple settings at once
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
-f values-prod.yaml \
--set config.workerThreads=16 \
--set config.queryCacheSize=2000 \
--wait
Terminal window
# Logs for a specific pod
kubectl logs -f <pod-name> -n zeptodb
# Logs for all pods (stern recommended)
stern zeptodb -n zeptodb
# Previous crash logs
kubectl logs <pod-name> -n zeptodb --previous
# Logs since a specific time
kubectl logs <pod-name> -n zeptodb --since=1h
Terminal window
# Full rolling restart (zero-downtime)
kubectl rollout restart deployment/zeptodb -n zeptodb
# Delete a specific pod only (Deployment auto-recreates it)
kubectl delete pod <pod-name> -n zeptodb

Terminal window
# Enable ServiceMonitor (requires Prometheus Operator)
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set serviceMonitor.enabled=true \
--set serviceMonitor.interval=15s

In environments without ServiceMonitor, use Pod annotation-based scraping:

# Already included in deployment.yaml — pod-level Prometheus scrape hints
# for clusters without the Prometheus Operator / ServiceMonitor CRD.
annotations:
  prometheus.io/scrape: "true"
  prometheus.io/port: "8123"
  prometheus.io/path: "/metrics"
Terminal window
# Check directly
curl -s http://$LB:8123/metrics
| Metric                       | Type    | Alert Threshold         |
|------------------------------|---------|-------------------------|
| zepto_server_up              | gauge   | == 0 → critical         |
| zepto_server_ready           | gauge   | == 0 for 5m → warning   |
| zepto_ticks_ingested_total   | counter | rate < 1000/s → warning |
| zepto_ticks_dropped_total    | counter | rate > 1000/s → warning |
| zepto_queries_executed_total | counter | rate > 100/s → info     |
| zepto_rows_scanned_total     | counter | rate > 10M/s → warning  |

Defined in monitoring/zeptodb-alerts.yml:

| Alert            | Severity | Condition                      |
|------------------|----------|--------------------------------|
| ApexDBDown       | critical | zepto_server_up == 0 for 1m    |
| ApexDBNotReady   | warning  | zepto_server_ready == 0 for 5m |
| HighTickDropRate | warning  | drop rate > 1000/s for 2m      |
| HighQueryRate    | info     | query rate > 100/s for 5m      |
| HighRowScanRate  | warning  | scan rate > 10M/s for 5m       |
| LowIngestionRate | warning  | ingestion < 1000/s for 10m     |
| HighDiskUsage    | warning  | disk < 20% free for 5m         |
| HighMemoryUsage  | warning  | memory < 10% free for 5m       |
| HighCPUUsage     | warning  | CPU > 90% for 10m              |

NOTE (review): the first two alert names use an "ApexDB" prefix while the rest of this guide says ZeptoDB — verify the actual names in monitoring/zeptodb-alerts.yml.
Terminal window
# Import dashboard
kubectl create configmap grafana-zeptodb \
-n monitoring \
--from-file=monitoring/grafana-dashboard.json
# Or import via Grafana UI → Import → monitoring/grafana-dashboard.json

Grafana can connect directly as a ClickHouse data source (port 8123, ClickHouse compatible API).


See EKS Cluster Requirements for full cluster setup including K8s version, Auto Mode, and custom NodePool configuration.

EKS Auto Mode includes built-in Karpenter — no separate install needed. Nodes are provisioned via EC2 Fleet API when pods are pending.

Terminal window
# Check node pools (built-in + custom)
kubectl get nodepools
kubectl get nodeclasses
# Check node claims (active nodes)
kubectl get nodeclaims
# Monitor scaling events
kubectl describe nodepool zepto-realtime
kubectl describe nodepool zepto-analytics

Two custom node pools are configured:

| Pool            | Trigger                                       | Capacity         | Consolidation                      |
|-----------------|-----------------------------------------------|------------------|------------------------------------|
| zepto-realtime  | Pending pods with zeptodb.com/role: realtime  | On-Demand only   | WhenEmpty, after 30m               |
| zepto-analytics | Pending pods with zeptodb.com/role: analytics | Spot + On-Demand | WhenEmptyOrUnderutilized, after 5m |

Scaling flow: HPA increases replicas → pods pending → Auto Mode provisions node (30-60s) → pods scheduled.

Default configuration: Auto-scales between 3–10 replicas based on CPU 70% / Memory 80% thresholds.

Terminal window
# Check HPA status
kubectl get hpa -n zeptodb
kubectl describe hpa zeptodb -n zeptodb
# Manual scale
kubectl scale deployment zeptodb -n zeptodb --replicas=5
# Change HPA settings
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set autoscaling.minReplicas=5 \
--set autoscaling.maxReplicas=20 \
--set autoscaling.targetCPU=60
# Already configured in values.yaml — HPA stabilization windows.
autoscaling:
  scaleDown:
    stabilizationSeconds: 300  # Scale down after 5-minute stabilization
  scaleUp:
    stabilizationSeconds: 60   # Scale up after 1-minute stabilization
Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set resources.requests.cpu=8 \
--set resources.requests.memory=32Gi \
--set resources.limits.cpu=16 \
--set resources.limits.memory=64Gi \
--wait
Terminal window
# Graviton (ARM) nodes
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set nodeSelector."kubernetes\.io/arch"=arm64
# Dedicated instance type
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set nodeSelector."node\.kubernetes\.io/instance-type"=c7g.4xlarge

deploy/k8s/backup-cronjob.yaml
# Nightly tar+gzip backup of the ZeptoDB data PVC to S3.
# The PVC is mounted read-only; aws-cli image provides both tar and the aws CLI.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: zeptodb-backup
  namespace: zeptodb
spec:
  schedule: "0 2 * * *"      # Daily at 02:00 UTC
  concurrencyPolicy: Forbid  # never overlap two backup runs
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: backup
              image: amazon/aws-cli:latest
              env:
                - name: S3_BUCKET
                  value: "your-zeptodb-backups"
                - name: DATA_DIR
                  value: "/opt/zeptodb/data"
              command:
                - /bin/sh
                - -c
                - |
                  # Abort on any failure so the Job is marked failed and
                  # "Backup completed" is never printed after a broken tar/upload.
                  set -eu
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  tar -czf "/tmp/zeptodb-${TIMESTAMP}.tar.gz" -C "${DATA_DIR}" .
                  aws s3 cp "/tmp/zeptodb-${TIMESTAMP}.tar.gz" \
                    "s3://${S3_BUCKET}/backups/zeptodb-${TIMESTAMP}.tar.gz" \
                    --storage-class STANDARD_IA
                  echo "Backup completed: zeptodb-${TIMESTAMP}.tar.gz"
              volumeMounts:
                - name: data
                  mountPath: /opt/zeptodb/data
                  readOnly: true
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: zeptodb-data
Terminal window
kubectl apply -f deploy/k8s/backup-cronjob.yaml
# Trigger manual backup
kubectl create job --from=cronjob/zeptodb-backup zeptodb-backup-manual -n zeptodb
# Check backup status
kubectl get jobs -n zeptodb
kubectl logs job/zeptodb-backup-manual -n zeptodb
Terminal window
# VolumeSnapshot (requires CSI driver)
cat <<EOF | kubectl apply -f -
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: zeptodb-snap-$(date +%Y%m%d)
  namespace: zeptodb
spec:
  volumeSnapshotClassName: ebs-csi-snapclass
  source:
    persistentVolumeClaimName: zeptodb-data
EOF
# Verify snapshot
kubectl get volumesnapshot -n zeptodb
Terminal window
# Create new PVC from snapshot
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: zeptodb-data-restored
  namespace: zeptodb
spec:
  accessModes: [ReadWriteOnce]
  storageClassName: gp3
  resources:
    requests:
      storage: 500Gi
  dataSource:
    name: zeptodb-snap-20260324
    kind: VolumeSnapshot
    apiGroup: snapshot.storage.k8s.io
EOF
# Replace PVC in Deployment
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
  --set persistence.existingClaim=zeptodb-data-restored \
  --wait

For details: Rolling Upgrade Guide

Terminal window
# 1. Pre-flight
kubectl get pods -n zeptodb -o wide
curl -s http://$LB:8123/health
# 2. Upgrade
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set image.tag=1.1.0 \
--wait --timeout 5m
# 3. Monitor
kubectl rollout status deployment/zeptodb -n zeptodb
# 4. Verify
curl -s http://$LB:8123/health
curl -X POST http://$LB:8123/ -d 'SELECT 1'
| Setting          | Value  | Effect                                 |
|------------------|--------|----------------------------------------|
| maxSurge         | 1      | Create 1 new pod first                 |
| maxUnavailable   | 0      | Maintain existing pod count            |
| PDB minAvailable | 2      | Guarantee minimum 2 pods               |
| preStop sleep    | 15s    | Wait for in-flight queries to complete |
| readinessProbe   | /ready | Only ready pods receive traffic        |
Terminal window
# Immediate rollback
helm rollback zeptodb -n zeptodb
# Rollback to a specific revision
helm history zeptodb -n zeptodb
helm rollback zeptodb <REVISION> -n zeptodb
# kubectl rollback (without Helm)
kubectl rollout undo deployment/zeptodb -n zeptodb
Terminal window
# 1. Canary deployment (1 replica)
helm install zeptodb-canary ./deploy/helm/zeptodb -n zeptodb \
--set replicaCount=1 \
--set image.tag=2.0.0 \
--set service.type=ClusterIP \
--set autoscaling.enabled=false \
--set podDisruptionBudget.enabled=false
# 2. Canary testing
kubectl port-forward svc/zeptodb-canary 8124:8123 -n zeptodb
curl -X POST http://localhost:8124/ -d 'SELECT vwap(price, volume) FROM trades WHERE symbol = 1'
# 3a. Success → promote
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb --set image.tag=2.0.0 --wait
helm uninstall zeptodb-canary -n zeptodb
# 3b. Failure → remove
helm uninstall zeptodb-canary -n zeptodb

Terminal window
# Create TLS Secret
kubectl create secret tls zeptodb-tls \
  -n zeptodb \
  --cert=/path/to/cert.pem \
  --key=/path/to/key.pem
# Ingress with TLS — TLS terminates at the ingress; backend speaks plain HTTP
cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: zeptodb
  namespace: zeptodb
  annotations:
    nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
spec:
  tls:
    - hosts:
        - zeptodb.example.com
      secretName: zeptodb-tls
  rules:
    - host: zeptodb.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: zeptodb
                port:
                  number: 8123
EOF
Terminal window
# API keys file
kubectl create secret generic zeptodb-auth \
-n zeptodb \
--from-file=keys.txt=/path/to/keys.txt
# JWT secret
kubectl create secret generic zeptodb-jwt \
-n zeptodb \
--from-literal=JWT_SECRET='your-jwt-secret'
# Vault integration (Secrets Store CSI)
# → SecretsProvider chain: Vault KV v2 → K8s file → env var
# Allow ingress only from the zeptodb namespace and the monitoring namespace.
# The two namespaceSelectors under a single "from" entry are OR'd together.
# NOTE(review): matching on a "name" namespace label assumes namespaces are
# labeled accordingly — verify, or use kubernetes.io/metadata.name instead.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: zeptodb-netpol
  namespace: zeptodb
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: zeptodb
  policyTypes: [Ingress]
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: zeptodb
        - namespaceSelector:
            matchLabels:
              name: monitoring
      ports:
        - port: 8123
          protocol: TCP
# Role for human operators: read/patch workload resources plus log/exec access.
# Fix: RBAC rules require full lowercase plural resource names — the kubectl
# short name "hpa" is not valid here; it must be "horizontalpodautoscalers".
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: zeptodb-operator
  namespace: zeptodb
rules:
  - apiGroups: ["", "apps", "autoscaling"]
    resources: ["pods", "deployments", "services", "configmaps", "horizontalpodautoscalers"]
    verbs: ["get", "list", "watch", "update", "patch"]
  - apiGroups: [""]
    resources: ["pods/log", "pods/exec"]
    verbs: ["get", "create"]

For operating a ZeptoDB distributed cluster on Kubernetes.

Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set cluster.enabled=true \
--set cluster.rpcPortOffset=100 \
--set cluster.heartbeatPort=9100 \
--set headless.enabled=true

Direct pod-to-pod communication via Headless Service:

  • RPC: <pod-name>.zeptodb-headless.zeptodb.svc:8223
  • Heartbeat: UDP :9100
Terminal window
# Check cluster status for each pod.
# Pod resource names contain no whitespace, so the unquoted $(…) word split
# over "-o name" output is safe; the expansion inside the loop is still quoted.
for pod in $(kubectl get pods -n zeptodb -l app.kubernetes.io/name=zeptodb -o name); do
  echo "--- $pod ---"
  kubectl exec -n zeptodb "$pod" -- curl -s http://localhost:8123/health
  echo
done
  • CoordinatorHA handles automatic re-registration
  • FencingToken prevents split-brain
  • Increase gracefulShutdown time during upgrades to ensure WAL flush
Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set image.tag=1.1.0 \
--set gracefulShutdown.preStopSleepSeconds=30 \
--set gracefulShutdown.terminationGracePeriodSeconds=60 \
--wait --timeout 10m

Terminal window
# Check status
kubectl describe pod <pod> -n zeptodb
# Common causes:
# - ImagePullBackOff → Check image path/authentication
# - Pending → Insufficient resources (kubectl describe node)
# - CrashLoopBackOff → Check logs (kubectl logs --previous)
Terminal window
kubectl logs <pod> -n zeptodb | grep -i "error\|fail\|ready"
# Check directly from inside the pod
kubectl exec -n zeptodb <pod> -- curl -s http://localhost:8123/ready
Terminal window
kubectl describe pvc zeptodb-data -n zeptodb
# Check StorageClass
kubectl get sc
# If gp3 StorageClass does not exist, it needs to be created
Terminal window
# Check memory usage
kubectl top pods -n zeptodb
# Increase limits
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set resources.limits.memory=64Gi \
--wait
Terminal window
# Check query plan with EXPLAIN
curl -X POST http://$LB:8123/ -d 'EXPLAIN SELECT ...'
# Check running queries via Admin API
curl -H "Authorization: Bearer $ADMIN_KEY" http://$LB:8123/admin/queries
# Kill slow query
curl -X DELETE -H "Authorization: Bearer $ADMIN_KEY" \
http://$LB:8123/admin/queries/<query-id>
Terminal window
kubectl describe hpa zeptodb -n zeptodb
# Check metrics-server
kubectl top pods -n zeptodb
# "error: Metrics API not available" → metrics-server needs to be installed

Terminal window
# 1. Record current state
kubectl get pods -n zeptodb -o wide > /tmp/zeptodb-state.txt
# 2. Rolling restart (zero-downtime)
kubectl rollout restart deployment/zeptodb -n zeptodb
kubectl rollout status deployment/zeptodb -n zeptodb --timeout=5m
# 3. Verify
curl -s http://$LB:8123/health
curl -X POST http://$LB:8123/ -d 'SELECT 1'
Terminal window
# 1. Check
kubectl exec -n zeptodb <pod> -- df -h /opt/zeptodb/data
# 2. Clean up old HDB data (TTL setting)
curl -X POST http://$LB:8123/ \
-d "ALTER TABLE trades SET TTL 90 DAYS"
# 3. Expand PVC (if StorageClass has allowVolumeExpansion: true)
kubectl patch pvc zeptodb-data -n zeptodb \
-p '{"spec":{"resources":{"requests":{"storage":"1Ti"}}}}'
Terminal window
# PDB guarantees minAvailable: 2, so drain is safe
kubectl drain <node> --ignore-daemonsets --delete-emptydir-data
# After maintenance is complete
kubectl uncordon <node>
Terminal window
# 1. Backup
kubectl create job --from=cronjob/zeptodb-backup zeptodb-pre-redeploy -n zeptodb
kubectl wait --for=condition=complete job/zeptodb-pre-redeploy -n zeptodb --timeout=10m
# 2. Delete
helm uninstall zeptodb -n zeptodb
# PVC is preserved (not deleted by helm uninstall)
# 3. Redeploy
helm install zeptodb ./deploy/helm/zeptodb -n zeptodb -f values-prod.yaml --wait
# 4. Verify
curl -s http://$LB:8123/health

Terminal window
# === Status ===
kubectl get all -n zeptodb
kubectl get hpa -n zeptodb
kubectl get pvc -n zeptodb
kubectl get events -n zeptodb --sort-by='.lastTimestamp' | tail -20
# === Logs ===
kubectl logs -f deployment/zeptodb -n zeptodb
kubectl logs <pod> -n zeptodb --previous
# === Health ===
curl http://$LB:8123/health
curl http://$LB:8123/ready
curl http://$LB:8123/metrics
# === Helm ===
helm list -n zeptodb
helm history zeptodb -n zeptodb
helm get values zeptodb -n zeptodb
# === Upgrade ===
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb --set image.tag=X.Y.Z --wait
helm rollback zeptodb -n zeptodb
# === Scale ===
kubectl scale deployment zeptodb -n zeptodb --replicas=5
kubectl top pods -n zeptodb