Skip to content

ZeptoDB Kubernetes Operations Guide

Last updated: 2026-04-30


  1. Architecture Overview
  2. Initial Deployment
  3. Day-2 Operations
  4. Monitoring & Alerting
  5. Scaling
  6. Backup & Recovery
  7. Upgrades & Rollback
  8. Security
  9. Cluster Mode
  10. Troubleshooting
  11. Runbooks

See also: Failure Scenarios & Recovery Guide — Automatic/manual recovery procedures for 8 failure scenarios


┌─────────────────────────────────────────────────────────────┐
│ Kubernetes Cluster │
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Namespace: zeptodb │ │
│ │ │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │ Pod-0 │ │ Pod-1 │ │ Pod-2 │ ← Deployment│ │
│ │ │ ZeptoDB │ │ ZeptoDB │ │ ZeptoDB │ (3 replicas)│ │
│ │ │ :8123 │ │ :8123 │ │ :8123 │ │ │
│ │ └────┬────┘ └────┬────┘ └────┬────┘ │ │
│ │ │ │ │ │ │
│ │ ┌────┴─────────────┴────────────┴────┐ │ │
│ │ │ Service (LoadBalancer :8123) │ │ │
│ │ │ + Headless Service (pod discovery) │ │ │
│ │ └────────────────────────────────────┘ │ │
│ │ │ │
│ │ ConfigMap │ PVC (gp3 500Gi) │ PDB │ HPA │ ServiceMon│ │
│ └──────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────┐ ┌─────────────────────────────┐ │
│ │ Prometheus │ │ Grafana │ │
│ │ ServiceMonitor 15s │ │ Dashboard + 9 Alert Rules │ │
│ └──────────────────────┘ └─────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
| Resource | Template | Purpose |
|---|---|---|
| Deployment | deployment.yaml | ZeptoDB pods (rolling update) |
| Service | service.yaml | LoadBalancer + Headless |
| ConfigMap | configmap.yaml | zeptodb.conf |
| PVC | pvc.yaml | gp3 500Gi persistent storage |
| HPA | hpa.yaml | Auto-scaling (3–10 replicas) |
| PDB | pdb.yaml | minAvailable: 2 |
| ServiceMonitor | servicemonitor.yaml | Prometheus scrape config |

Terminal window
# Required
kubectl version --client # 1.26+
helm version # 3.x
# Verify cluster access
kubectl cluster-info
kubectl get nodes
Terminal window
# Create namespace
kubectl create namespace zeptodb
# Install
helm install zeptodb ./deploy/helm/zeptodb \
-n zeptodb \
--set image.repository=your-registry/zeptodb \
--set image.tag=1.0.0
# Verify
kubectl get all -n zeptodb

values-prod.yaml:

replicaCount: 3
image:
  repository: your-registry/zeptodb
  tag: "1.0.0"
resources:
  requests:
    cpu: "4"
    memory: "16Gi"
  limits:
    cpu: "8"
    memory: "32Gi"
persistence:
  storageClass: gp3
  size: 500Gi
config:
  workerThreads: 8
  parallelThreshold: 100000
autoscaling:
  enabled: true
  minReplicas: 3
  maxReplicas: 10
podDisruptionBudget:
  enabled: true
  minAvailable: 2
# Graviton (ARM) nodes
nodeSelector:
  kubernetes.io/arch: arm64
  # or for x86:
  # kubernetes.io/arch: amd64
Terminal window
helm install zeptodb ./deploy/helm/zeptodb \
-n zeptodb \
-f values-prod.yaml \
--wait --timeout 5m
Terminal window
# All pods running
kubectl get pods -n zeptodb -o wide
# Health check
export LB=$(kubectl get svc zeptodb -n zeptodb \
-o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
curl -s http://$LB:8123/health
curl -s http://$LB:8123/ready
# Test query
curl -X POST http://$LB:8123/ -d 'SELECT 1'

#!/bin/bash
# daily-check.sh — run from cron or manually
NS=zeptodb
LB=$(kubectl get svc zeptodb -n $NS \
-o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
echo "=== Pod Status ==="
kubectl get pods -n $NS -o wide
echo "=== Health ==="
curl -sf http://$LB:8123/health && echo " OK" || echo " FAIL"
echo "=== Readiness ==="
curl -sf http://$LB:8123/ready && echo " OK" || echo " FAIL"
echo "=== HPA ==="
kubectl get hpa -n $NS
echo "=== PVC ==="
kubectl get pvc -n $NS
echo "=== Recent Events ==="
kubectl get events -n $NS --sort-by='.lastTimestamp' | tail -10

When a ConfigMap is changed, the checksum/config annotation automatically triggers a rollout.

Terminal window
# Change worker threads
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set config.workerThreads=16 \
--wait
# Change multiple settings at once
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
-f values-prod.yaml \
--set config.workerThreads=16 \
--set config.queryCacheSize=2000 \
--wait
Terminal window
# Logs for a specific pod
kubectl logs -f <pod-name> -n zeptodb
# Logs for all pods (stern recommended)
stern zeptodb -n zeptodb
# Previous crash logs
kubectl logs <pod-name> -n zeptodb --previous
# Logs since a specific time
kubectl logs <pod-name> -n zeptodb --since=1h
Terminal window
# Full rolling restart (zero-downtime)
kubectl rollout restart deployment/zeptodb -n zeptodb
# Delete a specific pod only (Deployment auto-recreates it)
kubectl delete pod <pod-name> -n zeptodb

Terminal window
# Enable ServiceMonitor (requires Prometheus Operator)
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set serviceMonitor.enabled=true \
--set serviceMonitor.interval=15s

In environments without ServiceMonitor, use Pod annotation-based scraping:

# Already included in deployment.yaml
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8123"
prometheus.io/path: "/metrics"
Terminal window
# Check directly
curl -s http://$LB:8123/metrics
| Metric | Type | Alert Threshold |
|---|---|---|
| zepto_server_up | gauge | == 0 → critical |
| zepto_server_ready | gauge | == 0 for 5m → warning |
| zepto_ticks_ingested_total | counter | rate < 1000/s → warning |
| zepto_ticks_dropped_total | counter | rate > 1000/s → warning |
| zepto_queries_executed_total | counter | rate > 100/s → info |
| zepto_rows_scanned_total | counter | rate > 10M/s → warning |

Defined in monitoring/zeptodb-alerts.yml:

| Alert | Severity | Condition |
|---|---|---|
| ApexDBDown | critical | zepto_server_up == 0 for 1m |
| ApexDBNotReady | warning | zepto_server_ready == 0 for 5m |
| HighTickDropRate | warning | drop rate > 1000/s for 2m |
| HighQueryRate | info | query rate > 100/s for 5m |
| HighRowScanRate | warning | scan rate > 10M/s for 5m |
| LowIngestionRate | warning | ingestion < 1000/s for 10m |
| HighDiskUsage | warning | disk < 20% free for 5m |
| HighMemoryUsage | warning | memory < 10% free for 5m |
| HighCPUUsage | warning | CPU > 90% for 10m |
Terminal window
# Import dashboard
kubectl create configmap grafana-zeptodb \
-n monitoring \
--from-file=monitoring/grafana-dashboard.json
# Or import via Grafana UI → Import → monitoring/grafana-dashboard.json

Grafana can also query ZeptoDB directly by adding it as a ClickHouse data source (port 8123, ClickHouse-compatible API).


See EKS Cluster Requirements for full cluster setup including K8s version, Auto Mode, and custom NodePool configuration.

EKS Auto Mode includes built-in Karpenter — no separate install needed. Nodes are provisioned via EC2 Fleet API when pods are pending.

Terminal window
# Check node pools (built-in + custom)
kubectl get nodepools
kubectl get nodeclasses
# Check node claims (active nodes)
kubectl get nodeclaims
# Monitor scaling events
kubectl describe nodepool zepto-realtime
kubectl describe nodepool zepto-analytics

Two custom node pools are configured:

| Pool | Trigger | Capacity | Consolidation |
|---|---|---|---|
| zepto-realtime | Pending pods with zeptodb.com/role: realtime | On-Demand only | WhenEmpty, after 30m |
| zepto-analytics | Pending pods with zeptodb.com/role: analytics | Spot + On-Demand | WhenEmptyOrUnderutilized, after 5m |

Scaling flow: HPA increases replicas → pods pending → Auto Mode provisions node (30-60s) → pods scheduled.

Default configuration: Auto-scales between 3–10 replicas based on CPU 70% / Memory 80% thresholds.

Terminal window
# Check HPA status
kubectl get hpa -n zeptodb
kubectl describe hpa zeptodb -n zeptodb
# Manual scale
kubectl scale deployment zeptodb -n zeptodb --replicas=5
# Change HPA settings
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set autoscaling.minReplicas=5 \
--set autoscaling.maxReplicas=20 \
--set autoscaling.targetCPU=60

For production ingest workloads, CPU/memory utilization is a poor proxy: a pod can be CPU-idle while its ring buffer saturates, or CPU-busy on queries while ingest is light. ZeptoDB exposes zepto_ingest_ticks_per_sec on GET /metrics (a per-pod gauge of the instantaneous ingest rate) so the HPA can autoscale on real ingest load. CPU/memory metrics remain configured on the same HPA as a safety net.

Prerequisites. The custom Pods metric requires prometheus-adapter to expose zepto_ingest_ticks_per_sec as pods/zepto_ingest_ticks_per_sec. A minimal rule snippet:

# prometheus-adapter ConfigMap
rules:
  - seriesQuery: 'zepto_ingest_ticks_per_sec{namespace!="",pod!=""}'
    resources:
      overrides:
        namespace: { resource: namespace }
        pod: { resource: pod }
    name:
      matches: "^(.*)$"
      as: "$1"
    metricsQuery: |
      avg_over_time(<<.Series>>{<<.LabelMatchers>>}[1m])

Enable on the chart. Off by default; set both flags on helm upgrade:

Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set autoscaling.ingestRateEnabled=true \
--set autoscaling.targetIngestRate=50000 # ticks/sec per pod

targetIngestRate is AverageValue for the HPA Pods metric — scale-out is triggered when the per-pod 1-minute average ingest rate exceeds it. Tune to ~70–80% of a single pod’s measured sustained ingest ceiling (see docs/devlog/102_ingest_scale_phase1.md for the underlying ingest-path tunables and pipeline.drainThreads / pipeline.ringBufferCapacity).

Karpenter compatibility. No special config required — the standard HPA → pending pods → Karpenter scale-out path works unchanged. When the ingest-rate metric pushes HPA above currently-scheduled capacity, Karpenter provisions a new node from the realtime pool in the usual 30–60s.

# Already configured in values.yaml
autoscaling:
scaleDown:
stabilizationSeconds: 300 # Scale down after 5-minute stabilization
scaleUp:
stabilizationSeconds: 60 # Scale up after 1-minute stabilization
Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set resources.requests.cpu=8 \
--set resources.requests.memory=32Gi \
--set resources.limits.cpu=16 \
--set resources.limits.memory=64Gi \
--wait
Terminal window
# Graviton (ARM) nodes
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set nodeSelector."kubernetes\.io/arch"=arm64
# Dedicated instance type
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set nodeSelector."node\.kubernetes\.io/instance-type"=c7g.4xlarge

Phase 1 of the ingest scale-out plan exposes two single-pod ingest knobs via Helm. Both default to 0 (engine default) so existing charts are unchanged.

| Helm value | Engine default (when 0) | Purpose |
|---|---|---|
| pipeline.drainThreads | max(2, hw_concurrency() / 4) | Number of drain threads moving ticks from TickPlant → storage. Lock-free MPMC → scales near-linearly. |
| pipeline.ringBufferCapacity | 65536 slots | TickPlant ring-buffer size. Absorbs ingest bursts before the synchronous store_tick() fallback (~34× slower) kicks in. Must be a power of two in [4096, 16777216]. |

| Workload | Tags × rate | pipeline.drainThreads | pipeline.ringBufferCapacity |
|---|---|---|---|
| IoT pilot | 1 k × 1 Hz | 0 (auto) | 65536 (default) |
| Auto factory | 5 k × 100 Hz | 4 | 262144 |
| Semi fab (CMP burst) | 30 k × 10 kHz | 8 | 1048576 |
Terminal window
# Raise both for a CMP-burst semi-fab workload
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set pipeline.drainThreads=8 \
--set pipeline.ringBufferCapacity=1048576

Both effective values are emitted on ZeptoPipeline::start():

Terminal window
kubectl logs -n zeptodb zeptodb-0 | grep "drain_threads="
# [info] ZeptoPipeline 시작 완료 (drain_threads=8, ring_capacity=1048576)

Also exposed as pod env vars for K8s-side inspection: ZEPTO_DRAIN_THREADS, ZEPTO_RING_BUFFER_CAPACITY.

When the log message "TickPlant queue full! Dropping tick seq=…" appears, raise pipeline.ringBufferCapacity first (keeping it a power of two), then pipeline.drainThreads. A non-power-of-two or out-of-range capacity causes the pod to fail fast at startup with a clear std::invalid_argument in the crash log.

This is a single-pod vertical scaling knob. Horizontal scale-out — a stateless zepto_ingest_node tier plus an ingest-rate HPA — is Phase 2 and is tracked in docs/BACKLOG.md under P8 — Cluster.


Sizing and placement for enterprise factory workloads

Section titled “Sizing and placement for enterprise factory workloads”

Horizontal scale-out only delivers linear ingest gain when each replica lands on a distinct node. The Helm chart defaults enforce this via hard podAntiAffinity plus topologySpread; see docs/devlog/104_pod_placement_hardening.md for the root-cause analysis. Use this table as a starting point and right-size per sector.

| Sector | Replicas | Nodes | resources.requests (cpu / memory) | podAntiAffinity.required |
|---|---|---|---|---|
| Dev / sandbox | 2 | 1 is OK | 1c / 2 Gi | false |
| Small IoT pilot | 3 | 3 | 2c / 4 Gi (default) | true |
| Auto factory | 5 | 5 | 4c / 8 Gi | true |
| Semi fab (CMP / lithography) | 10 | 10 | 8c / 16 Gi | true |

Why required: true is the production default

Section titled “Why required: true is the production default”

A soft preferredDuringSchedulingIgnoredDuringExecution lets Kubernetes co-locate two pods on the same node as soon as HPA scales replicas > nodes. When that happens the two ZeptoDB processes fight for the same CPU, halving ingest throughput, and a single node failure takes down both replicas at once — breaking the scale-out guarantee silently.

Flip to required: false only on dev clusters where a tight fixed node count makes co-location acceptable (e.g. a 3-replica chart on a 1-node kind cluster). Everywhere else, leave it on and rely on EKS Auto Mode / Karpenter to provision the Nth node (typically 30–60 s per §5 above) when a hard antiAffinity leaves a pod Pending.

required: true alone refuses to schedule extras beyond the current node count. topologySpreadConstraints with maxSkew: 1 is the smarter complement when replicas > nodes is a legitimate transient state (brief HPA spike ahead of node provision, planned drain): it spreads pods as evenly as possible across hostnames and still allows more replicas than nodes. Set topologySpread.whenUnsatisfiable: ScheduleAnyway if you want the spread hint without the scheduling block.

  • CPU request = 1 core for HTTP/RPC + pipeline.drainThreads cores for ingest draining. If drainThreads is 0 (auto), the engine picks max(2, hw_concurrency / 4), so plan for 1 + 2 = 3 cores minimum on any node smaller than 16 vCPU. The 2c/4c default covers a 2-core drain pool plus HTTP/RPC, sized for ~200K–500K ticks/s.
  • Memory request = ~100 MB baseline + 32 MB per active arena + ringBufferCapacity × 64 bytes for the TickPlant ring. Example: 200 active partitions + 1 M-slot ring ≈ 100 MB + 6.4 GB + 64 MB ≈ 6.6 GB — round up to 8 GB limit for headroom. Raise limits.memory before raising pipeline.ringBufferCapacity.
  • Bare-metal trading (Guaranteed QoS) — pin requests.cpu == limits.cpu and requests.memory == limits.memory in an overlay. Keep hugepages-2Mi on both sides to retain HugePages reservation.
Terminal window
# Auto-factory profile (5 replicas × 5 nodes × 4c/8G)
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set replicaCount=5 \
--set autoscaling.minReplicas=5 \
--set resources.requests.cpu=4000m \
--set resources.requests.memory=8Gi \
--set resources.limits.cpu=8000m \
--set resources.limits.memory=16Gi
# Dev overlay (2 replicas on 1 node, co-location OK)
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set replicaCount=2 \
--set podAntiAffinity.required=false \
--set resources.requests.cpu=1000m \
--set resources.requests.memory=2Gi

deploy/k8s/backup-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: zeptodb-backup
  namespace: zeptodb
spec:
  schedule: "0 2 * * *" # Daily at 02:00 UTC
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: backup
              image: amazon/aws-cli:latest
              env:
                - name: S3_BUCKET
                  value: "your-zeptodb-backups"
                - name: DATA_DIR
                  value: "/opt/zeptodb/data"
              command:
                - /bin/sh
                - -c
                - |
                  # Abort on the first failing step so a failed tar/upload
                  # is never reported as a completed backup.
                  set -e
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  tar -czf "/tmp/zeptodb-${TIMESTAMP}.tar.gz" -C "${DATA_DIR}" .
                  aws s3 cp "/tmp/zeptodb-${TIMESTAMP}.tar.gz" \
                    "s3://${S3_BUCKET}/backups/zeptodb-${TIMESTAMP}.tar.gz" \
                    --storage-class STANDARD_IA
                  echo "Backup completed: zeptodb-${TIMESTAMP}.tar.gz"
              volumeMounts:
                - name: data
                  mountPath: /opt/zeptodb/data
                  readOnly: true
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: zeptodb-data
Terminal window
kubectl apply -f deploy/k8s/backup-cronjob.yaml
# Trigger manual backup
kubectl create job --from=cronjob/zeptodb-backup zeptodb-backup-manual -n zeptodb
# Check backup status
kubectl get jobs -n zeptodb
kubectl logs job/zeptodb-backup-manual -n zeptodb
Terminal window
# VolumeSnapshot (requires CSI driver)
cat <<EOF | kubectl apply -f -
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: zeptodb-snap-$(date +%Y%m%d)
namespace: zeptodb
spec:
volumeSnapshotClassName: ebs-csi-snapclass
source:
persistentVolumeClaimName: zeptodb-data
EOF
# Verify snapshot
kubectl get volumesnapshot -n zeptodb
Terminal window
# Create new PVC from snapshot
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: zeptodb-data-restored
namespace: zeptodb
spec:
accessModes: [ReadWriteOnce]
storageClassName: gp3
resources:
requests:
storage: 500Gi
dataSource:
name: zeptodb-snap-20260324
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
EOF
# Replace PVC in Deployment
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set persistence.existingClaim=zeptodb-data-restored \
--wait

For details: Rolling Upgrade Guide

Terminal window
# 1. Pre-flight
kubectl get pods -n zeptodb -o wide
curl -s http://$LB:8123/health
# 2. Upgrade
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set image.tag=1.1.0 \
--wait --timeout 5m
# 3. Monitor
kubectl rollout status deployment/zeptodb -n zeptodb
# 4. Verify
curl -s http://$LB:8123/health
curl -X POST http://$LB:8123/ -d 'SELECT 1'
| Setting | Value | Effect |
|---|---|---|
| maxSurge | 1 | Create 1 new pod first |
| maxUnavailable | 0 | Maintain existing pod count |
| PDB minAvailable | 2 | Guarantee minimum 2 pods |
| preStop sleep | 15s | Wait for in-flight queries to complete |
| readinessProbe | /ready | Only ready pods receive traffic |
Terminal window
# Immediate rollback
helm rollback zeptodb -n zeptodb
# Rollback to a specific revision
helm history zeptodb -n zeptodb
helm rollback zeptodb <REVISION> -n zeptodb
# kubectl rollback (without Helm)
kubectl rollout undo deployment/zeptodb -n zeptodb
Terminal window
# 1. Canary deployment (1 replica)
helm install zeptodb-canary ./deploy/helm/zeptodb -n zeptodb \
--set replicaCount=1 \
--set image.tag=2.0.0 \
--set service.type=ClusterIP \
--set autoscaling.enabled=false \
--set podDisruptionBudget.enabled=false
# 2. Canary testing
kubectl port-forward svc/zeptodb-canary 8124:8123 -n zeptodb
curl -X POST http://localhost:8124/ -d 'SELECT vwap(price, volume) FROM trades WHERE symbol = 1'
# 3a. Success → promote
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb --set image.tag=2.0.0 --wait
helm uninstall zeptodb-canary -n zeptodb
# 3b. Failure → remove
helm uninstall zeptodb-canary -n zeptodb

Terminal window
# Create TLS Secret
kubectl create secret tls zeptodb-tls \
-n zeptodb \
--cert=/path/to/cert.pem \
--key=/path/to/key.pem
# Ingress with TLS
cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: zeptodb
namespace: zeptodb
annotations:
nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
spec:
tls:
- hosts:
- zeptodb.example.com
secretName: zeptodb-tls
rules:
- host: zeptodb.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: zeptodb
port:
number: 8123
EOF
Terminal window
# API keys file
kubectl create secret generic zeptodb-auth \
-n zeptodb \
--from-file=keys.txt=/path/to/keys.txt
# JWT secret
kubectl create secret generic zeptodb-jwt \
-n zeptodb \
--from-literal=JWT_SECRET='your-jwt-secret'
# Vault integration (Secrets Store CSI)
# → SecretsProvider chain: Vault KV v2 → K8s file → env var
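# Allow access only from same namespace + monitoring
# Namespaces are matched on kubernetes.io/metadata.name, which Kubernetes sets
# automatically; a plain "name" label exists only if someone added it by hand.
# NOTE: only 8123/TCP is allowed here. If cluster mode is enabled, also allow
# the peer RPC port (8223/TCP) and the heartbeat port (9100/UDP) between pods.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: zeptodb-netpol
  namespace: zeptodb
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: zeptodb
  policyTypes: [Ingress]
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: zeptodb
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - port: 8123
          protocol: TCP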
# Role for operators
# RBAC rules need the full plural resource name; kubectl short names such as
# "hpa" are not resolved and would silently grant nothing.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: zeptodb-operator
  namespace: zeptodb
rules:
  - apiGroups: ["", "apps", "autoscaling"]
    resources: ["pods", "deployments", "services", "configmaps", "horizontalpodautoscalers"]
    verbs: ["get", "list", "watch", "update", "patch"]
  - apiGroups: [""]
    resources: ["pods/log", "pods/exec"]
    verbs: ["get", "create"]

For operating a ZeptoDB distributed cluster on Kubernetes.

Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set cluster.enabled=true \
--set cluster.rpcPortOffset=100 \
--set cluster.heartbeatPort=9100 \
--set headless.enabled=true

Direct pod-to-pod communication via Headless Service:

  • RPC: <pod-name>.zeptodb-headless.zeptodb.svc:8223
  • Heartbeat: UDP :9100

When cluster mode is enabled, every pod automatically wires a CoordinatorRoutingAdapter so that HTTP/SQL INSERT statements and Python Pipeline.ingest_* calls are routed to the partition owner via the PartitionRouter consistent-hash ring. Without this wire-up (the state before devlog 111), writes would land on whichever pod the Service LoadBalancer happened to pick, silently mis-partitioning data.

Verify the routing is live by checking any pod’s startup log:

Terminal window
kubectl logs -n zeptodb zeptodb-0 | grep -E 'Cluster routing|Peer RPC'
# Expected output:
# Peer RPC server: port 8223
# Cluster routing: enabled (N remote nodes)

Feed consumers (KafkaConsumer, MqttConsumer, OpcUaConsumer) route through their own set_routing() hook, bypassing the HTTP LB entirely — use them as the primary ingest path for production multi-pod deployments.

DDL replication (devlog 112). CREATE / DROP / ALTER TABLE sent to any pod is fire-and-forget replicated to every remote pod via QueryCoordinator::forward_ddl_to_remotes. Per-remote failures emit ZEPTO_WARN but never fail the client request, so operators should still pre-provision critical tables at deploy time if a pod might be unreachable at DDL time.

For workloads where ingest load scales independently of query/storage load, deploy a dedicated stateless ingest tier (P8-I3, devlog 113). Each ingest pod runs the zepto_ingest_node binary, holds zero data, and forwards every HTTP INSERT to the correct storage pod via CoordinatorRoutingAdapter (same routing path as devlog 111).

Topology:

clients ──► ingest Service (ClusterIP) ──► N × zepto_ingest_node pods
                                                │  (owns no data,
                                                │   node_id=99999)
                                                ▼  TCP RPC fan-out
                                           storage StatefulSet (zeptodb-N)
                                             — owns partitions, runs queries

Enable via Helm (opt-in):

Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set ingest.enabled=true \
--set ingest.replicas=3 \
--set-string 'ingest.extraArgs={--add-node,0:zeptodb-0.zeptodb-headless:8123,--add-node,1:zeptodb-1.zeptodb-headless:8123,--add-node,2:zeptodb-2.zeptodb-headless:8123}'

Notes:

  • Storage-pod discovery is currently manual via ingest.extraArgs. A future init container will generate --add-node flags from the headless service automatically.
  • ingest.noAuth: true is the default — put the ingest Service behind an auth-enforcing ingress if you expose it outside the cluster.
  • The ingest tier can be scaled with its own HPA independently of the storage StatefulSet. Ingest-rate HPA on zepto_pipeline_ticks_per_sec is tracked as BACKLOG P8-I4.
  • DDL (CREATE / DROP / ALTER TABLE) sent to an ingest pod replicates to every storage pod automatically via devlog 112’s forward_ddl_to_remotes.
Terminal window
# Check cluster status for each pod
for pod in $(kubectl get pods -n zeptodb -l app.kubernetes.io/name=zeptodb -o name); do
echo "--- $pod ---"
kubectl exec -n zeptodb $pod -- curl -s http://localhost:8123/health
echo
done
  • CoordinatorHA handles automatic re-registration
  • FencingToken prevents split-brain
  • Increase gracefulShutdown time during upgrades to ensure WAL flush
Terminal window
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set image.tag=1.1.0 \
--set gracefulShutdown.preStopSleepSeconds=30 \
--set gracefulShutdown.terminationGracePeriodSeconds=60 \
--wait --timeout 10m

Terminal window
# Check status
kubectl describe pod <pod> -n zeptodb
# Common causes:
# - ImagePullBackOff → Check image path/authentication
# - Pending → Insufficient resources (kubectl describe node)
# - CrashLoopBackOff → Check logs (kubectl logs --previous)
Terminal window
kubectl logs <pod> -n zeptodb | grep -i "error\|fail\|ready"
# Check directly from inside the pod
kubectl exec -n zeptodb <pod> -- curl -s http://localhost:8123/ready
Terminal window
kubectl describe pvc zeptodb-data -n zeptodb
# Check StorageClass
kubectl get sc
# If gp3 StorageClass does not exist, it needs to be created
Terminal window
# Check memory usage
kubectl top pods -n zeptodb
# Increase limits
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb \
--set resources.limits.memory=64Gi \
--wait
Terminal window
# Check query plan with EXPLAIN
curl -X POST http://$LB:8123/ -d 'EXPLAIN SELECT ...'
# Check running queries via Admin API
curl -H "Authorization: Bearer $ADMIN_KEY" http://$LB:8123/admin/queries
# Kill slow query
curl -X DELETE -H "Authorization: Bearer $ADMIN_KEY" \
http://$LB:8123/admin/queries/<query-id>
Terminal window
kubectl describe hpa zeptodb -n zeptodb
# Check metrics-server
kubectl top pods -n zeptodb
# "error: Metrics API not available" → metrics-server needs to be installed

Terminal window
# 1. Record current state
kubectl get pods -n zeptodb -o wide > /tmp/zeptodb-state.txt
# 2. Rolling restart (zero-downtime)
kubectl rollout restart deployment/zeptodb -n zeptodb
kubectl rollout status deployment/zeptodb -n zeptodb --timeout=5m
# 3. Verify
curl -s http://$LB:8123/health
curl -X POST http://$LB:8123/ -d 'SELECT 1'
Terminal window
# 1. Check
kubectl exec -n zeptodb <pod> -- df -h /opt/zeptodb/data
# 2. Clean up old HDB data (TTL setting)
curl -X POST http://$LB:8123/ \
-d "ALTER TABLE trades SET TTL 90 DAYS"
# 3. Expand PVC (if StorageClass has allowVolumeExpansion: true)
kubectl patch pvc zeptodb-data -n zeptodb \
-p '{"spec":{"resources":{"requests":{"storage":"1Ti"}}}}'
Terminal window
# PDB guarantees minAvailable: 2, so drain is safe
kubectl drain <node> --ignore-daemonsets --delete-emptydir-data
# After maintenance is complete
kubectl uncordon <node>
Terminal window
# 1. Backup
kubectl create job --from=cronjob/zeptodb-backup zeptodb-pre-redeploy -n zeptodb
kubectl wait --for=condition=complete job/zeptodb-pre-redeploy -n zeptodb --timeout=10m
# 2. Delete
helm uninstall zeptodb -n zeptodb
# PVC is preserved (not deleted by helm uninstall)
# 3. Redeploy
helm install zeptodb ./deploy/helm/zeptodb -n zeptodb -f values-prod.yaml --wait
# 4. Verify
curl -s http://$LB:8123/health

Terminal window
# === Status ===
kubectl get all -n zeptodb
kubectl get hpa -n zeptodb
kubectl get pvc -n zeptodb
kubectl get events -n zeptodb --sort-by='.lastTimestamp' | tail -20
# === Logs ===
kubectl logs -f deployment/zeptodb -n zeptodb
kubectl logs <pod> -n zeptodb --previous
# === Health ===
curl http://$LB:8123/health
curl http://$LB:8123/ready
curl http://$LB:8123/metrics
# === Helm ===
helm list -n zeptodb
helm history zeptodb -n zeptodb
helm get values zeptodb -n zeptodb
# === Upgrade ===
helm upgrade zeptodb ./deploy/helm/zeptodb -n zeptodb --set image.tag=X.Y.Z --wait
helm rollback zeptodb -n zeptodb
# === Scale ===
kubectl scale deployment zeptodb -n zeptodb --replicas=5
kubectl top pods -n zeptodb