Skip to content

Health Monitoring

Tutorial

Health Monitoring

Monitor platform health, cluster status, and instance capacity

20 min · Intermediate
Health · Monitoring · ClusterOps

What You'll Learn

  • Liveness Probes — Basic health check with client.health.check()
  • Readiness Probes — Full readiness including database with client.health.ready()
  • Cluster Health — Per-component status with ops.ops.get_cluster_health()
  • Instance Capacity — Cluster-wide limits and usage with ops.ops.get_cluster_instances()
  • Prometheus Metrics — Raw metrics endpoint with ops.ops.get_metrics()
1

Setup

Connect as Dave (ops persona)

# Cell 1 — Parameters
# Replace the placeholder with your own email before running the notebook.
USERNAME = "_FILL_ME_IN_"

# Cell 2 — Connect
# Build the SDK client for the user configured above.
from graph_olap import GraphOLAPClient

client = GraphOLAPClient(username=USERNAME)

# Cell 3 — Provision
# provision() returns a pair: `personas`, which is indexed by role name
# below, and `conn`.
from notebook_setup import provision

personas, conn = provision(USERNAME)

# Pull out the three persona handles used throughout the tutorial.
analyst, admin, ops = (personas[role] for role in ("analyst", "admin", "ops"))

print(f"Connected to: {ops._config.api_url}")
2

Liveness Probes

Basic health check — is the service running?

# Liveness probe: an unauthenticated call that reports status and version.
# Kubernetes uses this endpoint for its liveness probe. A pass only shows
# that the process is up and accepting HTTP connections — nothing more.
liveness = client.health.check()

# One print call, one field per output line.
print(
    f"Status: {liveness.status}",
    f"Version: {liveness.version}",
    sep="\n",
)
3

Readiness Probes

Full readiness check including database connectivity

# Readiness probe: also unauthenticated, but additionally verifies database
# connectivity. Kubernetes uses it for its readiness probe, and a pod only
# receives traffic while this probe passes.
#
# Liveness vs Readiness:
# - Liveness: "Is the process alive?" → restart if failing
# - Readiness: "Can it serve requests?" → remove from LB if failing
readiness = client.health.ready()

# One print call, one field per output line.
print(
    f"Status: {readiness.status}",
    f"Version: {readiness.version}",
    f"Database: {readiness.database}",
    sep="\n",
)
4

Cluster Health

Per-component health with latency data

# Cluster health — requires ops role.
# Returns the overall status plus a per-component breakdown with latency.
health = ops.ops.get_cluster_health()

print(f"Overall status: {health.status}")
print(f"Checked at: {health.checked_at}")
print()

# Fixed-width table: 20 + 1 + 12 + 1 + 12 = 46 columns, matching the rule.
print(f"{'Component':<20} {'Status':<12} {'Latency (ms)':>12}")
print("-" * 46)
for name, comp in health.components.items():
    # A component may report no latency; show "N/A" instead of failing
    # on the float format spec.
    latency = f"{comp.latency_ms:.1f}" if comp.latency_ms is not None else "N/A"
    print(f"{name:<20} {comp.status:<12} {latency:>12}")

# Detect degraded or unhealthy components.
# In production, you would alert on non-healthy components.
unhealthy = [
    (name, comp)
    for name, comp in health.components.items()
    if comp.status != "healthy"
]

if unhealthy:
    print("WARNING: Non-healthy components detected:")
    for name, comp in unhealthy:
        # Same None handling as the table above, so a component without a
        # latency reading prints "N/A" rather than "Nonems".
        latency = f"{comp.latency_ms}ms" if comp.latency_ms is not None else "N/A"
        print(f" {name}: {comp.status} (latency: {latency})")
else:
    print("All components healthy")
5

Instance Capacity

Track instance counts, ownership, and cluster limits

# Cluster instance summary — requires ops role.
instances = ops.ops.get_cluster_instances()
print(f"Total instances: {instances.total}")
print()

# Breakdown by status (by_status is iterated as a status -> count mapping).
print("By status:")
for status, count in instances.by_status.items():
    print(f" {status:<15} {count}")
print()

# Breakdown by owner (list of OwnerInstanceCount objects).
print("By owner:")
for entry in instances.by_owner:
    print(f" {entry}")

# Cluster capacity limits.
limits = instances.limits
print("Capacity limits:")
print(f" Per analyst: {limits.per_analyst}")
print(f" Cluster total: {limits.cluster_total}")
print(f" Cluster used: {limits.cluster_used}")
print(f" Cluster available: {limits.cluster_available}")
print()

# Calculate utilisation percentage. Guard the division: a cluster total of
# zero (or a missing value) would otherwise raise instead of reporting.
if limits.cluster_total:
    utilisation = (limits.cluster_used / limits.cluster_total) * 100
    print(f"Cluster utilisation: {utilisation:.0f}%")
else:
    print("Cluster utilisation: N/A (cluster total is zero or unset)")
6

Prometheus Metrics

Raw metrics for scraping and ad-hoc inspection

# Fetch the raw text in Prometheus exposition format. Prometheus normally
# scrapes this endpoint, but it is also useful for ad-hoc inspection.
metrics_text = ops.ops.get_metrics()

# Split the trimmed payload into lines and report how many there are.
lines = metrics_text.strip().splitlines()
print(f"Total metric lines: {len(lines)}")
print()

# Preview: echo only the first 10 lines.
preview = lines[:10]
for metric_line in preview:
    print(metric_line)
7

Building a Health Dashboard

Combine health data into a single summary

import json

# Combine all health signals into a single dashboard summary.
# Each probe is called again here rather than reusing earlier results.
liveness = client.health.check()
readiness = client.health.ready()
health = ops.ops.get_cluster_health()
instances = ops.ops.get_cluster_instances()

# Platform section: liveness/readiness probe results.
platform_summary = {
    "liveness": liveness.status,
    "readiness": readiness.status,
    "database": readiness.database,
    "version": liveness.version,
}

# Cluster section: overall status plus one entry per component.
# checked_at is passed through str() before being handed to json.dumps.
cluster_summary = {
    "status": health.status,
    "checked_at": str(health.checked_at),
    "components": {
        name: {"status": c.status, "latency_ms": c.latency_ms}
        for name, c in health.components.items()
    },
}

# Capacity section: instance counts against the cluster limits.
# "running" falls back to 0 when that status is absent from by_status.
capacity_summary = {
    "total": instances.total,
    "running": instances.by_status.get("running", 0),
    "available": instances.limits.cluster_available,
    "limit": instances.limits.cluster_total,
}

dashboard = {
    "platform": platform_summary,
    "cluster": cluster_summary,
    "capacity": capacity_summary,
}

# Pretty-print the dashboard.
print(json.dumps(dashboard, indent=2))

Key Takeaways

  • client.health.check() is the liveness probe — unauthenticated, returns status and version
  • client.health.ready() is the readiness probe — also checks database connectivity
  • ops.ops.get_cluster_health() returns per-component status with latency — requires ops role
  • ops.ops.get_cluster_instances() gives instance counts by status/owner plus capacity limits
  • ops.ops.get_metrics() returns Prometheus-format text for scraping or ad-hoc inspection
  • Combine all health signals into a single dashboard dict for unified monitoring