Platform Observability
Observability Stack
Full-stack observability built on Prometheus, Grafana, and OpenTelemetry, delivering real-time insight into 2.8M daily requests with <1s alerting latency.
125K
Metrics/Second
10%
Trace Sampling
<1s
Alert Latency
45+
Dashboard Views
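The summary above calls out OpenTelemetry and a 10% trace sample rate, but no tracing setup appears in this section. The sketch below shows one way the API service could wire up head-based sampling and OTLP export; the collector endpoint, service name, and module path are illustrative assumptions, not the production values.
# tracing/otel_setup.py (illustrative sketch)
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

def configure_tracing(sample_rate: float = 0.10) -> trace.Tracer:
    """Configure head-based sampling and OTLP export for the API service."""
    provider = TracerProvider(
        resource=Resource.create({"service.name": "justkalm-api"}),  # assumed service name
        # ParentBased keeps child spans consistent with the root sampling decision;
        # 0.10 matches the 10% trace sampling figure quoted above.
        sampler=ParentBased(TraceIdRatioBased(sample_rate)),
    )
    # Endpoint is a placeholder for an in-cluster OpenTelemetry Collector.
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="otel-collector:4317", insecure=True))
    )
    trace.set_tracer_provider(provider)
    return trace.get_tracer("justkalm.api")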
Prometheus Configuration
Highly available Prometheus setup with federation across replicas, Kubernetes service discovery, and long-term storage via Thanos remote write.
# prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'prod-us-east-1'
env: 'production'
rule_files:
- /etc/prometheus/rules/*.yml
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Kubernetes pods with prometheus annotations
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
metric_relabel_configs:
# Drop high-cardinality metrics
- source_labels: [__name__]
regex: 'go_gc_.*'
action: drop
# API servers
- job_name: 'justkalm-api'
static_configs:
- targets:
- api-1:8000
- api-2:8000
- api-3:8000
metrics_path: /metrics
scheme: http
# PostgreSQL exporter
- job_name: 'postgres'
static_configs:
- targets:
- postgres-exporter:9187
# Redis exporter
- job_name: 'redis'
static_configs:
- targets:
- redis-exporter:9121
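  # Federation (illustrative sketch): the description above mentions federation,
  # but no federate job appears in this config. A typical shape is sketched here;
  # the replica hostnames and match selector are placeholders, not production values.
  - job_name: 'federate'
    honor_labels: true
    metrics_path: /federate
    params:
      'match[]':
        - '{job=~"justkalm-.*"}'
    static_configs:
      - targets:
        - prometheus-replica-1:9090
        - prometheus-replica-2:9090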
remote_write:
- url: http://thanos-receive:19291/api/v1/receive
queue_config:
max_samples_per_send: 10000
batch_send_deadline: 5s
capacity: 50000
Application Metrics
# metrics/instrumentation.py
from prometheus_client import Counter, Histogram, Gauge
from functools import wraps
import time
# Request metrics
REQUEST_COUNT = Counter(
'justkalm_http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status']
)
REQUEST_LATENCY = Histogram(
'justkalm_http_request_duration_seconds',
'HTTP request latency',
['method', 'endpoint'],
buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0]
)
ACTIVE_REQUESTS = Gauge(
'justkalm_http_requests_active',
'Active HTTP requests',
['method', 'endpoint']
)
# Business metrics
VALUATIONS_TOTAL = Counter(
'justkalm_valuations_total',
'Total product valuations',
['brand', 'category', 'source']
)
VALUATION_CONFIDENCE = Histogram(
'justkalm_valuation_confidence',
'Valuation confidence distribution',
['brand'],
buckets=[0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.99]
)
ML_INFERENCE_LATENCY = Histogram(
'justkalm_ml_inference_seconds',
'ML model inference time',
['model', 'version'],
buckets=[.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]
)
# Cache metrics
CACHE_HITS = Counter(
'justkalm_cache_hits_total',
'Cache hit count',
['cache_type', 'operation']
)
CACHE_MISSES = Counter(
'justkalm_cache_misses_total',
'Cache miss count',
['cache_type', 'operation']
)
def track_request(method: str, endpoint: str):
"""Decorator to track request metrics."""
def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).inc()
start = time.time()
try:
result = await func(*args, **kwargs)
REQUEST_COUNT.labels(
method=method,
endpoint=endpoint,
status=getattr(result, 'status_code', 200)
).inc()
return result
except Exception as e:
REQUEST_COUNT.labels(
method=method,
endpoint=endpoint,
status=500
).inc()
raise
finally:
REQUEST_LATENCY.labels(
method=method,
endpoint=endpoint
).observe(time.time() - start)
ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).dec()
return wrapper
return decorator
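track_request is defined above but never shown in use. Below is a brief sketch, under the assumption that the API is a FastAPI app, of how the decorator might be applied to a handler and how the /metrics endpoint scraped by the justkalm-api job could be exposed; the route, handler body, and label values are illustrative.
# metrics/usage_example.py (illustrative sketch)
from fastapi import FastAPI
from prometheus_client import make_asgi_app

from metrics.instrumentation import VALUATIONS_TOTAL, track_request

app = FastAPI()

# Expose Prometheus metrics at /metrics, matching the justkalm-api scrape job above.
app.mount("/metrics", make_asgi_app())

@app.get("/api/v1/valuations/{item_id}")
@track_request(method="GET", endpoint="/api/v1/valuations/{item_id}")
async def get_valuation(item_id: str):
    # Hypothetical handler body and label values; real lookup logic lives elsewhere.
    VALUATIONS_TOTAL.labels(brand="unknown", category="unknown", source="api").inc()
    return {"item_id": item_id, "status": "ok"}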
Grafana Dashboard Configuration
// dashboards/api-overview.json
{
"dashboard": {
"title": "JustKalm API Overview",
"tags": ["api", "production", "slo"],
"refresh": "10s",
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 },
"targets": [{
"expr": "sum(rate(justkalm_http_requests_total[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}]
},
{
"title": "P99 Latency",
"type": "timeseries",
"gridPos": { "x": 8, "y": 0, "w": 8, "h": 8 },
"targets": [{
"expr": "histogram_quantile(0.99, sum(rate(justkalm_http_request_duration_seconds_bucket[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}}"
}],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1.0 }
]
}
}
}
},
{
"title": "Error Rate",
"type": "stat",
"gridPos": { "x": 16, "y": 0, "w": 8, "h": 4 },
"targets": [{
"expr": "sum(rate(justkalm_http_requests_total{status=~'5..'}[5m])) / sum(rate(justkalm_http_requests_total[5m])) * 100"
}],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 1.0 }
]
}
}
}
},
{
"title": "SLO: Availability",
"type": "gauge",
"gridPos": { "x": 16, "y": 4, "w": 8, "h": 4 },
"targets": [{
"expr": "(1 - sum(increase(justkalm_http_requests_total{status=~'5..'}[30d])) / sum(increase(justkalm_http_requests_total[30d]))) * 100"
}],
"fieldConfig": {
"defaults": {
"min": 99,
"max": 100,
"thresholds": {
"steps": [
{ "color": "red", "value": 99 },
{ "color": "yellow", "value": 99.5 },
{ "color": "green", "value": 99.9 }
]
}
}
}
}
]
}
}
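The section doesn't say how dashboard JSON like the above is loaded into Grafana. One option, sketched here under the assumption that a Grafana service-account token is available, is pushing it through Grafana's /api/dashboards/db endpoint; the URL, token source, and file path are placeholders.
# dashboards/push_dashboard.py (illustrative sketch)
import json
import os

import requests

GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://grafana:3000")  # placeholder
GRAFANA_TOKEN = os.environ["GRAFANA_TOKEN"]  # assumed service-account token

def push_dashboard(path: str) -> None:
    """Upload a dashboard JSON file via Grafana's dashboard HTTP API."""
    with open(path) as f:
        payload = json.load(f)  # expects the {"dashboard": {...}} wrapper used above
    payload["overwrite"] = True  # replace any existing dashboard with the same uid/title
    resp = requests.post(
        f"{GRAFANA_URL}/api/dashboards/db",
        json=payload,
        headers={"Authorization": f"Bearer {GRAFANA_TOKEN}"},
        timeout=10,
    )
    resp.raise_for_status()

if __name__ == "__main__":
    push_dashboard("dashboards/api-overview.json")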
Complete Observability Coverage
From metrics to traces to logs: full visibility into every request.
125K Metrics/sec
10% Trace Sampling
<1s Alert Latency