Platform Observability
Observability Stack
Full-stack observability built on Prometheus, Grafana, and OpenTelemetry, delivering real-time insight into 2.8M daily requests with <1s alerting latency.
125K
Metrics/Second
10%
Trace Sampling
<1s
Alert Latency
45+
Dashboard Views
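The summary above calls out OpenTelemetry and a 10% trace sample rate, but no tracing setup appears in this section. The sketch below shows one way the API service could wire up head-based sampling and OTLP export; the collector endpoint, service name, and module path are illustrative assumptions, not the production values.
# tracing/otel_setup.py (illustrative sketch)
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

def configure_tracing(sample_rate: float = 0.10) -> trace.Tracer:
    """Configure head-based sampling and OTLP export for the API service."""
    provider = TracerProvider(
        resource=Resource.create({"service.name": "justkalm-api"}),  # assumed service name
        # ParentBased keeps child spans consistent with the root sampling decision;
        # 0.10 matches the 10% trace sampling figure quoted above.
        sampler=ParentBased(TraceIdRatioBased(sample_rate)),
    )
    # Endpoint is a placeholder for an in-cluster OpenTelemetry Collector.
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="otel-collector:4317", insecure=True))
    )
    trace.set_tracer_provider(provider)
    return trace.get_tracer("justkalm.api")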
Prometheus Configuration
Highly available Prometheus setup with federation across replicas, Kubernetes service discovery, and long-term storage via Thanos remote write.
# prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'prod-us-east-1'
env: 'production'
rule_files:
- /etc/prometheus/rules/*.yml
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Kubernetes pods with prometheus annotations
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
metric_relabel_configs:
# Drop high-cardinality metrics
- source_labels: [__name__]
regex: 'go_gc_.*'
action: drop
# API servers
- job_name: 'justkalm-api'
static_configs:
- targets:
- api-1:8000
- api-2:8000
- api-3:8000
metrics_path: /metrics
scheme: http
# PostgreSQL exporter
- job_name: 'postgres'
static_configs:
- targets:
- postgres-exporter:9187
# Redis exporter
- job_name: 'redis'
static_configs:
- targets:
- redis-exporter:9121
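  # Federation (illustrative sketch): the description above mentions federation,
  # but no federate job appears in this config. A typical shape is sketched here;
  # the replica hostnames and match selector are placeholders, not production values.
  - job_name: 'federate'
    honor_labels: true
    metrics_path: /federate
    params:
      'match[]':
        - '{job=~"justkalm-.*"}'
    static_configs:
      - targets:
        - prometheus-replica-1:9090
        - prometheus-replica-2:9090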
remote_write:
- url: http://thanos-receive:19291/api/v1/receive
queue_config:
max_samples_per_send: 10000
batch_send_deadline: 5s
capacity: 50000
Application Metrics
# metrics/instrumentation.py
from prometheus_client import Counter, Histogram, Gauge
from functools import wraps
import time
# Request metrics
REQUEST_COUNT = Counter(
'justkalm_http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status']
)
REQUEST_LATENCY = Histogram(
'justkalm_http_request_duration_seconds',
'HTTP request latency',
['method', 'endpoint'],
buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0]
)
ACTIVE_REQUESTS = Gauge(
'justkalm_http_requests_active',
'Active HTTP requests',
['method', 'endpoint']
)
# Business metrics
VALUATIONS_TOTAL = Counter(
'justkalm_valuations_total',
'Total product valuations',
['brand', 'category', 'source']
)
VALUATION_CONFIDENCE = Histogram(
'justkalm_valuation_confidence',
'Valuation confidence distribution',
['brand'],
buckets=[0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.99]
)
ML_INFERENCE_LATENCY = Histogram(
'justkalm_ml_inference_seconds',
'ML model inference time',
['model', 'version'],
buckets=[.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]
)
# Cache metrics
CACHE_HITS = Counter(
'justkalm_cache_hits_total',
'Cache hit count',
['cache_type', 'operation']
)
CACHE_MISSES = Counter(
'justkalm_cache_misses_total',
'Cache miss count',
['cache_type', 'operation']
)
def track_request(method: str, endpoint: str):
"""Decorator to track request metrics."""
def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).inc()
start = time.time()
try:
result = await func(*args, **kwargs)
REQUEST_COUNT.labels(
method=method,
endpoint=endpoint,
status=getattr(result, 'status_code', 200)
).inc()
return result
except Exception as e:
REQUEST_COUNT.labels(
method=method,
endpoint=endpoint,
status=500
).inc()
raise
finally:
REQUEST_LATENCY.labels(
method=method,
endpoint=endpoint
).observe(time.time() - start)
ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).dec()
return wrapper
return decorator
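track_request is defined above but never shown in use. Below is a brief sketch, under the assumption that the API is a FastAPI app, of how the decorator might be applied to a handler and how the /metrics endpoint scraped by the justkalm-api job could be exposed; the route, handler body, and label values are illustrative.
# metrics/usage_example.py (illustrative sketch)
from fastapi import FastAPI
from prometheus_client import make_asgi_app

from metrics.instrumentation import VALUATIONS_TOTAL, track_request

app = FastAPI()

# Expose Prometheus metrics at /metrics, matching the justkalm-api scrape job above.
app.mount("/metrics", make_asgi_app())

@app.get("/api/v1/valuations/{item_id}")
@track_request(method="GET", endpoint="/api/v1/valuations/{item_id}")
async def get_valuation(item_id: str):
    # Hypothetical handler body and label values; real lookup logic lives elsewhere.
    VALUATIONS_TOTAL.labels(brand="unknown", category="unknown", source="api").inc()
    return {"item_id": item_id, "status": "ok"}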
Grafana Dashboard Configuration
// dashboards/api-overview.json
{
"dashboard": {
"title": "JustKalm API Overview",
"tags": ["api", "production", "slo"],
"refresh": "10s",
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 },
"targets": [{
"expr": "sum(rate(justkalm_http_requests_total[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}]
},
{
"title": "P99 Latency",
"type": "timeseries",
"gridPos": { "x": 8, "y": 0, "w": 8, "h": 8 },
"targets": [{
"expr": "histogram_quantile(0.99, sum(rate(justkalm_http_request_duration_seconds_bucket[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}}"
}],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1.0 }
]
}
}
}
},
{
"title": "Error Rate",
"type": "stat",
"gridPos": { "x": 16, "y": 0, "w": 8, "h": 4 },
"targets": [{
"expr": "sum(rate(justkalm_http_requests_total{status=~'5..'}[5m])) / sum(rate(justkalm_http_requests_total[5m])) * 100"
}],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 1.0 }
]
}
}
}
},
{
"title": "SLO: Availability",
"type": "gauge",
"gridPos": { "x": 16, "y": 4, "w": 8, "h": 4 },
"targets": [{
"expr": "(1 - sum(increase(justkalm_http_requests_total{status=~'5..'}[30d])) / sum(increase(justkalm_http_requests_total[30d]))) * 100"
}],
"fieldConfig": {
"defaults": {
"min": 99,
"max": 100,
"thresholds": {
"steps": [
{ "color": "red", "value": 99 },
{ "color": "yellow", "value": 99.5 },
{ "color": "green", "value": 99.9 }
]
}
}
}
}
]
}
}
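The section doesn't say how dashboard JSON like the above is loaded into Grafana. One option, sketched here under the assumption that a Grafana service-account token is available, is pushing it through Grafana's /api/dashboards/db endpoint; the URL, token source, and file path are placeholders.
# dashboards/push_dashboard.py (illustrative sketch)
import json
import os

import requests

GRAFANA_URL = os.environ.get("GRAFANA_URL", "http://grafana:3000")  # placeholder
GRAFANA_TOKEN = os.environ["GRAFANA_TOKEN"]  # assumed service-account token

def push_dashboard(path: str) -> None:
    """Upload a dashboard JSON file via Grafana's dashboard HTTP API."""
    with open(path) as f:
        payload = json.load(f)  # expects the {"dashboard": {...}} wrapper used above
    payload["overwrite"] = True  # replace any existing dashboard with the same uid/title
    resp = requests.post(
        f"{GRAFANA_URL}/api/dashboards/db",
        json=payload,
        headers={"Authorization": f"Bearer {GRAFANA_TOKEN}"},
        timeout=10,
    )
    resp.raise_for_status()

if __name__ == "__main__":
    push_dashboard("dashboards/api-overview.json")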
Complete Observability Coverage
From metrics to traces to logs: full visibility into every request.
125K Metrics/sec
10% Trace Sampling
<1s Alert Latency