From 70c3ee0d48b70a9a216fd6061ba61008d674cf50 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Wed, 10 Jun 2026 15:30:27 +0200
Subject: [PATCH] docs: add usage guides backed by a real compiling samples
 module (#101, #102, #105)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fill the documentation gap for tensor usage/ops (#101), benchmark/perf/metrics
usage (#102), and layout/simple-usage/samples (#105) with real, CI-verified code
instead of hand-typed snippets.

New `skainet-docs-samples` gradle module whose Kotlin sources ARE the Antora
example resources (commonMain srcDir -> docs/modules/ROOT/examples/kotlin), so every
snippet is compiled and executed by `:skainet-docs-samples:jvmTest`. Antora pages
pull tagged regions via `include::example$kotlin/...[tag=...]`.

Demos (all using the public DSLs):
- TensorBasics.kt — data-DSL construction, init strategies, eager ops, broadcasting
- Quickstart.kt   — sequential { } model + forward
- TrainingDemo.kt — training { } loop + accuracy metric

Pages:
- rewrite explanation/examples/index.adoc (replace 6 TODO stubs with live includes)
- new tutorials/kotlin-getting-started.adoc
- new how-to/tensor-ops.adoc
- new how-to/metrics-and-perf-testing.adoc (links existing benchmark guides)
- nav.adoc xrefs

Closes #101
Closes #102
Closes #105

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../sk/ainet/docs/samples/Quickstart.kt       | 50 ++++++++++
 .../sk/ainet/docs/samples/TensorBasics.kt     | 88 +++++++++++++++++
 .../sk/ainet/docs/samples/TrainingDemo.kt     | 97 +++++++++++++++++++
 docs/modules/ROOT/nav.adoc                    |  3 +
 .../pages/explanation/examples/index.adoc     | 77 ++++++++++-----
 .../how-to/metrics-and-perf-testing.adoc      | 61 ++++++++++++
 .../modules/ROOT/pages/how-to/tensor-ops.adoc | 59 +++++++++++
 .../tutorials/kotlin-getting-started.adoc     | 59 +++++++++++
 settings.gradle.kts                           |  3 +
 skainet-docs-samples/build.gradle.kts         | 39 ++++++++
 .../sk/ainet/docs/samples/SamplesTest.kt      | 45 +++++++++
 11 files changed, 556 insertions(+), 25 deletions(-)
 create mode 100644 docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt
 create mode 100644 docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt
 create mode 100644 docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt
 create mode 100644 docs/modules/ROOT/pages/how-to/metrics-and-perf-testing.adoc
 create mode 100644 docs/modules/ROOT/pages/how-to/tensor-ops.adoc
 create mode 100644 docs/modules/ROOT/pages/tutorials/kotlin-getting-started.adoc
 create mode 100644 skainet-docs-samples/build.gradle.kts
 create mode 100644 skainet-docs-samples/src/commonTest/kotlin/sk/ainet/docs/samples/SamplesTest.kt

diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt
new file mode 100644
index 00000000..bdc8173d
--- /dev/null
+++ b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt
@@ -0,0 +1,50 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.lang.nn.dsl.sequential
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.relu
+import sk.ainet.lang.tensor.softmax
+import sk.ainet.lang.types.FP32
+// end::imports[]
+
+/**
+ * Flagship "hello world" for the SKaiNET model DSL: define a small MLP with the
+ * `sequential { }` DSL, then run a single forward pass on the CPU.
+ *
+ * Every region below is included verbatim into
+ * `tutorials/kotlin-getting-started.adoc` and the usage-examples page. The companion
+ * test (`SamplesTest`) runs this end-to-end so the snippet can never drift.
+ */
+object Quickstart {
+
+    /**
+     * Build a `784 -> 128 (ReLU) -> 10 (Softmax)` classifier purely in code.
+     * This is the SKaiNET "spirit": the network *is* a Kotlin DSL block.
+     */
+    // tag::model[]
+    fun buildModel(ctx: DirectCpuExecutionContext) =
+        sequential<FP32, Float>(ctx) {
+            input(784)                                  // 28x28 flattened
+            dense(128) { activation = { it.relu() } }   // hidden layer
+            dense(10) { activation = { it.softmax(1) } } // class scores
+        }
+    // end::model[]
+
+    /**
+     * Run one forward pass over a batch of one sample and return the 10 class scores.
+     */
+    // tag::infer[]
+    fun classify(pixels: FloatArray): Tensor<FP32, Float> {
+        val ctx = DirectCpuExecutionContext.create()
+        val model = buildModel(ctx)
+
+        // Shape is [batch, features]; one sample here.
+        val input = ctx.fromFloatArray<FP32, Float>(Shape(1, 784), FP32::class, pixels)
+
+        return model.forward(input, ctx)               // [1, 10] class scores
+    }
+    // end::infer[]
+}
diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt
new file mode 100644
index 00000000..8d7265ea
--- /dev/null
+++ b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt
@@ -0,0 +1,88 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.context.data
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.dsl.tensor
+import sk.ainet.lang.tensor.matmul
+import sk.ainet.lang.tensor.plus
+import sk.ainet.lang.tensor.relu
+import sk.ainet.lang.tensor.reshape
+import sk.ainet.lang.tensor.t
+import sk.ainet.lang.types.FP32
+// end::imports[]
+
+/**
+ * Tensor construction and the everyday eager operations, all real and CI-run.
+ * Regions here are included by `how-to/tensor-ops.adoc` and the usage-examples page.
+ */
+object TensorBasics {
+
+    /** Build a single 2x2 tensor with the `data { }` DSL — the idiomatic form. */
+    // tag::create-one[]
+    fun oneTensor(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> =
+        data<FP32, Float>(ctx) {
+            tensor {
+                shape(2, 2) { from(1f, 2f, 3f, 4f) }
+            }
+        }
+    // end::create-one[]
+
+    /** The initialization strategies available inside `shape(...) { ... }`. */
+    // tag::init[]
+    fun initStrategies(ctx: DirectCpuExecutionContext): List<Tensor<FP32, Float>> {
+        lateinit var zeros: Tensor<FP32, Float>
+        lateinit var ones: Tensor<FP32, Float>
+        lateinit var filled: Tensor<FP32, Float>
+        lateinit var gaussian: Tensor<FP32, Float>
+        lateinit var ramp: Tensor<FP32, Float>
+        data(ctx) {
+            zeros = tensor { shape(2, 3) { zeros() } }
+            ones = tensor { shape(2, 3) { ones() } }
+            filled = tensor { shape(2, 3) { full(0.5f) } }
+            gaussian = tensor { shape(2, 3) { randn(mean = 0f, std = 0.02f) } }
+            ramp = tensor { shape(2, 3) { init { idx -> (idx[0] + idx[1]).toFloat() } } }
+        }
+        return listOf(zeros, ones, filled, gaussian, ramp)
+    }
+    // end::init[]
+
+    /**
+     * Everyday eager ops: matrix multiply, transpose, reshape, ReLU. Each op is an
+     * extension on `Tensor` — no execution context to thread through, the tensor
+     * carries its own backend ops.
+     */
+    // tag::ops[]
+    fun ops(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
+        lateinit var a: Tensor<FP32, Float>
+        lateinit var b: Tensor<FP32, Float>
+        data(ctx) {
+            a = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
+            b = tensor { shape(3, 2) { from(1f, 0f, 0f, 1f, 1f, 1f) } }
+        }
+        val product = a.matmul(b)        // [2,3] x [3,2] -> [2,2]
+        val transposed = product.t()     // [2,2] -> [2,2]
+        val flat = transposed.reshape(Shape(4))
+        return flat.relu()
+    }
+    // end::ops[]
+
+    /**
+     * Broadcasting: a per-column bias of shape [1,3] is added across both rows of a
+     * [2,3] matrix. A scalar broadcasts to every element.
+     */
+    // tag::broadcast[]
+    fun broadcast(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
+        lateinit var matrix: Tensor<FP32, Float>
+        lateinit var bias: Tensor<FP32, Float>
+        data(ctx) {
+            matrix = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
+            bias = tensor { shape(1, 3) { from(10f, 20f, 30f) } }
+        }
+        val biased = matrix + bias       // [2,3] + [1,3] -> [2,3]
+        return biased + 100f             // scalar broadcasts to every element
+    }
+    // end::broadcast[]
+}
diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt
new file mode 100644
index 00000000..ee95e39f
--- /dev/null
+++ b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt
@@ -0,0 +1,97 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.context.Phase
+import sk.ainet.lang.graph.DefaultGradientTape
+import sk.ainet.lang.graph.DefaultGraphExecutionContext
+import sk.ainet.lang.nn.dsl.sequential
+import sk.ainet.lang.nn.dsl.training
+import sk.ainet.lang.nn.loss.MSELoss
+import sk.ainet.lang.nn.optim.sgd
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.tanh
+import sk.ainet.lang.types.FP32
+import kotlin.random.Random
+// end::imports[]
+
+/**
+ * End-to-end training with the SKaiNET `training { }` DSL: a tiny two-cluster
+ * classification task learned by an MLP `[2, 8, 1]` with `tanh` activations.
+ *
+ * Demonstrates the pieces issue #102 asks to document — running a model, a loss,
+ * an optimizer, and a metric (classification accuracy) — as real, CI-run code.
+ */
+object TrainingDemo {
+
+    /** firstLoss/lastLoss show learning; accuracy is measured on the same samples used for training. */
+    data class Result(val firstLoss: Float, val lastLoss: Float, val accuracy: Float)
+
+    // Two linearly separable clusters: label +1 near (1,1), label -1 near (-1,-1).
+    private val featuresFlat = floatArrayOf(
+        1.0f, 1.1f, 0.9f, 1.2f, 1.2f, 0.8f, 1.1f, 0.9f,
+        -1.0f, -1.1f, -0.9f, -1.2f, -1.2f, -0.8f, -1.1f, -0.9f,
+    )
+    private val labelsFlat = floatArrayOf(1f, 1f, 1f, 1f, -1f, -1f, -1f, -1f)
+
+    fun run(): Result {
+        val n = labelsFlat.size
+
+        // tag::setup[]
+        // A graph (autograd) context for training; a plain CPU context for inference.
+        val baseCtx = DirectCpuExecutionContext()
+        val trainCtx = DefaultGraphExecutionContext(
+            baseOps = baseCtx.ops,
+            phase = Phase.TRAIN,
+            createTapeFactory = { _ -> DefaultGradientTape() },
+        )
+
+        val rng = Random(42)
+        val model = sequential<FP32, Float>(trainCtx) {
+            input(2)
+            dense(8) { weights { randn(std = 0.5f, random = rng) } }
+            activation { it.tanh() }
+            dense(1) { weights { randn(std = 0.5f, random = rng) } }
+            activation { it.tanh() }
+        }
+
+        val x = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 2), FP32::class, featuresFlat)
+        val y = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 1), FP32::class, labelsFlat)
+        // end::setup[]
+
+        // tag::loop[]
+        val runner = training<FP32, Float> {
+            model { model }
+            loss { MSELoss() }
+            optimizer {
+                sgd(lr = 0.1).apply {
+                    model.trainableParameters().forEach { addParameter(it) }
+                }
+            }
+        }
+
+        var firstLoss = 0f
+        var lastLoss = 0f
+        repeat(150) { epoch ->
+            val loss = runner.step(trainCtx, x, y).data.get()
+            if (epoch == 0) firstLoss = loss
+            lastLoss = loss
+        }
+        // end::loop[]
+
+        // tag::accuracy[]
+        // Metric: classification accuracy on a fresh inference context.
+        val evalCtx = DirectCpuExecutionContext()
+        val preds = model.forward(x, evalCtx)
+        var correct = 0
+        for (i in 0 until n) {
+            val score = preds.data.get(i, 0)
+            val predicted = if (score >= 0f) 1f else -1f
+            if (predicted == labelsFlat[i]) correct++
+        }
+        val accuracy = correct.toFloat() / n
+        // end::accuracy[]
+
+        return Result(firstLoss, lastLoss, accuracy)
+    }
+}
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 4c0217e2..b53424c0 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -3,6 +3,7 @@
 .Using SKaiNET
 * xref:using/index.adoc[About this section]
 * Tutorials
+** xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started]
 ** xref:tutorials/java-getting-started.adoc[Java getting started]
 ** xref:tutorials/hlo-getting-started.adoc[StableHLO getting started]
 ** xref:tutorials/minerva-getting-started.adoc[Minerva getting started]
@@ -10,8 +11,10 @@
 ** xref:tutorials/turboquant-getting-started.adoc[TurboQuant: KV-cache compression]
 * How-to guides
 ** xref:how-to/build-tensors.adoc[Build tensors with the data DSL]
+** xref:how-to/tensor-ops.adoc[Apply tensor operations]
 ** xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)]
 ** xref:how-to/java-model-training.adoc[Train a model from Java]
+** xref:how-to/metrics-and-perf-testing.adoc[Metrics and performance testing]
 ** xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino]
 ** xref:how-to/minerva-export.adoc[Export secure MCU bundles with Minerva]
 * Reference
diff --git a/docs/modules/ROOT/pages/explanation/examples/index.adoc b/docs/modules/ROOT/pages/explanation/examples/index.adoc
index 36946d7d..6d90a761 100644
--- a/docs/modules/ROOT/pages/explanation/examples/index.adoc
+++ b/docs/modules/ROOT/pages/explanation/examples/index.adoc
@@ -1,6 +1,11 @@
 = Usage Examples
 
-This section contains practical examples and usage patterns for SKaiNET operators.
+This section collects practical, runnable examples of SKaiNET in use. Every snippet
+is real code from the `skainet-docs-samples` module — compiled and executed in CI, so
+nothing here can drift from the API.
+
+For task-focused guides see the how-to section; for the shortest end-to-end path see
+xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started].
 
 [#basic-examples]
 == Basic Operations
@@ -9,49 +14,71 @@ This section contains practical examples and usage patterns for SKaiNET operator
 
 include::matmul.adoc[leveloffset=+2]
 
-=== Tensor Creation and Manipulation
+=== Tensor Creation
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=create-one]
+----
+
+See xref:how-to/build-tensors.adoc[Build tensors with the data DSL] for every
+construction form and initialization strategy.
 
-// TODO: Add tensor creation examples
-// include::tensor-creation-examples.adoc[leveloffset=+2]
+=== Tensor Operations
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=ops]
+----
 
 === Broadcasting Operations
 
-// TODO: Add broadcasting examples
-// include::broadcasting-examples.adoc[leveloffset=+2]
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=broadcast]
+----
+
+See xref:how-to/tensor-ops.adoc[Apply tensor operations] for the full discussion.
 
 [#neural-network-examples]
 == Neural Network Examples
 
-=== Layer Implementations
-
-// TODO: Add layer implementation examples
-// include::layer-examples.adoc[leveloffset=+2]
+=== Defining a model
 
-=== Training Loops
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=model]
+----
 
-// TODO: Add training loop examples
-// include::training-examples.adoc[leveloffset=+2]
+=== Running inference
 
-=== Model Architectures
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=infer]
+----
 
-// TODO: Add model architecture examples
-// include::model-examples.adoc[leveloffset=+2]
+=== Training loops
 
-[#performance-examples]
-== Performance Optimization
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=setup]
 
-=== Memory Management
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=loop]
+----
 
-// TODO: Add memory management examples
-// include::memory-examples.adoc[leveloffset=+2]
+=== Measuring accuracy
 
-=== Backend-Specific Optimizations
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=accuracy]
+----
 
-// TODO: Add backend optimization examples
-// include::backend-optimization-examples.adoc[leveloffset=+2]
+See xref:how-to/metrics-and-perf-testing.adoc[Metrics and performance testing].
 
 [#cross-references]
 == Cross-References
 
+* xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started]
+* xref:how-to/build-tensors.adoc[Build tensors with the data DSL]
+* xref:how-to/tensor-ops.adoc[Apply tensor operations]
 * xref:explanation/theory/index.adoc[Mathematical Theory]
-// Operator reference lands in a later commit of the Antora migration.
\ No newline at end of file
diff --git a/docs/modules/ROOT/pages/how-to/metrics-and-perf-testing.adoc b/docs/modules/ROOT/pages/how-to/metrics-and-perf-testing.adoc
new file mode 100644
index 00000000..0af5b8ae
--- /dev/null
+++ b/docs/modules/ROOT/pages/how-to/metrics-and-perf-testing.adoc
@@ -0,0 +1,61 @@
+= Metrics and performance testing
+:description: How to measure a model's quality (accuracy and other metrics) and how to benchmark the engine's performance — two different questions with two different toolchains.
+
+There are two distinct "how good is it?" questions in SKaiNET:
+
+* *Model quality* — does the network make correct predictions? Measured with
+  **metrics** (accuracy, error) computed from a forward pass.
+* *Engine performance* — how fast does the math run? Measured with the
+  **benchmark suite**.
+
+This page covers both and points at the deeper references for each.
+
+== Measuring model quality
+
+A metric is computed from a forward pass, ideally over held-out data. The training example
+classifies two clusters and then measures **classification accuracy** — the fraction of samples
+whose predicted label matches the truth — on the same samples it trained on:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=accuracy]
+----
+
+The same shape applies to any metric: run `model.forward(x, ctx)` on an evaluation
+context, then reduce predictions against targets. The snippet is compiled and run in
+CI from `skainet-docs-samples` (`TrainingDemo.kt`); the full training loop that
+produces `model` is in xref:tutorials/kotlin-getting-started.adoc#train[Kotlin getting started].
+
+[TIP]
+====
+Always evaluate on a *fresh inference context* (`DirectCpuExecutionContext()`),
+separate from the autograd/training context, so metric computation does not record
+gradients.
+====
+
+== Benchmarking engine performance
+
+Engine performance is a separate concern with its own reproducible harness. Rather
+than ad-hoc timing in user code, SKaiNET ships an official benchmark suite:
+
+* xref:contributing/benchmarks.adoc[Engine benchmark program] — what the suite
+  measures, headline vs. secondary metrics, lanes, the result-record schema, and how
+  to reproduce a public run locally.
+* xref:contributing/matmul-kernels.adoc[Reading the matmul benchmark] — interpreting
+  the numbers for the kernel that dominates inference cost.
+* xref:contributing/register-bench-runner.adoc[Register a self-hosted bench runner] —
+  running the suite on your own hardware.
+
+=== Performance-testing practices
+
+* Pin methodology — fixed warmup/iteration counts and a stable machine; see the
+  *Methodology pinning* section of xref:contributing/benchmarks.adoc[the benchmark guide].
+* Compare against a baseline run rather than absolute numbers; hardware varies.
+* For backend-level context on where the time goes, see
+  xref:explanation/perf/jvm-cpu.adoc[JVM CPU performance] and
+  xref:explanation/perf/simd-kernels.adoc[How SIMD kernels are built].
+
+== Related
+
+* xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started] — defines and trains the model measured here.
+* xref:reference/kernel-support-matrix.adoc[Kernel × platform support] — which kernels back each op per platform.
diff --git a/docs/modules/ROOT/pages/how-to/tensor-ops.adoc b/docs/modules/ROOT/pages/how-to/tensor-ops.adoc
new file mode 100644
index 00000000..7fc8e32b
--- /dev/null
+++ b/docs/modules/ROOT/pages/how-to/tensor-ops.adoc
@@ -0,0 +1,59 @@
+= Apply tensor operations
+:description: The everyday eager tensor operations in SKaiNET — matmul, transpose, reshape, activations, and broadcasting — as extension functions you call directly on a `Tensor`.
+
+This page covers *using* tensors once you have them. To build them in the first
+place, see xref:how-to/build-tensors.adoc[Build tensors with the data DSL]; for the
+exhaustive op list, see xref:reference/operators/generated/index.adoc[Operator reference].
+
+Every snippet is compiled and executed in CI from `skainet-docs-samples`
+(`TensorBasics.kt`).
+
+== Operations are extensions on `Tensor`
+
+SKaiNET's eager ops are extension functions on `Tensor<T, V>`. There is no execution
+context to thread through each call — the tensor carries its own backend `ops`, so an
+op reads exactly like the math:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=ops]
+----
+
+`matmul` does the `[2,3] x [3,2] -> [2,2]` product, `t()` transposes, `reshape`
+changes the view to `[4]`, and `relu()` is an elementwise activation. Binary
+operators (`+`, `-`, `*`, `/`) are overloaded for both tensor-tensor and
+tensor-scalar forms.
+
+== Broadcasting
+
+Elementwise binary ops broadcast operands with compatible shapes, following the usual
+right-aligned rules. A per-column bias of shape `[1, 3]` adds across every row of a
+`[2, 3]` matrix; a scalar broadcasts to every element:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=broadcast]
+----
+
+== Constructing the inputs
+
+For completeness, the tensors above are built with the data DSL — one tensor, or
+several captured into `lateinit var`s:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=create-one]
+----
+
+The initialization strategies available inside `shape(...) { ... }`:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=init]
+----
+
+== Related
+
+* xref:how-to/build-tensors.adoc[Build tensors with the data DSL]
+* xref:reference/operators/generated/index.adoc[Operator reference] — the full, KSP-generated op surface.
+* xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started] — use these ops inside a model.
diff --git a/docs/modules/ROOT/pages/tutorials/kotlin-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/kotlin-getting-started.adoc
new file mode 100644
index 00000000..51d24099
--- /dev/null
+++ b/docs/modules/ROOT/pages/tutorials/kotlin-getting-started.adoc
@@ -0,0 +1,59 @@
+= Kotlin getting started
+:description: Define a neural network with the SKaiNET `sequential { }` DSL and run a forward pass — the shortest path from zero to a prediction in Kotlin.
+
+SKaiNET is, at heart, a set of *DSLs for tensors, networks, and models*. This
+tutorial takes the shortest path through them: build a small classifier in code
+and run one forward pass. Every snippet below is compiled and executed in CI — see
+the `skainet-docs-samples` module (`Quickstart.kt`, `TrainingDemo.kt`).
+
+== Define a model
+
+A network *is* a Kotlin block. The `sequential { }` DSL stacks layers in order;
+`input(n)` declares the feature count, `dense(n)` adds a fully-connected layer, and
+each layer's `activation` is just a lambda over a `Tensor`:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=model]
+----
+
+This is a `784 -> 128 (ReLU) -> 10 (Softmax)` MNIST-shaped classifier. No builder
+objects, no config files — the architecture reads top-to-bottom as code.
+
+== Run a forward pass
+
+Create a CPU execution context, build the model against it, wrap your input as a
+`Tensor`, and call `forward`:
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=infer]
+----
+
+The result is a `[1, 10]` tensor of class scores. For an untrained model the scores
+are meaningless — see xref:tutorials/kotlin-getting-started.adoc#train[training] below
+to make the model learn.
+
+[#train]
+== Train it
+
+Training is the same spirit: a `training { }` block wires a model, a loss, and an
+optimizer, and `step` runs one forward/backward/update. This example learns to
+separate two clusters and then measures accuracy — see
+xref:how-to/metrics-and-perf-testing.adoc[Metrics and performance testing] for the
+full walkthrough, and the runnable
+`TrainingDemo.kt` in `skainet-docs-samples`.
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=setup]
+
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=loop]
+----
+
+== Next steps
+
+* xref:how-to/build-tensors.adoc[Build tensors with the data DSL] — every way to construct a `Tensor`.
+* xref:how-to/tensor-ops.adoc[Apply tensor operations] — matmul, reshape, broadcasting, activations.
+* xref:tutorials/graph-dsl.adoc[Graph DSL] — branching/multi-output models with `dag { }`.
+* xref:reference/operators/generated/index.adoc[Operator reference] — the full op surface.
diff --git a/settings.gradle.kts b/settings.gradle.kts
index 5c393aba..bbfbc825 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -80,3 +80,6 @@ include("skainet-apps:skainet-grayscale-cli")
 include("skainet-apps:skainet-tensor-tools")
 include("skainet-io:skainet-io-safetensors")
 include("skainet-io:skainet-io-iree-params")
+
+// ====== DOCS
+include("skainet-docs-samples")
diff --git a/skainet-docs-samples/build.gradle.kts b/skainet-docs-samples/build.gradle.kts
new file mode 100644
index 00000000..66fd4538
--- /dev/null
+++ b/skainet-docs-samples/build.gradle.kts
@@ -0,0 +1,39 @@
+import org.jetbrains.kotlin.gradle.dsl.JvmTarget
+
+/**
+ * Documentation samples module.
+ *
+ * The Kotlin sources under this module ARE the Antora example resources: the
+ * `commonMain` source directory points at `docs/modules/ROOT/examples/kotlin`, so
+ * every snippet rendered in the docs is real code that this module compiles and
+ * `commonTest` executes in CI. AsciiDoc pages pull tagged regions out of these files
+ * with `include::example$kotlin/...[tag=...]` — there are no hand-typed snippets to rot.
+ *
+ * Not published: this module exists only to keep the documentation honest.
+ */
+plugins {
+    alias(libs.plugins.kotlinMultiplatform)
+}
+
+kotlin {
+    jvm {
+        compilerOptions {
+            jvmTarget.set(JvmTarget.JVM_21)
+        }
+    }
+
+    sourceSets {
+        commonMain {
+            // The example sources live in the docs tree so Antora can include them.
+            kotlin.srcDir("../docs/modules/ROOT/examples/kotlin")
+            dependencies {
+                implementation(project(":skainet-lang:skainet-lang-core"))
+                implementation(project(":skainet-backends:skainet-backend-cpu"))
+                implementation(project(":skainet-compile:skainet-compile-dag"))
+            }
+        }
+        commonTest.dependencies {
+            implementation(libs.kotlin.test)
+        }
+    }
+}
diff --git a/skainet-docs-samples/src/commonTest/kotlin/sk/ainet/docs/samples/SamplesTest.kt b/skainet-docs-samples/src/commonTest/kotlin/sk/ainet/docs/samples/SamplesTest.kt
new file mode 100644
index 00000000..f43259b4
--- /dev/null
+++ b/skainet-docs-samples/src/commonTest/kotlin/sk/ainet/docs/samples/SamplesTest.kt
@@ -0,0 +1,45 @@
+package sk.ainet.docs.samples
+
+import sk.ainet.context.DirectCpuExecutionContext
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Executes every documentation sample so the snippets included into the Antora
+ * pages are guaranteed to compile and run.
+ */
+class SamplesTest {
+
+    @Test
+    fun tensorBasics_constructs_and_computes() {
+        val ctx = DirectCpuExecutionContext.create()
+
+        val one = TensorBasics.oneTensor(ctx)
+        assertEquals(listOf(2, 2), one.shape.dimensions.toList())
+
+        assertEquals(5, TensorBasics.initStrategies(ctx).size)
+
+        val ops = TensorBasics.ops(ctx)
+        assertEquals(listOf(4), ops.shape.dimensions.toList())
+
+        val broadcast = TensorBasics.broadcast(ctx)
+        assertEquals(listOf(2, 3), broadcast.shape.dimensions.toList())
+        // first element: 1 + 10 + 100
+        assertEquals(111f, broadcast.data.get(0, 0), 1e-4f)
+    }
+
+    @Test
+    fun quickstart_forward_produces_class_scores() {
+        val pixels = FloatArray(784) { 0f }
+        val scores = Quickstart.classify(pixels)
+        assertEquals(listOf(1, 10), scores.shape.dimensions.toList())
+    }
+
+    @Test
+    fun training_demo_learns_and_classifies() {
+        val r = TrainingDemo.run()
+        assertTrue(r.lastLoss < r.firstLoss, "loss should decrease: ${r.firstLoss} -> ${r.lastLoss}")
+        assertTrue(r.accuracy >= 0.75f, "accuracy should be high on separable data, got ${r.accuracy}")
+    }
+}