SKaiNET-developers · michalharakal · Jun 10, 2026 · Jun 10, 2026
diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/Quickstart.kt
@@ -0,0 +1,50 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.lang.nn.dsl.sequential
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.relu
+import sk.ainet.lang.tensor.softmax
+import sk.ainet.lang.types.FP32
+// end::imports[]
+
+/**
+ * Flagship "hello world" for the SKaiNET model DSL: define a small MLP with the `sequential { }`
+ * DSL, then run one forward pass on the CPU. No training happens here — output shows the API only.
+ *
+ * Every tagged region below is included verbatim into `tutorials/kotlin-getting-started.adoc`
+ * and the usage-examples page. The companion test (`QuickstartTest`) runs this end-to-end
+ * to guard the snippets against drift.
+ */
+object Quickstart {
+
+    /**
+     * Build a `784 -> 128 (ReLU) -> 10 (Softmax)` classifier purely in code.
+     * This is the SKaiNET "spirit": the network *is* a Kotlin DSL block.
+     */
+    // tag::model[]
+    fun buildModel(ctx: DirectCpuExecutionContext) =
+        sequential<FP32, Float>(ctx) {
+            input(784)                                  // 28x28 flattened
+            dense(128) { activation = { it.relu() } }   // hidden layer
+            dense(10) { activation = { it.softmax(1) } } // class scores
+        }
+    // end::model[]
+
+    /**
+     * Run one forward pass over one sample (`pixels`: 784 values) and return the `[1, 10]` softmax class scores.
+     */
+    // tag::infer[]
+    fun classify(pixels: FloatArray): Tensor<FP32, Float> {
+        val ctx = DirectCpuExecutionContext.create()
+        val model = buildModel(ctx)
+
+        // Shape is [batch, features]; one sample here.
+        val input = ctx.fromFloatArray<FP32, Float>(Shape(1, 784), FP32::class, pixels)
+
+        return model.forward(input, ctx)               // [1, 10] class scores
+    }
+    // end::infer[]
+}
diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TensorBasics.kt
@@ -0,0 +1,88 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.context.data
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.dsl.tensor
+import sk.ainet.lang.tensor.matmul
+import sk.ainet.lang.tensor.plus
+import sk.ainet.lang.tensor.relu
+import sk.ainet.lang.tensor.reshape
+import sk.ainet.lang.tensor.t
+import sk.ainet.lang.types.FP32
+// end::imports[]
+
+/**
+ * Tensor construction and the everyday eager operations, all real and CI-run.
+ * Regions here are included by `how-to/tensor-ops.adoc` and the usage-examples page.
+ */
+object TensorBasics {
+
+    /** Build a single 2x2 tensor with the `data { }` DSL — the idiomatic form. */
+    // tag::create-one[]
+    fun oneTensor(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> =
+        data<FP32, Float>(ctx) {
+            tensor {
+                shape(2, 2) { from(1f, 2f, 3f, 4f) }
+            }
+        }
+    // end::create-one[]
+
+    /** Initialization strategies inside `shape(...) { ... }`: `zeros`, `ones`, `full`, `randn`, and index-based `init`. */
+    // tag::init[]
+    fun initStrategies(ctx: DirectCpuExecutionContext): List<Tensor<FP32, Float>> {
+        lateinit var zeros: Tensor<FP32, Float>
+        lateinit var ones: Tensor<FP32, Float>
+        lateinit var filled: Tensor<FP32, Float>
+        lateinit var gaussian: Tensor<FP32, Float>
+        lateinit var ramp: Tensor<FP32, Float>
+        data(ctx) {
+            zeros = tensor { shape(2, 3) { zeros() } }
+            ones = tensor { shape(2, 3) { ones() } }
+            filled = tensor { shape(2, 3) { full(0.5f) } }
+            gaussian = tensor { shape(2, 3) { randn(mean = 0f, std = 0.02f) } }
+            ramp = tensor { shape(2, 3) { init { idx -> (idx[0] + idx[1]).toFloat() } } }
+        }
+        return listOf(zeros, ones, filled, gaussian, ramp)
+    }
+    // end::init[]
+
+    /**
+     * Everyday eager ops: matrix multiply, transpose, reshape, ReLU. Each op is an
+     * extension on `Tensor` — no execution context to thread through, the tensor
+     * carries its own backend ops. Returns the result flattened to shape [4].
+     */
+    // tag::ops[]
+    fun ops(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
+        lateinit var a: Tensor<FP32, Float>
+        lateinit var b: Tensor<FP32, Float>
+        data(ctx) {
+            a = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
+            b = tensor { shape(3, 2) { from(1f, 0f, 0f, 1f, 1f, 1f) } }
+        }
+        val product = a.matmul(b)        // [2,3] x [3,2] -> [2,2]
+        val transposed = product.t()     // [2,2] -> [2,2]
+        val flat = transposed.reshape(Shape(4))  // [2,2] -> [4]
+        return flat.relu()
+    }
+    // end::ops[]
+
+    /**
+     * Broadcasting: a per-column bias of shape [1,3] is added across both rows of a
+     * [2,3] matrix, then a scalar is added to every element of the [2,3] result.
+     */
+    // tag::broadcast[]
+    fun broadcast(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
+        lateinit var matrix: Tensor<FP32, Float>
+        lateinit var bias: Tensor<FP32, Float>
+        data(ctx) {
+            matrix = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
+            bias = tensor { shape(1, 3) { from(10f, 20f, 30f) } }
+        }
+        val biased = matrix + bias       // [2,3] + [1,3] -> [2,3]
+        return biased + 100f             // scalar broadcasts to every element
+    }
+    // end::broadcast[]
+}
diff --git a/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt b/docs/modules/ROOT/examples/kotlin/sk/ainet/docs/samples/TrainingDemo.kt
@@ -0,0 +1,97 @@
+package sk.ainet.docs.samples
+
+// tag::imports[]
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.context.Phase
+import sk.ainet.lang.graph.DefaultGradientTape
+import sk.ainet.lang.graph.DefaultGraphExecutionContext
+import sk.ainet.lang.nn.dsl.sequential
+import sk.ainet.lang.nn.dsl.training
+import sk.ainet.lang.nn.loss.MSELoss
+import sk.ainet.lang.nn.optim.sgd
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.tanh
+import sk.ainet.lang.types.FP32
+import kotlin.random.Random
+// end::imports[]
+
+/**
+ * End-to-end training with the SKaiNET `training { }` DSL: a tiny two-cluster
+ * classification task learned by an MLP `[2, 8, 1]` with `tanh` activations.
+ *
+ * Demonstrates the pieces issue #102 asks to document — running a model, a loss,
+ * an optimizer, and a metric (classification accuracy) — as real, CI-run code.
+ */
+object TrainingDemo {
+
+    /** Loss at the first and last step, plus accuracy on the training samples (there is no held-out split). */
+    data class Result(val firstLoss: Float, val lastLoss: Float, val accuracy: Float)
+
+    // Two linearly separable clusters: label +1 near (1,1), label -1 near (-1,-1).
+    private val featuresFlat = floatArrayOf(
+        1.0f, 1.1f, 0.9f, 1.2f, 1.2f, 0.8f, 1.1f, 0.9f,
+        -1.0f, -1.1f, -0.9f, -1.2f, -1.2f, -0.8f, -1.1f, -0.9f,
+    )
+    private val labelsFlat = floatArrayOf(1f, 1f, 1f, 1f, -1f, -1f, -1f, -1f)
+
+    fun run(): Result {
+        val n = labelsFlat.size
+
+        // tag::setup[]
+        // A graph (autograd) context for training, layered over a plain CPU context's ops.
+        val baseCtx = DirectCpuExecutionContext()
+        val trainCtx = DefaultGraphExecutionContext(
+            baseOps = baseCtx.ops,
+            phase = Phase.TRAIN,
+            createTapeFactory = { _ -> DefaultGradientTape() },
+        )
+
+        val rng = Random(42)
+        val model = sequential<FP32, Float>(trainCtx) {
+            input(2)
+            dense(8) { weights { randn(std = 0.5f, random = rng) } }
+            activation { it.tanh() }
+            dense(1) { weights { randn(std = 0.5f, random = rng) } }
+            activation { it.tanh() }
+        }
+
+        val x = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 2), FP32::class, featuresFlat)
+        val y = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 1), FP32::class, labelsFlat)
+        // end::setup[]
+
+        // tag::loop[]
+        val runner = training<FP32, Float> {
+            model { model }
+            loss { MSELoss() }
+            optimizer {
+                sgd(lr = 0.1).apply {
+                    model.trainableParameters().forEach { addParameter(it) }
+                }
+            }
+        }
+
+        var firstLoss = 0f
+        var lastLoss = 0f
+        repeat(150) { epoch ->
+            val loss = runner.step(trainCtx, x, y).data.get()
+            if (epoch == 0) firstLoss = loss
+            lastLoss = loss
+        }
+        // end::loop[]
+
+        // tag::accuracy[]
+        // Metric: classification accuracy over the training inputs, on a fresh inference context.
+        val evalCtx = DirectCpuExecutionContext()
+        val preds = model.forward(x, evalCtx)
+        var correct = 0
+        for (i in 0 until n) {
+            val score = preds.data.get(i, 0)
+            val predicted = if (score >= 0f) 1f else -1f
+            if (predicted == labelsFlat[i]) correct++
+        }
+        val accuracy = correct.toFloat() / n
+        // end::accuracy[]
+
+        return Result(firstLoss, lastLoss, accuracy)
+    }
+}
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
@@ -3,15 +3,18 @@
 .Using SKaiNET
 * xref:using/index.adoc[About this section]
 * Tutorials
+** xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started]
 ** xref:tutorials/java-getting-started.adoc[Java getting started]
 ** xref:tutorials/hlo-getting-started.adoc[StableHLO getting started]
 ** xref:tutorials/minerva-getting-started.adoc[Minerva getting started]
 ** xref:tutorials/graph-dsl.adoc[Graph DSL]
 ** xref:tutorials/turboquant-getting-started.adoc[TurboQuant: KV-cache compression]
 * How-to guides
 ** xref:how-to/build-tensors.adoc[Build tensors with the data DSL]
+** xref:how-to/tensor-ops.adoc[Apply tensor operations]
 ** xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)]
 ** xref:how-to/java-model-training.adoc[Train a model from Java]
+** xref:how-to/metrics-and-perf-testing.adoc[Metrics and performance testing]
 ** xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino]
 ** xref:how-to/minerva-export.adoc[Export secure MCU bundles with Minerva]
 * Reference

diff --git a/docs/modules/ROOT/pages/explanation/examples/index.adoc b/docs/modules/ROOT/pages/explanation/examples/index.adoc
@@ -1,6 +1,11 @@
 = Usage Examples
 
-This section contains practical examples and usage patterns for SKaiNET operators.
+This section collects practical, runnable examples of SKaiNET in use. Every snippet
+is real code from the `skainet-docs-samples` module — compiled and executed in CI, so
+nothing here can drift from the API.
+
+For task-focused guides see the how-to section; for the shortest end-to-end path see
+xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started].
 
 [#basic-examples]
 == Basic Operations
@@ -9,49 +14,71 @@ This section contains practical examples and usage patterns for SKaiNET operator
 
 include::matmul.adoc[leveloffset=+2]
 
-=== Tensor Creation and Manipulation
+=== Tensor Creation
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=create-one]
+----
+
+See xref:how-to/build-tensors.adoc[Build tensors with the data DSL] for every
+construction form and initialization strategy.
 
-// TODO: Add tensor creation examples
-// include::tensor-creation-examples.adoc[leveloffset=+2]
+=== Tensor Operations
+
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=ops]
+----
 
 === Broadcasting Operations
 
-// TODO: Add broadcasting examples
-// include::broadcasting-examples.adoc[leveloffset=+2]
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TensorBasics.kt[tag=broadcast]
+----
+
+See xref:how-to/tensor-ops.adoc[Apply tensor operations] for the full discussion.
 
 [#neural-network-examples]
 == Neural Network Examples
 
-=== Layer Implementations
-
-// TODO: Add layer implementation examples
-// include::layer-examples.adoc[leveloffset=+2]
+=== Defining a model
 
-=== Training Loops
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=model]
+----
 
-// TODO: Add training loop examples
-// include::training-examples.adoc[leveloffset=+2]
+=== Running inference
 
-=== Model Architectures
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/Quickstart.kt[tag=infer]
+----
 
-// TODO: Add model architecture examples
-// include::model-examples.adoc[leveloffset=+2]
+=== Training loops
 
-[#performance-examples]
-== Performance Optimization
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=setup]
 
-=== Memory Management
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=loop]
+----
 
-// TODO: Add memory management examples
-// include::memory-examples.adoc[leveloffset=+2]
+=== Measuring accuracy
 
-=== Backend-Specific Optimizations
+[source,kotlin]
+----
+include::example$kotlin/sk/ainet/docs/samples/TrainingDemo.kt[tag=accuracy]
+----
 
-// TODO: Add backend optimization examples
-// include::backend-optimization-examples.adoc[leveloffset=+2]
+See xref:how-to/metrics-and-perf-testing.adoc[Metrics and performance testing].
 
 [#cross-references]
 == Cross-References
 
+* xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started]
+* xref:how-to/build-tensors.adoc[Build tensors with the data DSL]
+* xref:how-to/tensor-ops.adoc[Apply tensor operations]
 * xref:explanation/theory/index.adoc[Mathematical Theory]
-// Operator reference lands in a later commit of the Antora migration.