datastax · jshook · Oct 9, 2025 · Oct 10, 2025 · Oct 10, 2025
@@ -94,6 +94,21 @@
                     </execution>
                 </executions>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <configuration>
+                    <additionalJOptions>
+                        <additionalJOption>--add-modules=jdk.incubator.vector</additionalJOption>
+                    </additionalJOptions>
+                    <release>22</release>
+                    <detectOfflineLinks>false</detectOfflineLinks>
+                    <includeDependencySources>true</includeDependencySources>
+                    <dependencySourceIncludes>
+                        <dependencySourceInclude>io.github.jbellis:*</dependencySourceInclude>
+                    </dependencySourceIncludes>
+                </configuration>
+            </plugin>
         </plugins>
     </build>
 </project>
@@ -37,7 +37,23 @@
 
 import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
 
-
+/**
+ * JMH benchmark for measuring graph index construction performance using randomly generated vectors.
+ * This benchmark evaluates the time required to build a graph index with configurable parameters
+ * including vector dimensionality, dataset size, and optional Product Quantization (PQ) compression.
+ *
+ * <p>The benchmark tests various configurations to assess how different factors affect index
+ * construction time, including the impact of using PQ compression during the build process.</p>
+ *
+ * <p>Key parameters:</p>
+ * <ul>
+ *   <li>Vector dimensionality: 768 or 1536 dimensions</li>
+ *   <li>Dataset size: 100,000 vectors</li>
+ *   <li>PQ subspaces: 0 (no compression) or 16 subspaces</li>
+ *   <li>Graph degree (M): 32 neighbors per node</li>
+ *   <li>Beam width: 100 for construction search</li>
+ * </ul>
+ */
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 @State(Scope.Thread)
@@ -48,17 +64,45 @@
 public class IndexConstructionWithRandomSetBenchmark {
     private static final Logger log = LoggerFactory.getLogger(IndexConstructionWithRandomSetBenchmark.class);
     private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport();
+
+    /** The vector values to be indexed, initialized during setup. */
     private RandomAccessVectorValues ravv;
+
+    /** The score provider used during graph construction, either exact or PQ-based. */
     private BuildScoreProvider buildScoreProvider;
-    private int M = 32; // graph degree
+
+    /** The maximum degree of the graph (number of neighbors per node). */
+    private int M = 32;
+
+    /** The beam width used during graph construction searches. */
     private int beamWidth = 100;
+
+    /** The dimensionality of vectors being indexed. */
     @Param({"768", "1536"})
     private int originalDimension;
+
+    /** The number of vectors in the dataset to be indexed. */
     @Param({/*"10000",*/ "100000"/*, "1000000"*/})
     int numBaseVectors;
+
+    /** The number of PQ subspaces to use, or 0 for no compression. */
     @Param({"0", "16"})
     private int numberOfPQSubspaces;
 
+    /**
+     * Constructs a new benchmark instance. JMH will instantiate this class
+     * and populate the {@code @Param} fields before calling setup methods.
+     */
+    public IndexConstructionWithRandomSetBenchmark() {
+        // Intentionally empty: instantiation and lifecycle are managed by JMH.
+    }
+
+    /**
+     * Initializes the benchmark state by generating random vectors and configuring
+     * the appropriate score provider based on whether PQ compression is enabled.
+     *
+     * @throws IOException if an error occurs during setup
+     */
     @Setup(Level.Trial)
     public void setup() throws IOException {
 
@@ -86,11 +130,25 @@ public void setup() throws IOException {
 
     }
 
+    /**
+     * Per-invocation teardown hook, run by JMH after every call to the benchmark method.
+     * The body is intentionally empty; nothing is released here at present.
+     *
+     * @throws IOException never thrown by the current empty body; kept in the signature
+     */
     @TearDown(Level.Invocation)
     public void tearDown() throws IOException {
 
     }
 
+    /**
+     * The main benchmark method that measures the time to build a graph index.
+     * Constructs a complete graph index from the configured vectors using the
+     * specified parameters and score provider.
+     *
+     * @param blackhole JMH blackhole to prevent dead code elimination
+     * @throws IOException if an error occurs during index construction
+     */
     @Benchmark
     public void buildIndexBenchmark(Blackhole blackhole) throws IOException {
         // score provider using the raw, in-memory vectors
@@ -100,6 +158,13 @@ public void buildIndexBenchmark(Blackhole blackhole) throws IOException {
         }
     }
 
+    /**
+     * Creates a random vector with the specified dimensionality.
+     * Each component is randomly generated using {@link Math#random()}.
+     *
+     * @param dimension the number of dimensions in the vector
+     * @return a newly created random vector
+     */
     private VectorFloat<?> createRandomVector(int dimension) {
         VectorFloat<?> vector = VECTOR_TYPE_SUPPORT.createFloatVector(dimension);
         for (int i = 0; i < dimension; i++) {

@@ -31,6 +31,23 @@
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 
+/**
+ * JMH benchmark for measuring graph index construction performance using the SIFT dataset.
+ * This benchmark evaluates index construction time with a fixed, real-world dataset,
+ * testing various combinations of graph degree (M) and beam width parameters.
+ *
+ * <p>Unlike {@link IndexConstructionWithRandomSetBenchmark}, this benchmark uses the
+ * actual SIFT dataset loaded from disk, providing more realistic performance measurements
+ * that account for real data characteristics.</p>
+ *
+ * <p>Key parameters:</p>
+ * <ul>
+ *   <li>Graph degree (M): 16, 32, or 64 neighbors per node</li>
+ *   <li>Beam width: 10 or 100 for construction search</li>
+ *   <li>Dataset: SIFT small dataset (10,000 vectors, 128 dimensions)</li>
+ *   <li>Similarity function: Euclidean distance</li>
+ * </ul>
+ */
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 @State(Scope.Thread)
@@ -40,17 +57,47 @@
 @Threads(1)
 public class IndexConstructionWithStaticSetBenchmark {
     private static final Logger log = LoggerFactory.getLogger(IndexConstructionWithStaticSetBenchmark.class);
+
+    /** The vector values to be indexed, loaded from the SIFT dataset. */
     private RandomAccessVectorValues ravv;
+
+    /** The base vectors from the SIFT dataset. */
     private List<VectorFloat<?>> baseVectors;
+
+    /** The query vectors from the SIFT dataset (loaded but not used in this benchmark). */
     private List<VectorFloat<?>> queryVectors;
+
+    /** The ground truth nearest neighbors (loaded but not used in this benchmark). */
     private List<List<Integer>> groundTruth;
+
+    /** The score provider used during graph construction. */
     private BuildScoreProvider bsp;
+
+    /** The maximum degree of the graph (number of neighbors per node). */
     @Param({"16", "32", "64"})
-    private int M; // graph degree
+    private int M;
+
+    /** The beam width used during graph construction searches. */
     @Param({"10", "100"})
     private int beamWidth;
+
+    /** The dimensionality of vectors in the dataset. */
     int originalDimension;
 
+    /**
+     * Constructs a new benchmark instance. JMH will instantiate this class
+     * and populate the {@code @Param} fields before calling setup methods.
+     */
+    public IndexConstructionWithStaticSetBenchmark() {
+        // Intentionally empty: instantiation and lifecycle are managed by JMH.
+    }
+
+    /**
+     * Initializes the benchmark state by loading the SIFT dataset from disk
+     * and configuring the score provider.
+     *
+     * @throws IOException if an error occurs loading the dataset files
+     */
     @Setup
     public void setup() throws IOException {
         var siftPath = "siftsmall";
@@ -67,13 +114,25 @@ public void setup() throws IOException {
         bsp = BuildScoreProvider.randomAccessScoreProvider(ravv, VectorSimilarityFunction.EUCLIDEAN);
     }
 
+    /**
+     * Releases the loaded SIFT data at teardown by clearing the base vector, query vector, and ground truth lists.
+     *
+     * @throws IOException never thrown by the current body, which only calls {@code clear()} on the three lists
+     */
     @TearDown
     public void tearDown() throws IOException {
         baseVectors.clear();
         queryVectors.clear();
         groundTruth.clear();
     }
 
+    /**
+     * The main benchmark method that measures the time to build a graph index
+     * from the loaded SIFT dataset using the configured parameters.
+     *
+     * @param blackhole JMH blackhole to prevent dead code elimination
+     * @throws IOException if an error occurs during index construction
+     */
     @Benchmark
     public void buildIndexBenchmark(Blackhole blackhole) throws IOException {
         // score provider using the raw, in-memory vectors

@@ -50,28 +50,76 @@
 @Threads(1)
 public class PQDistanceCalculationBenchmark {
     private static final Logger log = LoggerFactory.getLogger(PQDistanceCalculationBenchmark.class);
     private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport();
     private final VectorSimilarityFunction vsf = VectorSimilarityFunction.EUCLIDEAN;
+
+    /**
+     * Creates a new benchmark instance.
+     * <p>
+     * This constructor is invoked by JMH and should not be called directly.
+     */
+    public PQDistanceCalculationBenchmark() {
+    }
 
+    /** The base vectors used for distance calculations. */
     private List<VectorFloat<?>> vectors;
+
+    /** Product-quantized versions of the base vectors, or {@code null} if {@code M} is 0. */
     private PQVectors pqVectors;
+
+    /** Query vectors used to test distance calculations. */
     private List<VectorFloat<?>> queryVectors;
+
+    /** The Product Quantization model, or {@code null} if {@code M} is 0. */
     private ProductQuantization pq;
+
+    /** Score provider configured for either full precision or PQ-based scoring. */
     private BuildScoreProvider buildScoreProvider;
-
+
+    /**
+     * The dimensionality of the vectors.
+     * <p>
+     * Default value: 1536 (typical for modern embedding models).
+     */
     @Param({"1536"})
     private int dimension;
-
+
+    /**
+     * The number of base vectors to create for the dataset.
+     * <p>
+     * Default value: 10000
+     */
     @Param({"10000"})
     private int vectorCount;
-
+
+    /**
+     * The number of query vectors to test against the dataset.
+     * <p>
+     * Default value: 100
+     */
     @Param({"100"})
     private int queryCount;
-
+
+    /**
+     * The number of subspaces for Product Quantization.
+     * <p>
+     * When {@code M == 0}, uses full precision vectors without quantization.
+     * When {@code M > 0}, splits each vector into {@code M} subspaces for compression.
+     * Values: 0 (no PQ), 16, 64, 192.
+     */
     @Param({"0", "16", "64", "192"})
-    private int M; // Number of subspaces for PQ
+    private int M;
 
 
+    /**
+     * Sets up the benchmark by creating random vectors and configuring score providers.
+     * <p>
+     * This method creates the specified number of base vectors and query vectors with random
+     * values. If M&gt;0, it also computes Product Quantization and creates PQ-encoded vectors.
+     * The appropriate score provider is then configured based on whether PQ is used.
+     *
+     * @throws IOException if there is an error during setup
+     */
     @Setup
     public void setup() throws IOException {
         log.info("Creating dataset with dimension: {}, vector count: {}, query count: {}", dimension, vectorCount, queryCount);
@@ -100,6 +148,16 @@ public void setup() throws IOException {
         log.info("Created dataset with dimension: {}, vector count: {}, query count: {}", dimension, vectorCount, queryCount);
     }
 
+    /**
+     * Benchmarks distance calculation using cached search score providers.
+     * <p>
+     * This benchmark measures the performance of calculating distances between query vectors
+     * and all base vectors using a search score provider that caches precomputed values for
+     * the query vector. This represents the typical search scenario where a query is compared
+     * against many candidates.
+     *
+     * @param blackhole JMH blackhole to prevent dead code elimination
+     */
     @Benchmark
     public void cachedDistanceCalculation(Blackhole blackhole) {
         float totalSimilarity = 0;
@@ -115,6 +173,16 @@ public void cachedDistanceCalculation(Blackhole blackhole) {
         blackhole.consume(totalSimilarity);
     }
 
+    /**
+     * Benchmarks distance calculation for diversity scoring.
+     * <p>
+     * This benchmark measures the performance of calculating distances between base vectors
+     * using diversity score providers. This represents the scenario where vectors in the
+     * dataset are compared against each other to assess diversity, such as during graph
+     * construction or result reranking.
+     *
+     * @param blackhole JMH blackhole to prevent dead code elimination
+     */
     @Benchmark
     public void diversityCalculation(Blackhole blackhole) {
         float totalSimilarity = 0;
@@ -130,6 +198,15 @@ public void diversityCalculation(Blackhole blackhole) {
         blackhole.consume(totalSimilarity);
     }
 
+    /**
+     * Creates a random vector with the specified dimension.
+     * <p>
+     * Each component of the vector is assigned a random floating-point value
+     * between 0.0 (inclusive) and 1.0 (exclusive).
+     *
+     * @param dimension the number of dimensions for the vector
+     * @return a new random vector
+     */
     private VectorFloat<?> createRandomVector(int dimension) {
         VectorFloat<?> vector = VECTOR_TYPE_SUPPORT.createFloatVector(dimension);
         for (int i = 0; i < dimension; i++) {