diff --git a/pom.xml b/pom.xml index d05e0fa..8f72be2 100644 --- a/pom.xml +++ b/pom.xml @@ -137,6 +137,32 @@ + + com.diffplug.spotless + spotless-maven-plugin + 2.44.5 + + + + apply + + validate + + + + + + src/**/*.java + + + 1.27.0 + + true + false + + + + org.apache.maven.plugins maven-surefire-plugin diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/BenchmarkConfiguration.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/BenchmarkConfiguration.java index f39caad..cdc92bc 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/BenchmarkConfiguration.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/BenchmarkConfiguration.java @@ -21,7 +21,7 @@ public class BenchmarkConfiguration { public boolean saveResultsOnDisk; public String resultsDirectory; public boolean hasColNames; - public String algoToRun; // keep as String + public String algoToRun; // keep as String public String groundTruthFile; public String cuvsIndexDirPath; public String hnswIndexDirPath; @@ -29,22 +29,26 @@ public class BenchmarkConfiguration { public boolean skipIndexing; public int forceMerge; public boolean enableTieredMerge; + public boolean enableIndexWriterInfoStream; + public int ramBufferSizeMB; // Lucene HNSW parameters - public int hnswMaxConn; // 16 default (max 512) - public int hnswBeamWidth; // 100 default (max 3200) + public int hnswMaxConn; // 16 default (max 512) + public int hnswBeamWidth; // 100 default (max 3200) + public int hnswMergeThreads; // CAGRA parameters public int cagraIntermediateGraphDegree; // 128 default - public int cagraGraphDegree; // 64 default + public int cagraGraphDegree; // 64 default public int cagraITopK; public int cagraSearchWidth; - public int cagraHnswLayers; // layers in CAGRA->HNSW conversion + public int cagraHnswLayers; // layers in CAGRA->HNSW conversion public int efSearch; private boolean isLucene() { return "LUCENE_HNSW".equalsIgnoreCase(algoToRun); } + private boolean isCagra() { return "CAGRA_HNSW".equalsIgnoreCase(algoToRun); } @@ -77,12 +81,22 @@ public String prettyString() { sb.append("Has column names in the dataset file: ").append(hasColNames).append('\n'); sb.append("algoToRun {Choices: HNSW | CAGRA}: ").append(algoToRun).append('\n'); sb.append("Ground Truth file used is: ").append(groundTruthFile).append('\n'); - if (cuvsIndexDirPath != null) sb.append("CuVS index directory path is: ").append(cuvsIndexDirPath).append('\n'); - if (hnswIndexDirPath != null) sb.append("HNSW index directory path is: ").append(hnswIndexDirPath).append('\n'); + if (cuvsIndexDirPath != null) + sb.append("CuVS index directory path is: ").append(cuvsIndexDirPath).append('\n'); + if (hnswIndexDirPath != null) + sb.append("HNSW index directory path is: ").append(hnswIndexDirPath).append('\n'); sb.append("Load vectors in memory before indexing: ").append(loadVectorsInMemory).append('\n'); - sb.append("Skip indexing (and use existing index for search): ").append(skipIndexing).append('\n'); - sb.append("Do force merge while indexing documents [a value < 1 implies no force merge]: ").append(forceMerge).append('\n'); - + sb.append("Skip indexing (and use existing index for search): ") + .append(skipIndexing) + .append('\n'); + sb.append("Do force merge while indexing documents [a value < 1 implies no force merge]: ") + .append(forceMerge) + .append('\n'); + sb.append("Enable TieredMerge: ").append(enableTieredMerge).append('\n'); + sb.append("Num merge threads: ").append(hnswMergeThreads).append('\n'); + sb.append("enableIndexWriterInfoStream: ").append(enableIndexWriterInfoStream).append('\n'); + sb.append("ramBufferSizeMB: ").append(ramBufferSizeMB).append('\n'); + sb.append("------- algo parameters ------\n"); if (isLucene()) { sb.append("hnswMaxConn: ").append(hnswMaxConn).append('\n'); @@ -98,7 +112,10 @@ public String prettyString() { return sb.toString(); } - @Override public String toString() { return prettyString(); } + @Override + public String toString() { + return prettyString(); + } public void debugPrintArguments() { // keep a single source of truth for printing diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FBIvecsReader.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FBIvecsReader.java index 4f887bf..9b057a0 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FBIvecsReader.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FBIvecsReader.java @@ -8,12 +8,11 @@ import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; - -import org.mapdb.IndexTreeList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -//TODO: The three static methods have a lot of common logic, ideally should be combined as just one. +// TODO: The three static methods have a lot of common logic, ideally should be combined as just +// one. public class FBIvecsReader { private static final Logger log = LoggerFactory.getLogger(FBIvecsReader.class.getName()); @@ -176,92 +175,91 @@ public static void readBvecs(String filePath, int numRows, List vectors } // New method to read .fbin files (format: num_vectors, dimension, then vector data) - // Corrected readFbin method for Wiki-88M .fbin files -public static void readFbin(String filePath, int numRows, List vectors) { - log.info("Reading {} from file: {}", numRows, filePath); - - try (InputStream is = new FileInputStream(filePath)) { - // Read num_vectors (first 4 bytes, little endian) - byte[] numVecBytes = is.readNBytes(4); - ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); - int numVectors = numVecBuffer.getInt(); - - // Read dimension (next 4 bytes, little endian) - byte[] dimBytes = is.readNBytes(4); - ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); - int dimension = dimBuffer.getInt(); - - log.info("File header - total vectors: {}, dimension: {}", numVectors, dimension); - - float[] row = new float[dimension]; - int count = 0; - - while (is.available() != 0) { - byte[] vectorBytes = is.readNBytes(dimension * 4); - if (vectorBytes.length != dimension * 4) break; - ByteBuffer bb = ByteBuffer.wrap(vectorBytes).order(ByteOrder.LITTLE_ENDIAN); - for (int i = 0; i < dimension; i++) row[i] = bb.getFloat(); - vectors.add(row.clone()); - count++; - if (numRows != -1 && count == numRows) break; - if (count % 1000 == 0) System.out.print("."); + // Corrected readFbin method for Wiki-88M .fbin files + public static void readFbin(String filePath, int numRows, List vectors) { + log.info("Reading {} from file: {}", numRows, filePath); + + try (InputStream is = new FileInputStream(filePath)) { + // Read num_vectors (first 4 bytes, little endian) + byte[] numVecBytes = is.readNBytes(4); + ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); + int numVectors = numVecBuffer.getInt(); + + // Read dimension (next 4 bytes, little endian) + byte[] dimBytes = is.readNBytes(4); + ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); + int dimension = dimBuffer.getInt(); + + log.info("File header - total vectors: {}, dimension: {}", numVectors, dimension); + + float[] row = new float[dimension]; + int count = 0; + + while (is.available() != 0) { + byte[] vectorBytes = is.readNBytes(dimension * 4); + if (vectorBytes.length != dimension * 4) break; + ByteBuffer bb = ByteBuffer.wrap(vectorBytes).order(ByteOrder.LITTLE_ENDIAN); + for (int i = 0; i < dimension; i++) row[i] = bb.getFloat(); + vectors.add(row.clone()); + count++; + if (numRows != -1 && count == numRows) break; + if (count % 1000 == 0) System.out.print("."); + } + System.out.println(); + log.info("Reading complete. Read {} vectors out of {} in file.", count, numVectors); + } catch (Exception e) { + log.error("Error reading fbin file", e); } - System.out.println(); - log.info("Reading complete. Read {} vectors out of {} in file.", count, numVectors); - } catch (Exception e) { - log.error("Error reading fbin file", e); } -} -// Fixed method to read .ibin files (ground truth neighbors) -public static ArrayList readIbin(String filePath, int numRows) { - log.info("Reading {} from file: {}", numRows, filePath); - ArrayList vectors = new ArrayList(); + // Fixed method to read .ibin files (ground truth neighbors) + public static ArrayList readIbin(String filePath, int numRows) { + log.info("Reading {} from file: {}", numRows, filePath); + ArrayList vectors = new ArrayList(); - try { - InputStream is = new FileInputStream(filePath); + try { + InputStream is = new FileInputStream(filePath); - // For .ibin ground truth files: Read num_vectors first, then dimension - byte[] numVecBytes = is.readNBytes(4); - ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); - int numVectors = numVecBuffer.getInt(); + // For .ibin ground truth files: Read num_vectors first, then dimension + byte[] numVecBytes = is.readNBytes(4); + ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); + int numVectors = numVecBuffer.getInt(); - byte[] dimBytes = is.readNBytes(4); - ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); - int dimension = dimBuffer.getInt(); + byte[] dimBytes = is.readNBytes(4); + ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); + int dimension = dimBuffer.getInt(); - log.info("Ground truth file - total vectors: {}, dimension: {}", numVectors, dimension); + log.info("Ground truth file - total vectors: {}, dimension: {}", numVectors, dimension); - int count = 0; - while (is.available() != 0 && (numRows == -1 || count < numRows)) { - // Read dimension * 4 bytes (int values) - byte[] vectorBytes = is.readNBytes(dimension * 4); - if (vectorBytes.length != dimension * 4) { - break; // End of file - } + int count = 0; + while (is.available() != 0 && (numRows == -1 || count < numRows)) { + // Read dimension * 4 bytes (int values) + byte[] vectorBytes = is.readNBytes(dimension * 4); + if (vectorBytes.length != dimension * 4) { + break; // End of file + } - ByteBuffer bb = ByteBuffer.wrap(vectorBytes); - bb.order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer bb = ByteBuffer.wrap(vectorBytes); + bb.order(ByteOrder.LITTLE_ENDIAN); - int[] row = new int[dimension]; - for (int i = 0; i < dimension; i++) { - row[i] = bb.getInt(); - } + int[] row = new int[dimension]; + for (int i = 0; i < dimension; i++) { + row[i] = bb.getInt(); + } - vectors.add(row); - count++; + vectors.add(row); + count++; - if (count % 1000 == 0) { - System.out.print("."); + if (count % 1000 == 0) { + System.out.print("."); + } } + System.out.println(); + is.close(); + log.info("Reading complete. Read {} vectors out of {} total.", count, numVectors); + } catch (Exception e) { + e.printStackTrace(); } - System.out.println(); - is.close(); - log.info("Reading complete. Read {} vectors out of {} total.", count, numVectors); - } catch (Exception e) { - e.printStackTrace(); + return vectors; } - return vectors; -} - } diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FvecsStreamingVectorProvider.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FvecsStreamingVectorProvider.java index d6c1a68..fad109f 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FvecsStreamingVectorProvider.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/FvecsStreamingVectorProvider.java @@ -1,192 +1,207 @@ -package com.searchscale.lucene.cuvs.benchmarks; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.zip.GZIPInputStream; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Streams vectors directly from .fvecs files without creating intermediate MapDB files - */ -public class FvecsStreamingVectorProvider implements VectorProvider { - private static final Logger log = LoggerFactory.getLogger(FvecsStreamingVectorProvider.class.getName()); - - private final String filePath; - private final int dimension; - private final int vectorCount; - private final long vectorSize; // size of one vector in bytes (4 + 4*dimension) - private final boolean isCompressed; - - public FvecsStreamingVectorProvider(String filePath, int maxVectors) throws IOException { - this.filePath = filePath; - this.isCompressed = filePath.endsWith(".gz"); - - // Read dimension from first vector - try (FileInputStream fis = new FileInputStream(filePath)) { - java.io.InputStream is = isCompressed ? new GZIPInputStream(fis) : fis; - - byte[] dimBytes = new byte[4]; - is.read(dimBytes); - ByteBuffer bb = ByteBuffer.wrap(dimBytes); - bb.order(ByteOrder.LITTLE_ENDIAN); - this.dimension = bb.getInt(); - - log.info("Detected dimension: {} from file: {}", dimension, filePath); - - this.vectorSize = 4 + 4L * dimension; // dimension int + dimension floats - } - - // Calculate total vector count - if (isCompressed) { - // For compressed files, we need to read through to count - this.vectorCount = countVectorsInCompressedFile(maxVectors); - } else { - // For uncompressed files, we can calculate from file size - try (RandomAccessFile raf = new RandomAccessFile(filePath, "r")) { - long fileSize = raf.length(); - long totalVectors = fileSize / vectorSize; - this.vectorCount = maxVectors > 0 ? Math.min((int)totalVectors, maxVectors) : (int)totalVectors; - } - } - - log.info("FvecsStreamingVectorProvider initialized: {} vectors, {} dimensions", vectorCount, dimension); - } - - private int countVectorsInCompressedFile(int maxVectors) throws IOException { - int count = 0; - try (FileInputStream fis = new FileInputStream(filePath); - GZIPInputStream gzis = new GZIPInputStream(fis)) { - - // Skip dimension of first vector (already read) - gzis.skip(4L * dimension); - - byte[] buffer = new byte[4096]; - long bytesPerVector = vectorSize; - long totalBytesRead = 4 + 4L * dimension; // dimension int + first vector data - - while (gzis.available() > 0 && (maxVectors <= 0 || count < maxVectors - 1)) { - int bytesRead = gzis.read(buffer); - if (bytesRead == -1) break; - - totalBytesRead += bytesRead; - count = (int)(totalBytesRead / bytesPerVector); - - if (maxVectors > 0 && count >= maxVectors) { - count = maxVectors; - break; - } - } - } - return Math.max(1, count); // At least 1 vector (the first one we read dimension from) - } - - @Override - public float[] get(int index) throws IOException { - if (index < 0 || index >= vectorCount) { - throw new IndexOutOfBoundsException("Index " + index + " out of bounds [0, " + vectorCount + ")"); - } - - if (isCompressed) { - return getFromCompressedFile(index); - } else { - return getFromUncompressedFile(index); - } - } - - private float[] getFromUncompressedFile(int index) throws IOException { - try (RandomAccessFile raf = new RandomAccessFile(filePath, "r"); - FileChannel channel = raf.getChannel()) { - - long position = index * vectorSize; - channel.position(position); - - ByteBuffer buffer = ByteBuffer.allocate((int)vectorSize); - buffer.order(ByteOrder.LITTLE_ENDIAN); - - channel.read(buffer); - buffer.flip(); - - // Read and verify dimension - int fileDimension = buffer.getInt(); - if (fileDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + index + - ": expected " + dimension + ", got " + fileDimension); - } - - // Read vector data - float[] vector = new float[dimension]; - for (int i = 0; i < dimension; i++) { - vector[i] = buffer.getFloat(); - } - - return vector; - } - } - - private float[] getFromCompressedFile(int index) throws IOException { - // For compressed files, we need to read sequentially from the beginning - // This is less efficient but necessary for compressed streams - try (FileInputStream fis = new FileInputStream(filePath); - GZIPInputStream gzis = new GZIPInputStream(fis)) { - - ByteBuffer dimBuffer = ByteBuffer.allocate(4); - dimBuffer.order(ByteOrder.LITTLE_ENDIAN); - - // Read through vectors until we reach the desired index - for (int i = 0; i <= index; i++) { - // Read dimension - dimBuffer.clear(); - if (gzis.read(dimBuffer.array()) != 4) { - throw new IOException("Unexpected end of file reading vector " + i); - } - - int fileDimension = dimBuffer.getInt(); - if (fileDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + i + - ": expected " + dimension + ", got " + fileDimension); - } - - // Read vector data - byte[] vectorBytes = new byte[dimension * 4]; - if (gzis.read(vectorBytes) != vectorBytes.length) { - throw new IOException("Unexpected end of file reading vector data for vector " + i); - } - - if (i == index) { - // This is the vector we want - ByteBuffer vectorBuffer = ByteBuffer.wrap(vectorBytes); - vectorBuffer.order(ByteOrder.LITTLE_ENDIAN); - - float[] vector = new float[dimension]; - for (int j = 0; j < dimension; j++) { - vector[j] = vectorBuffer.getFloat(); - } - return vector; - } - // Otherwise, we just skip this vector and continue - } - - throw new IOException("Could not read vector " + index); - } - } - - @Override - public int size() { - return vectorCount; - } - - @Override - public void close() throws IOException { - // No resources to close for file-based streaming - } - - public int getDimension() { - return dimension; - } -} \ No newline at end of file +package com.searchscale.lucene.cuvs.benchmarks; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.zip.GZIPInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Streams vectors directly from .fvecs files without creating intermediate MapDB files + */ +public class FvecsStreamingVectorProvider implements VectorProvider { + private static final Logger log = + LoggerFactory.getLogger(FvecsStreamingVectorProvider.class.getName()); + + private final String filePath; + private final int dimension; + private final int vectorCount; + private final long vectorSize; // size of one vector in bytes (4 + 4*dimension) + private final boolean isCompressed; + + public FvecsStreamingVectorProvider(String filePath, int maxVectors) throws IOException { + this.filePath = filePath; + this.isCompressed = filePath.endsWith(".gz"); + + // Read dimension from first vector + try (FileInputStream fis = new FileInputStream(filePath)) { + java.io.InputStream is = isCompressed ? new GZIPInputStream(fis) : fis; + + byte[] dimBytes = new byte[4]; + is.read(dimBytes); + ByteBuffer bb = ByteBuffer.wrap(dimBytes); + bb.order(ByteOrder.LITTLE_ENDIAN); + this.dimension = bb.getInt(); + + log.info("Detected dimension: {} from file: {}", dimension, filePath); + + this.vectorSize = 4 + 4L * dimension; // dimension int + dimension floats + } + + // Calculate total vector count + if (isCompressed) { + // For compressed files, we need to read through to count + this.vectorCount = countVectorsInCompressedFile(maxVectors); + } else { + // For uncompressed files, we can calculate from file size + try (RandomAccessFile raf = new RandomAccessFile(filePath, "r")) { + long fileSize = raf.length(); + long totalVectors = fileSize / vectorSize; + this.vectorCount = + maxVectors > 0 ? Math.min((int) totalVectors, maxVectors) : (int) totalVectors; + } + } + + log.info( + "FvecsStreamingVectorProvider initialized: {} vectors, {} dimensions", + vectorCount, + dimension); + } + + private int countVectorsInCompressedFile(int maxVectors) throws IOException { + int count = 0; + try (FileInputStream fis = new FileInputStream(filePath); + GZIPInputStream gzis = new GZIPInputStream(fis)) { + + // Skip dimension of first vector (already read) + gzis.skip(4L * dimension); + + byte[] buffer = new byte[4096]; + long bytesPerVector = vectorSize; + long totalBytesRead = 4 + 4L * dimension; // dimension int + first vector data + + while (gzis.available() > 0 && (maxVectors <= 0 || count < maxVectors - 1)) { + int bytesRead = gzis.read(buffer); + if (bytesRead == -1) break; + + totalBytesRead += bytesRead; + count = (int) (totalBytesRead / bytesPerVector); + + if (maxVectors > 0 && count >= maxVectors) { + count = maxVectors; + break; + } + } + } + return Math.max(1, count); // At least 1 vector (the first one we read dimension from) + } + + @Override + public float[] get(int index) throws IOException { + if (index < 0 || index >= vectorCount) { + throw new IndexOutOfBoundsException( + "Index " + index + " out of bounds [0, " + vectorCount + ")"); + } + + if (isCompressed) { + return getFromCompressedFile(index); + } else { + return getFromUncompressedFile(index); + } + } + + private float[] getFromUncompressedFile(int index) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(filePath, "r"); + FileChannel channel = raf.getChannel()) { + + long position = index * vectorSize; + channel.position(position); + + ByteBuffer buffer = ByteBuffer.allocate((int) vectorSize); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + channel.read(buffer); + buffer.flip(); + + // Read and verify dimension + int fileDimension = buffer.getInt(); + if (fileDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + index + + ": expected " + + dimension + + ", got " + + fileDimension); + } + + // Read vector data + float[] vector = new float[dimension]; + for (int i = 0; i < dimension; i++) { + vector[i] = buffer.getFloat(); + } + + return vector; + } + } + + private float[] getFromCompressedFile(int index) throws IOException { + // For compressed files, we need to read sequentially from the beginning + // This is less efficient but necessary for compressed streams + try (FileInputStream fis = new FileInputStream(filePath); + GZIPInputStream gzis = new GZIPInputStream(fis)) { + + ByteBuffer dimBuffer = ByteBuffer.allocate(4); + dimBuffer.order(ByteOrder.LITTLE_ENDIAN); + + // Read through vectors until we reach the desired index + for (int i = 0; i <= index; i++) { + // Read dimension + dimBuffer.clear(); + if (gzis.read(dimBuffer.array()) != 4) { + throw new IOException("Unexpected end of file reading vector " + i); + } + + int fileDimension = dimBuffer.getInt(); + if (fileDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + i + + ": expected " + + dimension + + ", got " + + fileDimension); + } + + // Read vector data + byte[] vectorBytes = new byte[dimension * 4]; + if (gzis.read(vectorBytes) != vectorBytes.length) { + throw new IOException("Unexpected end of file reading vector data for vector " + i); + } + + if (i == index) { + // This is the vector we want + ByteBuffer vectorBuffer = ByteBuffer.wrap(vectorBytes); + vectorBuffer.order(ByteOrder.LITTLE_ENDIAN); + + float[] vector = new float[dimension]; + for (int j = 0; j < dimension; j++) { + vector[j] = vectorBuffer.getFloat(); + } + return vector; + } + // Otherwise, we just skip this vector and continue + } + + throw new IOException("Could not read vector " + index); + } + } + + @Override + public int size() { + return vectorCount; + } + + @Override + public void close() throws IOException { + // No resources to close for file-based streaming + } + + public int getDimension() { + return dimension; + } +} diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Lucene101AcceleratedHNSWCodec.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Lucene101AcceleratedHNSWCodec.java index 52d677f..16fc74c 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Lucene101AcceleratedHNSWCodec.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Lucene101AcceleratedHNSWCodec.java @@ -4,20 +4,19 @@ */ package com.searchscale.lucene.cuvs.benchmarks; +import com.nvidia.cuvs.LibraryException; +import com.nvidia.cuvs.lucene.AcceleratedHNSWParams; +import com.nvidia.cuvs.lucene.Lucene99AcceleratedHNSWVectorsFormat; +import com.nvidia.cuvs.lucene.LuceneProvider; import java.lang.reflect.InvocationTargetException; import java.util.logging.Level; import java.util.logging.Logger; - import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; -import com.nvidia.cuvs.LibraryException; -import com.nvidia.cuvs.lucene.Lucene99AcceleratedHNSWVectorsFormat; -import com.nvidia.cuvs.lucene.LuceneProvider; - /** - * PLEASE NOTE: ADDING THIS CODEC CLASS IN THIS REPO IS A TEMPORARY MEASURE AS + * PLEASE NOTE: ADDING THIS CODEC CLASS IN THIS REPO IS A TEMPORARY MEASURE AS * THE RELEASED ARTIFACTS DO NOT HAVE CODECS EXPOSED * */ @@ -99,9 +98,16 @@ private void initializeFormat( int maxConn, int beamWidth) { try { - format = - new Lucene99AcceleratedHNSWVectorsFormat( - cuvsWriterThreads, intGraphDegree, graphDegree, hnswLayers, maxConn, beamWidth); + AcceleratedHNSWParams params = + new AcceleratedHNSWParams.Builder() + .withWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withHNSWLayer(hnswLayers) + .withMaxConn(maxConn) + .withBeamWidth(beamWidth) + .build(); + format = new Lucene99AcceleratedHNSWVectorsFormat(params); setKnnFormat(format); } catch (LibraryException ex) { log.log( diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java index 75b74b6..092418e 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java @@ -2,6 +2,7 @@ import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import com.nvidia.cuvs.lucene.GPUKnnFloatVectorQuery; import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -21,8 +22,8 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; - import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; @@ -60,15 +61,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.nvidia.cuvs.lucene.GPUKnnFloatVectorQuery; - public class LuceneCuvsBenchmarks { private static final Logger log = LoggerFactory.getLogger(LuceneCuvsBenchmarks.class.getName()); - private static boolean RESULTS_DEBUGGING = false; // when enabled, titles are indexed and printed after search - private static boolean INDEX_WRITER_INFO_STREAM = true; // when enabled, prints information about merges, deletes, - /** * Uses reflection to bypass the 2048MB hard limit for per-thread RAM buffer. * This is a workaround to allow larger segments to be created before flushing. @@ -78,7 +74,7 @@ private static void setPerThreadRAMLimit(IndexWriterConfig config, int limitMB) // First, try to find the field in IndexWriterConfig java.lang.reflect.Field field = null; Class clazz = config.getClass(); - + // Try to find the field in the class hierarchy while (clazz != null && field == null) { try { @@ -88,7 +84,7 @@ private static void setPerThreadRAMLimit(IndexWriterConfig config, int limitMB) clazz = clazz.getSuperclass(); } } - + if (field == null) { // If not found in IndexWriterConfig, try LiveIndexWriterConfig clazz = config.getClass().getSuperclass(); @@ -100,7 +96,7 @@ private static void setPerThreadRAMLimit(IndexWriterConfig config, int limitMB) } } } - + if (field != null) { field.setAccessible(true); field.setInt(config, limitMB); @@ -113,7 +109,6 @@ private static void setPerThreadRAMLimit(IndexWriterConfig config, int limitMB) } } - @SuppressWarnings("resource") public static void main(String[] args) throws Throwable { if (args.length < 1 || args.length > 3) { @@ -121,13 +116,14 @@ public static void main(String[] args) throws Throwable { return; } - BenchmarkConfiguration config = Util.newObjectMapper().readValue(new File(args[0]), BenchmarkConfiguration.class); - + BenchmarkConfiguration config = + Util.newObjectMapper().readValue(new File(args[0]), BenchmarkConfiguration.class); + // Override benchmarkID if provided as command line argument if (args.length >= 2) { config.benchmarkID = args[1]; } - + // Override resultsDirectory if provided as command line argument if (args.length >= 3) { config.resultsDirectory = args[2]; @@ -163,10 +159,13 @@ public static void main(String[] args) throws Throwable { } vectorProvider = new MemoryVectorProvider(loadedVectors); - log.info("Time taken to load {} vectors in-memory: {} ms", loadedVectors.size(), (System.currentTimeMillis() - start)); + log.info( + "Time taken to load {} vectors in-memory: {} ms", + loadedVectors.size(), + (System.currentTimeMillis() - start)); } else { log.info("Creating streaming vector provider (loadVectorsInMemory is disabled)"); - vectorProvider = new StreamingVectorProvider(config.datasetFile, config.numDocs); + vectorProvider = new StreamingVectorProvider(config.datasetFile, config.numDocs); } titles.add(config.vectorColName); @@ -193,7 +192,8 @@ public static void main(String[] args) throws Throwable { } if (config.loadVectorsInMemory) { - log.info("Mapdb loaded. Now loading all vectors in memory (loadVectorsInMemory is enabled)"); + log.info( + "Mapdb loaded. Now loading all vectors in memory (loadVectorsInMemory is enabled)"); long start = System.currentTimeMillis(); List loadedVectors = new ArrayList(vectors.size()); for (int i = 0; i < vectors.size(); i++) { @@ -201,7 +201,9 @@ public static void main(String[] args) throws Throwable { } vectorProvider = new MemoryVectorProvider(loadedVectors); db.close(); - log.info("Time taken to load the vectors in-memory is: {}", (System.currentTimeMillis() - start)); + log.info( + "Time taken to load the vectors in-memory is: {}", + (System.currentTimeMillis() - start)); } else { vectorProvider = new MapDBVectorProvider(vectors, db); } @@ -209,96 +211,104 @@ public static void main(String[] args) throws Throwable { try { - log.info("Time taken for parsing/loading dataset is {} ms", (System.currentTimeMillis() - parseStartTime)); + log.info( + "Time taken for parsing/loading dataset is {} ms", + (System.currentTimeMillis() - parseStartTime)); // [2] Benchmarking setup // HNSW Writer: IndexWriterConfig luceneHNSWWriterConfig = new IndexWriterConfig(new StandardAnalyzer()); luceneHNSWWriterConfig.setCodec(getLuceneHnswCodec(config)); - //luceneHNSWWriterConfig.setUseCompoundFile(false); + // luceneHNSWWriterConfig.setUseCompoundFile(false); // Configure to flush based on document count only // For 4M docs with 768-dim float vectors, we need approximately: // 4M * 768 * 4 bytes = ~12GB just for vectors, plus overhead // Set RAM buffer to 32GB to ensure doc count triggers flush first - + luceneHNSWWriterConfig.setRAMBufferSizeMB(config.ramBufferSizeMB); luceneHNSWWriterConfig.setMaxBufferedDocs(config.flushFreq); - luceneHNSWWriterConfig.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); + luceneHNSWWriterConfig.setCheckPendingFlushUpdate(false); if (config.forceMerge > 0 || config.enableTieredMerge) { - luceneHNSWWriterConfig.setMergePolicy(new TieredMergePolicy()); + luceneHNSWWriterConfig.setMergePolicy(new TieredMergePolicy()); } else { luceneHNSWWriterConfig.setMergePolicy(NoMergePolicy.INSTANCE); } - + // Use reflection to bypass the 2048MB per-thread limit and set it to 10GB setPerThreadRAMLimit(luceneHNSWWriterConfig, 10240); // 10GB per thread - log.info("Configured HNSW writer - MaxBufferedDocs: {}, RAMBufferSizeMB: {}, PerThreadRAMLimit: {} MB", - config.flushFreq, luceneHNSWWriterConfig.getRAMBufferSizeMB(), - luceneHNSWWriterConfig.getRAMPerThreadHardLimitMB()); + log.info( + "Configured HNSW writer - MaxBufferedDocs: {}, RAMBufferSizeMB: {}, PerThreadRAMLimit: {}" + + " MB", + config.flushFreq, + luceneHNSWWriterConfig.getRAMBufferSizeMB(), + luceneHNSWWriterConfig.getRAMPerThreadHardLimitMB()); IndexWriterConfig cuvsIndexWriterConfig = new IndexWriterConfig(new StandardAnalyzer()); cuvsIndexWriterConfig.setCodec(getCuVSCodec(config)); - //cuvsIndexWriterConfig.setUseCompoundFile(false); + // cuvsIndexWriterConfig.setUseCompoundFile(false); // Configure to flush based on document count only // For 4M docs with 768-dim float vectors, we need approximately: // 4M * 768 * 4 bytes = ~12GB just for vectors, plus overhead // Set RAM buffer to 32GB to ensure doc count triggers flush first + cuvsIndexWriterConfig.setRAMBufferSizeMB(config.ramBufferSizeMB); cuvsIndexWriterConfig.setMaxBufferedDocs(config.flushFreq); - cuvsIndexWriterConfig.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); + cuvsIndexWriterConfig.setCheckPendingFlushUpdate(false); if (config.forceMerge > 0 || config.enableTieredMerge) { - cuvsIndexWriterConfig.setMergePolicy(new TieredMergePolicy()); + cuvsIndexWriterConfig.setMergePolicy(new TieredMergePolicy()); } else { - cuvsIndexWriterConfig.setMergePolicy(NoMergePolicy.INSTANCE); + cuvsIndexWriterConfig.setMergePolicy(NoMergePolicy.INSTANCE); } - + // Use reflection to bypass the 2048MB per-thread limit and set it to 10GB setPerThreadRAMLimit(cuvsIndexWriterConfig, 10240); // 10GB per thread - log.info("Configured CuVS writer - MaxBufferedDocs: {}, RAMBufferSizeMB: {}, PerThreadRAMLimit: {} MB", - config.flushFreq, cuvsIndexWriterConfig.getRAMBufferSizeMB(), - cuvsIndexWriterConfig.getRAMPerThreadHardLimitMB()); - - if (INDEX_WRITER_INFO_STREAM) { + log.info( + "Configured CuVS writer - MaxBufferedDocs: {}, RAMBufferSizeMB: {}, PerThreadRAMLimit: {}" + + " MB", + config.flushFreq, + cuvsIndexWriterConfig.getRAMBufferSizeMB(), + cuvsIndexWriterConfig.getRAMPerThreadHardLimitMB()); + + if (config.enableIndexWriterInfoStream) { luceneHNSWWriterConfig.setInfoStream(new PrintStreamInfoStream(System.out)); cuvsIndexWriterConfig.setInfoStream(new PrintStreamInfoStream(System.out)); } - if (!config.skipIndexing) { - + if (!config.skipIndexing) { - IndexWriter luceneHnswIndexWriter = null; - IndexWriter cuvsIndexWriter = null; + IndexWriter luceneHnswIndexWriter = null; + IndexWriter cuvsIndexWriter = null; - - - if (config.algoToRun.equalsIgnoreCase("LUCENE_HNSW")) { - if (!config.createIndexInMemory) { - Path hnswIndex = Path.of(config.hnswIndexDirPath); - luceneHnswIndexWriter = new IndexWriter(FSDirectory.open(hnswIndex), luceneHNSWWriterConfig); - } else { - luceneHnswIndexWriter = new IndexWriter(new ByteBuffersDirectory(), luceneHNSWWriterConfig); - } - } else if (config.algoToRun.equalsIgnoreCase("CAGRA_HNSW")) { - if (!config.createIndexInMemory) { - Path cuvsIndex = Path.of(config.cuvsIndexDirPath); - cuvsIndexWriter = new IndexWriter(FSDirectory.open(cuvsIndex), cuvsIndexWriterConfig); - } else { - cuvsIndexWriter = new IndexWriter(new ByteBuffersDirectory(), cuvsIndexWriterConfig); + if (config.algoToRun.equalsIgnoreCase("LUCENE_HNSW")) { + if (!config.createIndexInMemory) { + Path hnswIndex = Path.of(config.hnswIndexDirPath); + luceneHnswIndexWriter = + new IndexWriter(FSDirectory.open(hnswIndex), luceneHNSWWriterConfig); + } else { + luceneHnswIndexWriter = + new IndexWriter(new ByteBuffersDirectory(), luceneHNSWWriterConfig); + } + } else if (config.algoToRun.equalsIgnoreCase("CAGRA_HNSW")) { + if (!config.createIndexInMemory) { + Path cuvsIndex = Path.of(config.cuvsIndexDirPath); + cuvsIndexWriter = new IndexWriter(FSDirectory.open(cuvsIndex), cuvsIndexWriterConfig); + } else { + cuvsIndexWriter = new IndexWriter(new ByteBuffersDirectory(), cuvsIndexWriterConfig); + } } - } - - IndexWriter writer; + IndexWriter writer; - if ("LUCENE_HNSW".equalsIgnoreCase(config.algoToRun)) { - writer = luceneHnswIndexWriter; - } else if ("CAGRA_HNSW".equalsIgnoreCase(config.algoToRun)) { - writer = cuvsIndexWriter; - } else { - throw new IllegalArgumentException("Please pass an acceptable option for `algoToRun`. Choices: LUCENE_HNSW, CAGRA_HNSW"); - } + if ("LUCENE_HNSW".equalsIgnoreCase(config.algoToRun)) { + writer = luceneHnswIndexWriter; + } else if ("CAGRA_HNSW".equalsIgnoreCase(config.algoToRun)) { + writer = cuvsIndexWriter; + } else { + throw new IllegalArgumentException( + "Please pass an acceptable option for `algoToRun`. Choices: LUCENE_HNSW, CAGRA_HNSW"); + } var formatName = writer.getConfig().getCodec().knnVectorsFormat().getName(); - + boolean isCuVSIndexing = formatName.equals("Lucene99AcceleratedHNSWVectorsFormat"); log.info("Indexing documents using {} ...", formatName); @@ -313,17 +323,21 @@ public static void main(String[] args) throws Throwable { log.info("Time taken for index building (end to end): {} ms", indexTimeTaken); - boolean usingFSDirectory = luceneHnswIndexWriter != null - ? luceneHnswIndexWriter.getDirectory() instanceof FSDirectory - : cuvsIndexWriter.getDirectory() instanceof FSDirectory; + boolean usingFSDirectory = + luceneHnswIndexWriter != null + ? luceneHnswIndexWriter.getDirectory() instanceof FSDirectory + : cuvsIndexWriter.getDirectory() instanceof FSDirectory; try { if (usingFSDirectory) { - Path indexPath = writer == cuvsIndexWriter ? Paths.get(config.cuvsIndexDirPath) - : Paths.get(config.hnswIndexDirPath); + Path indexPath = + writer == cuvsIndexWriter + ? Paths.get(config.cuvsIndexDirPath) + : Paths.get(config.hnswIndexDirPath); long directorySize; try (var stream = Files.walk(indexPath, FileVisitOption.FOLLOW_LINKS)) { - directorySize = stream.filter(p -> p.toFile().isFile()).mapToLong(p -> p.toFile().length()).sum(); + directorySize = + stream.filter(p -> p.toFile().isFile()).mapToLong(p -> p.toFile().length()).sum(); } double directorySizeGB = directorySize / 1_073_741_824.0; if (writer == cuvsIndexWriter) { @@ -334,22 +348,36 @@ public static void main(String[] args) throws Throwable { log.info("Size of {}: {} GB", indexPath.toString(), directorySizeGB); } } catch (IOException e) { - log.error("Failed to calculate directory size for {}", - writer == cuvsIndexWriter ? config.cuvsIndexDirPath : config.hnswIndexDirPath, e); + log.error( + "Failed to calculate directory size for {}", + writer == cuvsIndexWriter ? config.cuvsIndexDirPath : config.hnswIndexDirPath, + e); } - } - - Directory indexDir = MMapDirectory.open("CAGRA_HNSW".equals(config.algoToRun) ? Path.of(config.cuvsIndexDirPath) : Path.of(config.hnswIndexDirPath)); + } + + Directory indexDir = + MMapDirectory.open( + "CAGRA_HNSW".equals(config.algoToRun) + ? Path.of(config.cuvsIndexDirPath) + : Path.of(config.hnswIndexDirPath)); log.info("Index directory is: {} (using memory-mapped files)", indexDir); log.info("Querying documents using {} ...", config.algoToRun); // Always use standard Lucene search since we always create Lucene HNSW indexes - search(indexDir, config, false, metrics, queryResults, - Util.readGroundTruthFile(config.groundTruthFile)); - - Util.calculateRecallAccuracy(queryResults, metrics, "CAGRA_HNSW".equalsIgnoreCase(config.algoToRun)); - - String resultsJson = Util.newObjectMapper().writerWithDefaultPrettyPrinter() - .writeValueAsString(Map.of("configuration", config, "metrics", metrics)); + search( + indexDir, + config, + false, + metrics, + queryResults, + Util.readGroundTruthFile(config.groundTruthFile)); + + Util.calculateRecallAccuracy( + queryResults, metrics, "CAGRA_HNSW".equalsIgnoreCase(config.algoToRun)); + + String resultsJson = + Util.newObjectMapper() + .writerWithDefaultPrettyPrinter() + .writeValueAsString(Map.of("configuration", config, "metrics", metrics)); if (config.saveResultsOnDisk) { // Use the resultsDirectory directly if provided @@ -361,20 +389,19 @@ public static void main(String[] args) throws Throwable { // Save results.json directly to the specified directory FileUtils.write( - new File(results.toString() + "/results.json"), - resultsJson, Charset.forName("UTF-8")); - - // Save CSV with neighbors data + new File(results.toString() + "/results.json"), resultsJson, Charset.forName("UTF-8")); + + // Save CSV with neighbors data Util.writeCSV(queryResults, results.toString() + "/neighbors.csv"); - + log.info("Results saved to directory: {}", resultsDir); } log.info("\n-----\nOverall metrics: " + metrics + "\nMetrics: \n" + resultsJson + "\n-----"); - + // Close the index directory before cleaning indexDir.close(); - + // Clean index directory after benchmarks complete if requested if (config.cleanIndexDirectory && !config.createIndexInMemory) { Path indexPath = null; @@ -383,7 +410,7 @@ public static void main(String[] args) throws Throwable { } else if (config.algoToRun.equalsIgnoreCase("CAGRA_HNSW")) { indexPath = Path.of(config.cuvsIndexDirPath); } - + if (indexPath != null) { try { log.info("Cleaning index directory: {}", indexPath); @@ -401,70 +428,94 @@ public static void main(String[] args) throws Throwable { } } - private static void indexDocuments(IndexWriter writer, BenchmarkConfiguration config, List titles, - VectorProvider vectorProvider) throws IOException, InterruptedException { + private static void indexDocuments( + IndexWriter writer, + BenchmarkConfiguration config, + List titles, + VectorProvider vectorProvider) + throws IOException, InterruptedException { int threads = config.numIndexThreads; ExecutorService pool = Executors.newFixedThreadPool(threads); AtomicInteger numDocsIndexed = new AtomicInteger(0); log.info("Starting indexing with {} threads.", threads); - log.info("IndexWriter config - MaxBufferedDocs: {}, RAMBufferSizeMB: {}", - writer.getConfig().getMaxBufferedDocs(), writer.getConfig().getRAMBufferSizeMB()); + log.info( + "IndexWriter config - MaxBufferedDocs: {}, RAMBufferSizeMB: {}, ICPF: {}", + writer.getConfig().getMaxBufferedDocs(), + writer.getConfig().getRAMBufferSizeMB(), + writer.getConfig().isCheckPendingFlushOnUpdate()); final int numDocsToIndex = Math.min(config.numDocs, vectorProvider.size()); - + StopWatch st = StopWatch.createStarted(); for (int i = 0; i < threads; i++) { - pool.submit(() -> { - int localCount = 0; - while (true) { - int id = numDocsIndexed.getAndIncrement(); - if (id >= numDocsToIndex) { - break; // done - } - float[] vector; - try { - vector = Objects.requireNonNull(vectorProvider.get(id)); - localCount++; - } catch (IOException e) { - throw new UncheckedIOException("Failed to read vector at index " + id, e); - } - Document doc = new Document(); - doc.add(new StringField("id", String.valueOf(id), Field.Store.YES)); - doc.add(new KnnFloatVectorField(config.vectorColName, vector, EUCLIDEAN)); - if (RESULTS_DEBUGGING) - doc.add(new StringField("title", titles.get(id), Field.Store.YES)); - try { - writer.addDocument(doc); - if ((id + 1) % 25000 == 0) { - log.info("Done indexing {} documents. Pending docs: {}", (id + 1), writer.getPendingNumDocs()); + pool.submit( + () -> { + while (true) { + int id = numDocsIndexed.getAndIncrement(); + if (id >= numDocsToIndex) { + break; // done + } + float[] vector; + try { + vector = Objects.requireNonNull(vectorProvider.get(id)); + } catch (IOException e) { + throw new UncheckedIOException("Failed to read vector at index " + id, e); + } + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(id), Field.Store.YES)); + doc.add(new KnnFloatVectorField(config.vectorColName, vector, EUCLIDEAN)); + try { + writer.addDocument(doc); + if ((id + 1) % 25000 == 0) { + log.info( + "Done indexing {} documents. Pending docs: {}", + (id + 1), + writer.getPendingNumDocs()); + } + // Log when we expect a flush + if ((id + 1) == config.flushFreq || (id + 1) == 2 * config.flushFreq) { + log.info("Expected flush point reached at {} documents", (id + 1)); + } + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } } - // Log when we expect a flush - if ((id + 1) == config.flushFreq || (id + 1) == 2 * config.flushFreq) { - log.info("Expected flush point reached at {} documents", (id + 1)); - } - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } - } - }); + }); } + System.out.println("------------- !!!!!! idi1 " + st.getTime()); + pool.shutdown(); pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); + System.out.println("------------- !!!!!! idi2 " + st.getTime()); + if (config.forceMerge > 0) { - log.info("Force merge is enabled."); - writer.forceMerge(config.forceMerge); + log.info("Force merge is enabled."); + writer.forceMerge(config.forceMerge); } - - // log.info("Calling forceMerge(1)."); - // writer.forceMerge(1); - log.info("Calling commit."); - writer.commit(); + + System.out.println("------------- !!!!!! idi3 " + st.getTime()); + + if (writer.hasUncommittedChanges()) { + StopWatch st1 = StopWatch.createStarted(); + log.info("Calling commit."); + writer.commit(); + st1.stop(); + System.out.println("----------- !!!!!!!!!!! commit " + st1.getTime(TimeUnit.MILLISECONDS)); + } + st.stop(); + System.out.println( + "----------- !!!!!!!!!!! indexDocuments internal " + st.getTime(TimeUnit.MILLISECONDS)); writer.close(); } - private static void search(Directory directory, BenchmarkConfiguration config, boolean useCuVS, - Map metrics, List queryResults, List groundTruth) { - + private static void search( + Directory directory, + BenchmarkConfiguration config, + boolean useCuVS, + Map metrics, + List queryResults, + List groundTruth) { + DB db = null; try (IndexReader indexReader = DirectoryReader.open(directory)) { IndexSearcher indexSearcher = new IndexSearcher(indexReader); @@ -477,16 +528,17 @@ private static void search(Directory directory, BenchmarkConfiguration config, b db = DBMaker.fileDB(queryMapdbFile).make(); queries = db.indexTreeList("vectors", SERIALIZER.FLOAT_ARRAY).createOrOpen(); - if (config.queryFile.endsWith(".csv")) { - for (String line : FileUtils.readFileToString(new File(config.queryFile), "UTF-8").split("\n")) { - queries.add(Util.parseFloatArrayFromStringArray(line)); - } - } else if (config.queryFile.contains("fvecs")) { - FBIvecsReader.readFvecs(config.queryFile, -1, queries); - } else if (config.queryFile.contains("fbin")) { - FBIvecsReader.readFbin(config.queryFile, -1, queries); - } else if (config.queryFile.contains("bvecs")) { - FBIvecsReader.readBvecs(config.queryFile, -1, queries); + if (config.queryFile.endsWith(".csv")) { + for (String line : + FileUtils.readFileToString(new File(config.queryFile), "UTF-8").split("\n")) { + queries.add(Util.parseFloatArrayFromStringArray(line)); + } + } else if (config.queryFile.contains("fvecs")) { + FBIvecsReader.readFvecs(config.queryFile, -1, queries); + } else if (config.queryFile.contains("fbin")) { + FBIvecsReader.readFbin(config.queryFile, -1, queries); + } else if (config.queryFile.contains("bvecs")) { + FBIvecsReader.readBvecs(config.queryFile, -1, queries); } log.info("Mapdb file created with {} number of queries", queries.size()); } else { @@ -497,94 +549,142 @@ private static void search(Directory directory, BenchmarkConfiguration config, b } int qThreads = config.queryThreads; - if (useCuVS) - qThreads = 1; + if (useCuVS) qThreads = 1; ExecutorService pool = Executors.newFixedThreadPool(qThreads); AtomicInteger queriesFinished = new AtomicInteger(0); ConcurrentHashMap queryLatencies = new ConcurrentHashMap(); - ConcurrentHashMap retrievalLatencies = new ConcurrentHashMap(); + ConcurrentHashMap retrievalLatencies = + new ConcurrentHashMap(); long startTime = System.currentTimeMillis(); AtomicInteger queryId = new AtomicInteger(0); - queries.stream().limit(config.numQueriesToRun).forEach((queryVector) -> { - // Get a unique query ID for this query before submitting to thread pool - int currentQueryId = queryId.getAndIncrement(); - pool.submit(() -> { - KnnFloatVectorQuery query; - - if (useCuVS) { - int effectiveEfSearch = config.getEffectiveEfSearch(); - query = new GPUKnnFloatVectorQuery(config.vectorColName, queryVector, effectiveEfSearch, null, config.cagraITopK, - config.cagraSearchWidth); - } else { - int effectiveEfSearch = config.getEffectiveEfSearch(); - query = new KnnFloatVectorQuery(config.vectorColName, queryVector, effectiveEfSearch); - } - - TopDocs topDocs; - long searchStartTime = System.nanoTime(); - try { - int effectiveEfSearch = config.getEffectiveEfSearch(); - TopScoreDocCollectorManager collectorManager = new TopScoreDocCollectorManager(effectiveEfSearch, null, - Integer.MAX_VALUE, true); - topDocs = indexSearcher.search(query, collectorManager); - } catch (IOException e) { - throw new RuntimeException("Problem during executing a query: ", e); - } - double searchTimeTakenMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - searchStartTime); - // log.info("End to end search took: " + searchTimeTakenMs); - if (currentQueryId > config.numWarmUpQueries) { - queryLatencies.put(queryId.get(), searchTimeTakenMs); - } - int finishedCount = queriesFinished.incrementAndGet(); - - // Log progress every 2 queries - if (finishedCount % 2 == 0 || finishedCount == config.numQueriesToRun) { - log.info("Done querying " + finishedCount + " out of " + config.numQueriesToRun + " queries."); - } - - ScoreDoc[] hits = topDocs.scoreDocs; - List neighbors = new ArrayList<>(); - List scores = new ArrayList<>(); - - // Debug: Log search results for first query - if (queryId.get() == 0) { - log.info("Debug: First query returned " + hits.length + " hits (ef-search candidates)"); - log.info("Debug: Will select top " + config.topK + " from " + hits.length + " candidates"); - } - int numResultsToTake = Math.min(config.topK, hits.length); - long retrievalStartTime = System.nanoTime(); - for (int i = 0; i < numResultsToTake; i++) { - ScoreDoc hit = hits[i]; - try { - Document d = indexReader.storedFields().document(hit.doc); - neighbors.add(Integer.parseInt(d.get("id"))); - } catch (IOException e) { - e.printStackTrace(); - } - scores.add(hit.score); - } - double retrievalTimeTakenMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - retrievalStartTime); - if (currentQueryId > config.numWarmUpQueries) { - retrievalLatencies.put(queryId.get(), retrievalTimeTakenMs); - } - - // Debug: Log results for all queries - log.info("Query " + currentQueryId + " - First 5 neighbors: " + neighbors.subList(0, Math.min(5, neighbors.size()))); - log.info("Query " + currentQueryId + " - First 5 distances: " + scores.subList(0, Math.min(5, scores.size()))); - int[] expectedNeighbors = groundTruth.get(currentQueryId); - log.info("Query " + currentQueryId + " - Expected neighbors: " + java.util.Arrays.toString(java.util.Arrays.copyOf(expectedNeighbors, Math.min(5, expectedNeighbors.length)))); - - var s = useCuVS ? "lucene_cuvs" : "lucene_hnsw"; - if (currentQueryId > config.numWarmUpQueries) { - QueryResult result = new QueryResult(s, currentQueryId, neighbors, groundTruth.get(currentQueryId), scores, - searchTimeTakenMs); - queryResults.add(result); - } else { - log.info("Skipping warmup query: {}", currentQueryId); - } - }); - }); + queries.stream() + .limit(config.numQueriesToRun) + .forEach( + (queryVector) -> { + // Get a unique query ID for this query before submitting to thread pool + int currentQueryId = queryId.getAndIncrement(); + pool.submit( + () -> { + KnnFloatVectorQuery query; + + if (useCuVS) { + int effectiveEfSearch = config.getEffectiveEfSearch(); + query = + new GPUKnnFloatVectorQuery( + config.vectorColName, + queryVector, + effectiveEfSearch, + null, + config.cagraITopK, + config.cagraSearchWidth); + } else { + int effectiveEfSearch = config.getEffectiveEfSearch(); + query = + new KnnFloatVectorQuery( + config.vectorColName, queryVector, effectiveEfSearch); + } + + TopDocs topDocs; + long searchStartTime = System.nanoTime(); + try { + int effectiveEfSearch = config.getEffectiveEfSearch(); + TopScoreDocCollectorManager collectorManager = + new TopScoreDocCollectorManager( + effectiveEfSearch, null, Integer.MAX_VALUE, true); + topDocs = indexSearcher.search(query, collectorManager); + } catch (IOException e) { + throw new RuntimeException("Problem during executing a query: ", e); + } + double searchTimeTakenMs = + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - searchStartTime); + // log.info("End to end search took: " + searchTimeTakenMs); + if (currentQueryId > config.numWarmUpQueries) { + queryLatencies.put(queryId.get(), searchTimeTakenMs); + } + int finishedCount = queriesFinished.incrementAndGet(); + + // Log progress every 1000 queries + if (finishedCount % 1000 == 0 || finishedCount == config.numQueriesToRun) { + log.info( + "Done querying " + + finishedCount + + " out of " + + config.numQueriesToRun + + " queries."); + } + + ScoreDoc[] hits = topDocs.scoreDocs; + List neighbors = new ArrayList<>(); + List scores = new ArrayList<>(); + + // Debug: Log search results for first query + if (queryId.get() == 0) { + log.info( + "Debug: First query returned " + + hits.length + + " hits (ef-search candidates)"); + log.info( + "Debug: Will select top " + + config.topK + + " from " + + hits.length + + " candidates"); + } + int numResultsToTake = Math.min(config.topK, hits.length); + long retrievalStartTime = System.nanoTime(); + for (int i = 0; i < numResultsToTake; i++) { + ScoreDoc hit = hits[i]; + try { + Document d = indexReader.storedFields().document(hit.doc); + neighbors.add(Integer.parseInt(d.get("id"))); + } catch (IOException e) { + e.printStackTrace(); + } + scores.add(hit.score); + } + double retrievalTimeTakenMs = + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - retrievalStartTime); + if (currentQueryId > config.numWarmUpQueries) { + retrievalLatencies.put(queryId.get(), retrievalTimeTakenMs); + } + + // Debug: Log results for all queries + log.debug( + "Query " + + currentQueryId + + " - First 5 neighbors: " + + neighbors.subList(0, Math.min(5, neighbors.size()))); + log.debug( + "Query " + + currentQueryId + + " - First 5 distances: " + + scores.subList(0, Math.min(5, scores.size()))); + int[] expectedNeighbors = groundTruth.get(currentQueryId); + log.debug( + "Query " + + currentQueryId + + " - Expected neighbors: " + + java.util.Arrays.toString( + java.util.Arrays.copyOf( + expectedNeighbors, Math.min(5, expectedNeighbors.length)))); + + var s = useCuVS ? "lucene_cuvs" : "lucene_hnsw"; + if (currentQueryId > config.numWarmUpQueries) { + QueryResult result = + new QueryResult( + s, + currentQueryId, + neighbors, + groundTruth.get(currentQueryId), + scores, + searchTimeTakenMs); + queryResults.add(result); + } else { + log.info("Skipping warmup query: {}", currentQueryId); + } + }); + }); pool.shutdown(); pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); @@ -592,11 +692,14 @@ private static void search(Directory directory, BenchmarkConfiguration config, b long endTime = System.currentTimeMillis(); metrics.put((useCuVS ? "cuvs" : "hnsw") + "-query-time", (endTime - startTime)); - metrics.put((useCuVS ? "cuvs" : "hnsw") + "-query-throughput", - (queryLatencies.size() / ((endTime - startTime) / 1000.0))); - double avgLatency = new ArrayList<>(queryLatencies.values()).stream().reduce(0.0, Double::sum) - / queryLatencies.size(); - double avgRetLatency = new ArrayList<>(retrievalLatencies.values()).stream().reduce(0.0, Double::sum) + metrics.put( + (useCuVS ? "cuvs" : "hnsw") + "-query-throughput", + (config.numQueriesToRun / ((endTime - startTime) / 1000.0))); + double avgLatency = + new ArrayList<>(queryLatencies.values()).stream().reduce(0.0, Double::sum) + / queryLatencies.size(); + double avgRetLatency = + new ArrayList<>(retrievalLatencies.values()).stream().reduce(0.0, Double::sum) / retrievalLatencies.size(); metrics.put((useCuVS ? "cuvs" : "hnsw") + "-mean-latency", avgLatency); @@ -618,10 +721,20 @@ private static void search(Directory directory, BenchmarkConfiguration config, b private static Lucene101Codec getLuceneHnswCodec(BenchmarkConfiguration config) { return new Lucene101Codec(Mode.BEST_SPEED) { - @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - KnnVectorsFormat knnFormat = new Lucene99HnswVectorsFormat(config.hnswMaxConn, config.hnswBeamWidth); + KnnVectorsFormat knnFormat; + if (config.hnswMergeThreads > 1) { + ExecutorService executorService = Executors.newFixedThreadPool(config.hnswMergeThreads); + knnFormat = + new Lucene99HnswVectorsFormat( + config.hnswMaxConn, + config.hnswBeamWidth, + config.hnswMergeThreads, + executorService); + } else { + knnFormat = new Lucene99HnswVectorsFormat(config.hnswMaxConn, config.hnswBeamWidth); + } // KnnVectorsFormat knnFormat = new Lucene99HnswVectorsFormat(DEFAULT_MAX_CONN, // DEFAULT_BEAM_WIDTH); return new HighDimensionKnnVectorsFormat(knnFormat, config.vectorDimension); @@ -631,7 +744,8 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { private static Codec getCuVSCodec(BenchmarkConfiguration config) throws Exception { // Use Lucene101AcceleratedHNSWCodec with configurable parameters - // Constructor signature: (cuvsWriterThreads, intGraphDegree, graphDegree, hnswLayers, maxConn, beamWidth) + // Constructor signature: (cuvsWriterThreads, intGraphDegree, graphDegree, hnswLayers, maxConn, + // beamWidth) return new Lucene101AcceleratedHNSWCodec( config.cuvsWriterThreads, config.cagraIntermediateGraphDegree, diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBToFvecs.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBToFvecs.java index 2da0c00..ea6d908 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBToFvecs.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBToFvecs.java @@ -1,78 +1,76 @@ -package com.searchscale.lucene.cuvs.benchmarks; - -import java.io.DataOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.List; - -import org.mapdb.DB; -import org.mapdb.DBMaker; -import org.mapdb.IndexTreeList; -import org.mapdb.QueueLong.Node.SERIALIZER; - -public class MapDBToFvecs { - - public static void main(String[] args) throws IOException { - if (args.length != 2) { - System.err.println("Usage: MapDBToFvecs "); - System.exit(1); - } - - String inputMapDBFile = args[0]; - String outputFvecsFile = args[1]; - - System.out.println("Opening MapDB file: " + inputMapDBFile); - DB db = DBMaker.fileDB(inputMapDBFile).make(); - IndexTreeList vectors = db.indexTreeList("vectors", SERIALIZER.FLOAT_ARRAY).createOrOpen(); - - int numVectors = vectors.size(); - System.out.println("Found " + numVectors + " vectors in MapDB"); - - if (numVectors == 0) { - System.err.println("No vectors found in MapDB file!"); - db.close(); - return; - } - - // Get dimension from first vector - float[] firstVector = vectors.get(0); - int dimension = firstVector.length; - System.out.println("Vector dimension: " + dimension); - - // Write to fvecs format - System.out.println("Writing to fvecs file: " + outputFvecsFile); - try (FileOutputStream fos = new FileOutputStream(outputFvecsFile); - FileChannel channel = fos.getChannel()) { - - ByteBuffer buffer = ByteBuffer.allocate(4 + 4 * dimension); - buffer.order(ByteOrder.LITTLE_ENDIAN); - - for (int i = 0; i < numVectors; i++) { - if (i % 10000 == 0) { - System.out.println("Progress: " + i + "/" + numVectors + " vectors written"); - } - - float[] vector = vectors.get(i); - - // Write dimension - buffer.clear(); - buffer.putInt(dimension); - - // Write vector components - for (float value : vector) { - buffer.putFloat(value); - } - - buffer.flip(); - channel.write(buffer); - } - - System.out.println("Successfully converted " + numVectors + " vectors to fvecs format"); - } - - db.close(); - } -} \ No newline at end of file +package com.searchscale.lucene.cuvs.benchmarks; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.QueueLong.Node.SERIALIZER; + +public class MapDBToFvecs { + + public static void main(String[] args) throws IOException { + if (args.length != 2) { + System.err.println("Usage: MapDBToFvecs "); + System.exit(1); + } + + String inputMapDBFile = args[0]; + String outputFvecsFile = args[1]; + + System.out.println("Opening MapDB file: " + inputMapDBFile); + DB db = DBMaker.fileDB(inputMapDBFile).make(); + IndexTreeList vectors = + db.indexTreeList("vectors", SERIALIZER.FLOAT_ARRAY).createOrOpen(); + + int numVectors = vectors.size(); + System.out.println("Found " + numVectors + " vectors in MapDB"); + + if (numVectors == 0) { + System.err.println("No vectors found in MapDB file!"); + db.close(); + return; + } + + // Get dimension from first vector + float[] firstVector = vectors.get(0); + int dimension = firstVector.length; + System.out.println("Vector dimension: " + dimension); + + // Write to fvecs format + System.out.println("Writing to fvecs file: " + outputFvecsFile); + try (FileOutputStream fos = new FileOutputStream(outputFvecsFile); + FileChannel channel = fos.getChannel()) { + + ByteBuffer buffer = ByteBuffer.allocate(4 + 4 * dimension); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + for (int i = 0; i < numVectors; i++) { + if (i % 10000 == 0) { + System.out.println("Progress: " + i + "/" + numVectors + " vectors written"); + } + + float[] vector = vectors.get(i); + + // Write dimension + buffer.clear(); + buffer.putInt(dimension); + + // Write vector components + for (float value : vector) { + buffer.putFloat(value); + } + + buffer.flip(); + channel.write(buffer); + } + + System.out.println("Successfully converted " + numVectors + " vectors to fvecs format"); + } + + db.close(); + } +} diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBVectorProvider.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBVectorProvider.java index 6352251..bd10215 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBVectorProvider.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MapDBVectorProvider.java @@ -1,39 +1,39 @@ -package com.searchscale.lucene.cuvs.benchmarks; - -import java.io.IOException; - -import org.mapdb.DB; -import org.mapdb.IndexTreeList; - -/** - * Vector provider that uses MapDB as the backing store - */ -public class MapDBVectorProvider implements VectorProvider { - private final IndexTreeList vectors; - private final DB db; - - public MapDBVectorProvider(IndexTreeList vectors, DB db) { - this.vectors = vectors; - this.db = db; - } - - @Override - public float[] get(int index) throws IOException { - if (index < 0 || index >= vectors.size()) { - throw new IndexOutOfBoundsException("Index " + index + " out of bounds [0, " + vectors.size() + ")"); - } - return vectors.get(index); - } - - @Override - public int size() { - return vectors.size(); - } - - @Override - public void close() throws IOException { - if (db != null) { - db.close(); - } - } -} \ No newline at end of file +package com.searchscale.lucene.cuvs.benchmarks; + +import java.io.IOException; +import org.mapdb.DB; +import org.mapdb.IndexTreeList; + +/** + * Vector provider that uses MapDB as the backing store + */ +public class MapDBVectorProvider implements VectorProvider { + private final IndexTreeList vectors; + private final DB db; + + public MapDBVectorProvider(IndexTreeList vectors, DB db) { + this.vectors = vectors; + this.db = db; + } + + @Override + public float[] get(int index) throws IOException { + if (index < 0 || index >= vectors.size()) { + throw new IndexOutOfBoundsException( + "Index " + index + " out of bounds [0, " + vectors.size() + ")"); + } + return vectors.get(index); + } + + @Override + public int size() { + return vectors.size(); + } + + @Override + public void close() throws IOException { + if (db != null) { + db.close(); + } + } +} diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MemoryVectorProvider.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MemoryVectorProvider.java index bd2b4aa..ad0384d 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MemoryVectorProvider.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/MemoryVectorProvider.java @@ -1,33 +1,34 @@ -package com.searchscale.lucene.cuvs.benchmarks; - -import java.io.IOException; -import java.util.List; - -/** - * In-memory vector provider that holds all vectors in RAM - */ -public class MemoryVectorProvider implements VectorProvider { - private final List vectors; - - public MemoryVectorProvider(List vectors) { - this.vectors = vectors; - } - - @Override - public float[] get(int index) throws IOException { - if (index < 0 || index >= vectors.size()) { - throw new IndexOutOfBoundsException("Index " + index + " out of bounds [0, " + vectors.size() + ")"); - } - return vectors.get(index); - } - - @Override - public int size() { - return vectors.size(); - } - - @Override - public void close() throws IOException { - // No resources to close - } -} \ No newline at end of file +package com.searchscale.lucene.cuvs.benchmarks; + +import java.io.IOException; +import java.util.List; + +/** + * In-memory vector provider that holds all vectors in RAM + */ +public class MemoryVectorProvider implements VectorProvider { + private final List vectors; + + public MemoryVectorProvider(List vectors) { + this.vectors = vectors; + } + + @Override + public float[] get(int index) throws IOException { + if (index < 0 || index >= vectors.size()) { + throw new IndexOutOfBoundsException( + "Index " + index + " out of bounds [0, " + vectors.size() + ")"); + } + return vectors.get(index); + } + + @Override + public int size() { + return vectors.size(); + } + + @Override + public void close() throws IOException { + // No resources to close + } +} diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/QueryResult.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/QueryResult.java index 310d877..188aec0 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/QueryResult.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/QueryResult.java @@ -1,31 +1,41 @@ package com.searchscale.lucene.cuvs.benchmarks; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.fasterxml.jackson.core.JsonProcessingException; - public class QueryResult { @JsonProperty("codec") final String codec; + @JsonProperty("query-id") - final public int queryId; + public final int queryId; + @JsonProperty("docs") final List docs; + @JsonProperty("ground-truth") final int[] groundTruth; + @JsonProperty("scores") final List scores; + @JsonProperty("latency") final double latencyMs; + @JsonProperty("recall") double recall; - public QueryResult(String codec, int id, List docs, int[] groundTruth, List scores, + public QueryResult( + String codec, + int id, + List docs, + int[] groundTruth, + List scores, double latencyMs) { this.codec = codec; this.queryId = id; @@ -40,10 +50,17 @@ private void calculateRecallAccuracy() { // Validate that ground truth has enough elements for the requested topK if (groundTruth.length < docs.size()) { - System.err.println("WARNING: Ground truth contains only " + groundTruth.length + - " elements, but topK=" + docs.size() + " was requested."); + System.err.println( + "WARNING: Ground truth contains only " + + groundTruth.length + + " elements, but topK=" + + docs.size() + + " was requested."); System.err.println("Cannot calculate accurate recall with insufficient ground truth data."); - System.err.println("Please reduce topK to " + groundTruth.length + " or less, or use ground truth with more elements."); + System.err.println( + "Please reduce topK to " + + groundTruth.length + + " or less, or use ground truth with more elements."); System.exit(1); } @@ -52,8 +69,11 @@ private void calculateRecallAccuracy() { topKGroundtruthValues.add(groundTruth[i]); } - Set matchingRecallValues = docs.stream().distinct().filter(topKGroundtruthValues::contains) - .collect(Collectors.toSet()); + Set matchingRecallValues = + docs.stream() + .distinct() + .filter(topKGroundtruthValues::contains) + .collect(Collectors.toSet()); // docs.size() is the topK value this.recall = ((double) matchingRecallValues.size() / (double) docs.size()); diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/StreamingVectorProvider.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/StreamingVectorProvider.java index 44ef6c8..4dbd8fa 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/StreamingVectorProvider.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/StreamingVectorProvider.java @@ -15,310 +15,338 @@ * without creating intermediate MapDB files */ public class StreamingVectorProvider implements VectorProvider { - private static final Logger log = LoggerFactory.getLogger(StreamingVectorProvider.class.getName()); - - private enum FileFormat { - FVECS, FBIN, BVECS, IVECS + private static final Logger log = + LoggerFactory.getLogger(StreamingVectorProvider.class.getName()); + + private enum FileFormat { + FVECS, + FBIN, + BVECS, + IVECS + } + + private final String filePath; + private final int dimension; + private final int vectorCount; + private final long vectorSize; // size of one vector in bytes + private final boolean isCompressed; + private final FileFormat format; + + public StreamingVectorProvider(String filePath, int maxVectors) throws IOException { + this.filePath = filePath; + this.isCompressed = filePath.endsWith(".gz"); + + // Determine file format + if (filePath.contains("fvecs")) { + this.format = FileFormat.FVECS; + } else if (filePath.contains("fbin")) { + this.format = FileFormat.FBIN; + } else if (filePath.contains("bvecs")) { + this.format = FileFormat.BVECS; + } else if (filePath.contains("ivecs")) { + this.format = FileFormat.IVECS; + } else { + throw new IllegalArgumentException("Unsupported file format: " + filePath); } - private final String filePath; - private final int dimension; - private final int vectorCount; - private final long vectorSize; // size of one vector in bytes - private final boolean isCompressed; - private final FileFormat format; - - public StreamingVectorProvider(String filePath, int maxVectors) throws IOException { - this.filePath = filePath; - this.isCompressed = filePath.endsWith(".gz"); - - // Determine file format - if (filePath.contains("fvecs")) { - this.format = FileFormat.FVECS; - } else if (filePath.contains("fbin")) { - this.format = FileFormat.FBIN; - } else if (filePath.contains("bvecs")) { - this.format = FileFormat.BVECS; - } else if (filePath.contains("ivecs")) { - this.format = FileFormat.IVECS; - } else { - throw new IllegalArgumentException("Unsupported file format: " + filePath); - } + // Read header and initialize fields based on format + int tempDimension; + int tempVectorCount; + long tempVectorSize; - // Read header and initialize fields based on format - int tempDimension; - int tempVectorCount; - long tempVectorSize; - - try (FileInputStream fis = new FileInputStream(filePath)) { - java.io.InputStream is = isCompressed ? new GZIPInputStream(fis) : fis; - - if (format == FileFormat.FBIN) { - // For .fbin: Read num_vectors first, then dimension - byte[] numVecBytes = is.readNBytes(4); - ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); - int numVectors = numVecBuffer.getInt(); - - byte[] dimBytes = is.readNBytes(4); - ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); - tempDimension = dimBuffer.getInt(); - - tempVectorCount = maxVectors > 0 ? Math.min(maxVectors, numVectors) : numVectors; - tempVectorSize = 4L * tempDimension; // just dimension floats (no per-vector dimension) - - log.info("File header - total vectors: {}, dimension: {}", numVectors, tempDimension); - } else { - // For other formats: Read dimension normally using existing method - tempDimension = FBIvecsReader.getDimension(is); - - // Calculate vector size based on format - switch (format) { - case FVECS: - case IVECS: - tempVectorSize = 4 + 4L * tempDimension; // dimension int + dimension values - break; - case BVECS: - tempVectorSize = 4 + tempDimension; // dimension int + dimension bytes - break; - default: - throw new IllegalArgumentException("Unsupported format: " + format); - } - - // Calculate vector count - if (isCompressed) { - tempVectorCount = countVectorsInCompressedFile(maxVectors, tempDimension, tempVectorSize); - } else { - tempVectorCount = calculateVectorCount(maxVectors, tempVectorSize); - } - } - } + try (FileInputStream fis = new FileInputStream(filePath)) { + java.io.InputStream is = isCompressed ? new GZIPInputStream(fis) : fis; - // Single assignment to final fields - this.dimension = tempDimension; - this.vectorCount = tempVectorCount; - this.vectorSize = tempVectorSize; + if (format == FileFormat.FBIN) { + // For .fbin: Read num_vectors first, then dimension + byte[] numVecBytes = is.readNBytes(4); + ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN); + int numVectors = numVecBuffer.getInt(); - log.info("StreamingVectorProvider initialized: {} vectors, {} dimensions, format: {}", - vectorCount, dimension, format); - } + byte[] dimBytes = is.readNBytes(4); + ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN); + tempDimension = dimBuffer.getInt(); - private int calculateVectorCount(int maxVectors, long vectorSize) throws IOException { - try (RandomAccessFile raf = new RandomAccessFile(filePath, "r")) { - long fileSize = raf.length(); - long totalVectors = fileSize / vectorSize; - return maxVectors > 0 ? Math.min((int)totalVectors, maxVectors) : (int)totalVectors; - } - } + tempVectorCount = maxVectors > 0 ? Math.min(maxVectors, numVectors) : numVectors; + tempVectorSize = 4L * tempDimension; // just dimension floats (no per-vector dimension) - private int countVectorsInCompressedFile(int maxVectors, int dimension, long vectorSize) throws IOException { - int count = 0; - try (FileInputStream fis = new FileInputStream(filePath); - GZIPInputStream gzis = new GZIPInputStream(fis)) { - - // Skip dimension of first vector (already read) - if (format == FileFormat.BVECS) { - gzis.skip(dimension); - } else { - gzis.skip(4L * dimension); - } - - byte[] buffer = new byte[4096]; - long bytesPerVector = vectorSize; - long totalBytesRead = 4 + (format == FileFormat.BVECS ? dimension : 4L * dimension); - - while (gzis.available() > 0 && (maxVectors <= 0 || count < maxVectors - 1)) { - int bytesRead = gzis.read(buffer); - if (bytesRead == -1) break; - totalBytesRead += bytesRead; - count = (int)(totalBytesRead / bytesPerVector); - if (maxVectors > 0 && count >= maxVectors) { - count = maxVectors; - break; - } - } - return Math.max(1, count); - } - } + log.info("File header - total vectors: {}, dimension: {}", numVectors, tempDimension); + } else { + // For other formats: Read dimension normally using existing method + tempDimension = FBIvecsReader.getDimension(is); - @Override - public float[] get(int index) throws IOException { - if (index < 0 || index >= vectorCount) { - throw new IndexOutOfBoundsException("Index " + index + " out of bounds [0, " + vectorCount + ")"); + // Calculate vector size based on format + switch (format) { + case FVECS: + case IVECS: + tempVectorSize = 4 + 4L * tempDimension; // dimension int + dimension values + break; + case BVECS: + tempVectorSize = 4 + tempDimension; // dimension int + dimension bytes + break; + default: + throw new IllegalArgumentException("Unsupported format: " + format); } + // Calculate vector count if (isCompressed) { - return getFromCompressedFile(index); + tempVectorCount = countVectorsInCompressedFile(maxVectors, tempDimension, tempVectorSize); } else { - return getFromUncompressedFile(index); + tempVectorCount = calculateVectorCount(maxVectors, tempVectorSize); } + } } - private float[] getFromUncompressedFile(int index) throws IOException { - try (RandomAccessFile raf = new RandomAccessFile(filePath, "r"); - FileChannel channel = raf.getChannel()) { - - long position; - if (format == FileFormat.FBIN) { - // Skip initial header (8 bytes: num_vectors + dimension) and go to vector position - position = 8 + index * vectorSize; - } else { - // Standard position calculation for other formats - position = index * vectorSize; - } - - channel.position(position); - ByteBuffer buffer = ByteBuffer.allocate((int)vectorSize); - buffer.order(ByteOrder.LITTLE_ENDIAN); - channel.read(buffer); - buffer.flip(); - - // Read vector based on format - return readVectorFromBuffer(buffer, index); + // Single assignment to final fields + this.dimension = tempDimension; + this.vectorCount = tempVectorCount; + this.vectorSize = tempVectorSize; + + log.info( + "StreamingVectorProvider initialized: {} vectors, {} dimensions, format: {}", + vectorCount, + dimension, + format); + } + + private int calculateVectorCount(int maxVectors, long vectorSize) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(filePath, "r")) { + long fileSize = raf.length(); + long totalVectors = fileSize / vectorSize; + return maxVectors > 0 ? Math.min((int) totalVectors, maxVectors) : (int) totalVectors; + } + } + + private int countVectorsInCompressedFile(int maxVectors, int dimension, long vectorSize) + throws IOException { + int count = 0; + try (FileInputStream fis = new FileInputStream(filePath); + GZIPInputStream gzis = new GZIPInputStream(fis)) { + + // Skip dimension of first vector (already read) + if (format == FileFormat.BVECS) { + gzis.skip(dimension); + } else { + gzis.skip(4L * dimension); + } + + byte[] buffer = new byte[4096]; + long bytesPerVector = vectorSize; + long totalBytesRead = 4 + (format == FileFormat.BVECS ? dimension : 4L * dimension); + + while (gzis.available() > 0 && (maxVectors <= 0 || count < maxVectors - 1)) { + int bytesRead = gzis.read(buffer); + if (bytesRead == -1) break; + totalBytesRead += bytesRead; + count = (int) (totalBytesRead / bytesPerVector); + if (maxVectors > 0 && count >= maxVectors) { + count = maxVectors; + break; } + } + return Math.max(1, count); } + } - private float[] readVectorFromBuffer(ByteBuffer buffer, int index) throws IOException { - float[] vector = new float[dimension]; + @Override + public float[] get(int index) throws IOException { + if (index < 0 || index >= vectorCount) { + throw new IndexOutOfBoundsException( + "Index " + index + " out of bounds [0, " + vectorCount + ")"); + } - switch (format) { - case FVECS: - // Read and verify dimension, then read floats - int fileDimension = buffer.getInt(); - if (fileDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + index + - ": expected " + dimension + ", got " + fileDimension); - } - for (int i = 0; i < dimension; i++) { - vector[i] = buffer.getFloat(); - } - break; - - case FBIN: - // No dimension prefix, just read floats - for (int i = 0; i < dimension; i++) { - vector[i] = buffer.getFloat(); - } - break; - - case BVECS: - // Read and verify dimension, then read bytes as floats - int bvecsDimension = buffer.getInt(); - if (bvecsDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + index + - ": expected " + dimension + ", got " + bvecsDimension); - } - for (int i = 0; i < dimension; i++) { - vector[i] = buffer.get() & 0xff; // Convert byte to unsigned int as float - } - break; - - case IVECS: - // Read and verify dimension, then read ints as floats - int ivecsDimension = buffer.getInt(); - if (ivecsDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + index + - ": expected " + dimension + ", got " + ivecsDimension); - } - for (int i = 0; i < dimension; i++) { - vector[i] = (float) buffer.getInt(); - } - break; + if (isCompressed) { + return getFromCompressedFile(index); + } else { + return getFromUncompressedFile(index); + } + } + + private float[] getFromUncompressedFile(int index) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(filePath, "r"); + FileChannel channel = raf.getChannel()) { + + long position; + if (format == FileFormat.FBIN) { + // Skip initial header (8 bytes: num_vectors + dimension) and go to vector position + position = 8 + index * vectorSize; + } else { + // Standard position calculation for other formats + position = index * vectorSize; + } + + channel.position(position); + ByteBuffer buffer = ByteBuffer.allocate((int) vectorSize); + buffer.order(ByteOrder.LITTLE_ENDIAN); + channel.read(buffer); + buffer.flip(); + + // Read vector based on format + return readVectorFromBuffer(buffer, index); + } + } + + private float[] readVectorFromBuffer(ByteBuffer buffer, int index) throws IOException { + float[] vector = new float[dimension]; + + switch (format) { + case FVECS: + // Read and verify dimension, then read floats + int fileDimension = buffer.getInt(); + if (fileDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + index + + ": expected " + + dimension + + ", got " + + fileDimension); } + for (int i = 0; i < dimension; i++) { + vector[i] = buffer.getFloat(); + } + break; - return vector; + case FBIN: + // No dimension prefix, just read floats + for (int i = 0; i < dimension; i++) { + vector[i] = buffer.getFloat(); + } + break; + + case BVECS: + // Read and verify dimension, then read bytes as floats + int bvecsDimension = buffer.getInt(); + if (bvecsDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + index + + ": expected " + + dimension + + ", got " + + bvecsDimension); + } + for (int i = 0; i < dimension; i++) { + vector[i] = buffer.get() & 0xff; // Convert byte to unsigned int as float + } + break; + + case IVECS: + // Read and verify dimension, then read ints as floats + int ivecsDimension = buffer.getInt(); + if (ivecsDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + index + + ": expected " + + dimension + + ", got " + + ivecsDimension); + } + for (int i = 0; i < dimension; i++) { + vector[i] = (float) buffer.getInt(); + } + break; } - private float[] getFromCompressedFile(int index) throws IOException { - // For compressed files, we need to read sequentially from the beginning - try (FileInputStream fis = new FileInputStream(filePath); - GZIPInputStream gzis = new GZIPInputStream(fis)) { - - if (format == FileFormat.FBIN) { - // Skip initial header (8 bytes: num_vectors + dimension) - gzis.skip(8); - - // Read vectors until we reach the desired index - for (int i = 0; i <= index; i++) { - byte[] vectorBytes = new byte[dimension * 4]; - if (gzis.read(vectorBytes) != vectorBytes.length) { - throw new IOException("Unexpected end of file reading vector " + i); - } - - if (i == index) { - ByteBuffer buffer = ByteBuffer.wrap(vectorBytes); - buffer.order(ByteOrder.LITTLE_ENDIAN); - return readVectorFromBuffer(buffer, index); - } - } - } else { - // Handle other formats (fvecs, bvecs, ivecs) with dimension prefix per vector - ByteBuffer dimBuffer = ByteBuffer.allocate(4); - dimBuffer.order(ByteOrder.LITTLE_ENDIAN); - - for (int i = 0; i <= index; i++) { - // Read dimension - dimBuffer.clear(); - if (gzis.read(dimBuffer.array()) != 4) { - throw new IOException("Unexpected end of file reading vector " + i); - } - - int fileDimension = dimBuffer.getInt(); - if (fileDimension != dimension) { - throw new IOException("Dimension mismatch at vector " + i + - ": expected " + dimension + ", got " + fileDimension); - } - - // Read vector data - int dataSize = format == FileFormat.BVECS ? dimension : dimension * 4; - byte[] vectorBytes = new byte[dataSize]; - if (gzis.read(vectorBytes) != vectorBytes.length) { - throw new IOException("Unexpected end of file reading vector data for vector " + i); - } - - if (i == index) { - ByteBuffer vectorBuffer = ByteBuffer.wrap(vectorBytes); - vectorBuffer.order(ByteOrder.LITTLE_ENDIAN); - return readVectorFromBuffer(vectorBuffer, index); - } - } - } - } + return vector; + } - throw new IOException("Could not read vector " + index); - } + private float[] getFromCompressedFile(int index) throws IOException { + // For compressed files, we need to read sequentially from the beginning + try (FileInputStream fis = new FileInputStream(filePath); + GZIPInputStream gzis = new GZIPInputStream(fis)) { - /** - * Get raw integer array (useful for .ivecs/.ibin files used as ground truth) - */ - public int[] getIntArray(int index) throws IOException { - if (format != FileFormat.IVECS) { - throw new UnsupportedOperationException("getIntArray() only supported for .ivecs files"); - } + if (format == FileFormat.FBIN) { + // Skip initial header (8 bytes: num_vectors + dimension) + gzis.skip(8); - float[] floatVector = get(index); - int[] intVector = new int[floatVector.length]; - for (int i = 0; i < floatVector.length; i++) { - intVector[i] = (int) floatVector[i]; - } - return intVector; - } + // Read vectors until we reach the desired index + for (int i = 0; i <= index; i++) { + byte[] vectorBytes = new byte[dimension * 4]; + if (gzis.read(vectorBytes) != vectorBytes.length) { + throw new IOException("Unexpected end of file reading vector " + i); + } - @Override - public int size() { - return vectorCount; + if (i == index) { + ByteBuffer buffer = ByteBuffer.wrap(vectorBytes); + buffer.order(ByteOrder.LITTLE_ENDIAN); + return readVectorFromBuffer(buffer, index); + } + } + } else { + // Handle other formats (fvecs, bvecs, ivecs) with dimension prefix per vector + ByteBuffer dimBuffer = ByteBuffer.allocate(4); + dimBuffer.order(ByteOrder.LITTLE_ENDIAN); + + for (int i = 0; i <= index; i++) { + // Read dimension + dimBuffer.clear(); + if (gzis.read(dimBuffer.array()) != 4) { + throw new IOException("Unexpected end of file reading vector " + i); + } + + int fileDimension = dimBuffer.getInt(); + if (fileDimension != dimension) { + throw new IOException( + "Dimension mismatch at vector " + + i + + ": expected " + + dimension + + ", got " + + fileDimension); + } + + // Read vector data + int dataSize = format == FileFormat.BVECS ? dimension : dimension * 4; + byte[] vectorBytes = new byte[dataSize]; + if (gzis.read(vectorBytes) != vectorBytes.length) { + throw new IOException("Unexpected end of file reading vector data for vector " + i); + } + + if (i == index) { + ByteBuffer vectorBuffer = ByteBuffer.wrap(vectorBytes); + vectorBuffer.order(ByteOrder.LITTLE_ENDIAN); + return readVectorFromBuffer(vectorBuffer, index); + } + } + } } - @Override - public void close() throws IOException { - // No resources to close for file-based streaming - } + throw new IOException("Could not read vector " + index); + } - public int getDimension() { - return dimension; + /** + * Get raw integer array (useful for .ivecs/.ibin files used as ground truth) + */ + public int[] getIntArray(int index) throws IOException { + if (format != FileFormat.IVECS) { + throw new UnsupportedOperationException("getIntArray() only supported for .ivecs files"); } - public FileFormat getFormat() { - return format; + float[] floatVector = get(index); + int[] intVector = new int[floatVector.length]; + for (int i = 0; i < floatVector.length; i++) { + intVector[i] = (int) floatVector[i]; } + return intVector; + } + + @Override + public int size() { + return vectorCount; + } + + @Override + public void close() throws IOException { + // No resources to close for file-based streaming + } + + public int getDimension() { + return dimension; + } + + public FileFormat getFormat() { + return format; + } } - diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Util.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Util.java index 04ad3cd..f9d2000 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Util.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/Util.java @@ -1,5 +1,13 @@ package com.searchscale.lucene.cuvs.benchmarks; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.dataformat.csv.CsvMapper; +import com.fasterxml.jackson.dataformat.csv.CsvSchema; +import com.opencsv.CSVReader; +import com.opencsv.exceptions.CsvValidationException; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; @@ -11,28 +19,18 @@ import java.util.Map; import java.util.zip.GZIPInputStream; import java.util.zip.ZipFile; - import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.ArrayUtils; -import org.mapdb.IndexTreeList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.MapperFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; -import com.fasterxml.jackson.dataformat.csv.CsvMapper; -import com.fasterxml.jackson.dataformat.csv.CsvSchema; -import com.opencsv.CSVReader; -import com.opencsv.exceptions.CsvValidationException; - public class Util { private static final Logger log = LoggerFactory.getLogger(Util.class.getName()); public static final int DEFAULT_BUFFER_SIZE = 65536; - public static void parseCSVFile(BenchmarkConfiguration config, List titles, List vectors) + public static void parseCSVFile( + BenchmarkConfiguration config, List titles, List vectors) throws IOException, CsvValidationException { InputStreamReader isr = null; ZipFile zipFile = null; @@ -55,7 +53,7 @@ public static void parseCSVFile(BenchmarkConfiguration config, List titl int countOfDocuments = 0; while ((csvLine = csvReader.readNext()) != null) { if ((countOfDocuments++) == 0) // skip the first line of the file, it is a header - continue; + continue; try { titles.add(csvLine[1]); vectors.add(Util.parseFloatArrayFromStringArray(csvLine[config.indexOfVector])); @@ -63,16 +61,13 @@ public static void parseCSVFile(BenchmarkConfiguration config, List titl System.out.print("#"); countOfDocuments -= 1; } - if (countOfDocuments % 1000 == 0) - System.out.print("."); + if (countOfDocuments % 1000 == 0) System.out.print("."); - if (countOfDocuments == config.numDocs + 1) - break; + if (countOfDocuments == config.numDocs + 1) break; } System.out.println(); } - if (zipFile != null) - zipFile.close(); + if (zipFile != null) zipFile.close(); } public static void writeCSV(List list, String filename) throws Exception { @@ -85,9 +80,12 @@ public static void writeCSV(List list, String filename) throws Exce JsonNode jsonTree = newObjectMapper().readTree(newObjectMapper().writeValueAsString(list)); CsvSchema.Builder csvSchemaBuilder = CsvSchema.builder(); JsonNode firstObject = jsonTree.elements().next(); - firstObject.fieldNames().forEachRemaining(fieldName -> { - csvSchemaBuilder.addColumn(fieldName); - }); + firstObject + .fieldNames() + .forEachRemaining( + fieldName -> { + csvSchemaBuilder.addColumn(fieldName); + }); CsvSchema csvSchema = csvSchemaBuilder.build().withHeader(); CsvMapper csvMapper = new CsvMapper(); csvMapper.writerFor(JsonNode.class).with(csvSchema).writeValue(new File(filename), jsonTree); @@ -101,8 +99,11 @@ static ObjectMapper newObjectMapper() { } public static float[] parseFloatArrayFromStringArray(String str) { - float[] arr = ArrayUtils.toPrimitive( - Arrays.stream(str.replace("[", "").replace("]", "").split(", ")).map(Float::valueOf).toArray(Float[]::new)); + float[] arr = + ArrayUtils.toPrimitive( + Arrays.stream(str.replace("[", "").replace("]", "").split(", ")) + .map(Float::valueOf) + .toArray(Float[]::new)); return arr; } @@ -119,7 +120,8 @@ public static List readGroundTruthFile(String groundTruthFile) throws IOE List rst = new ArrayList(); if (groundTruthFile.endsWith("csv")) { log.info("Seems like a csv groundtruth file. Reading ..."); - for (String line : FileUtils.readFileToString(new File(groundTruthFile), "UTF-8").split("\n")) { + for (String line : + FileUtils.readFileToString(new File(groundTruthFile), "UTF-8").split("\n")) { rst.add(Util.parseIntArrayFromStringArray(line)); } } else if (groundTruthFile.endsWith("ivecs")) { @@ -129,13 +131,15 @@ public static List readGroundTruthFile(String groundTruthFile) throws IOE log.info("Seems like a ibin groundtruth file. Reading ..."); rst = FBIvecsReader.readIbin(groundTruthFile, -1); } else { - throw new RuntimeException("Not parsing groundtruth file and stopping. Are you passing the correct file path?"); + throw new RuntimeException( + "Not parsing groundtruth file and stopping. Are you passing the correct file path?"); } log.info("{} number of entries in the groundtruth file.", rst.size()); return rst; } - public static void readBaseFile(BenchmarkConfiguration config, List titles, List vectors) { + public static void readBaseFile( + BenchmarkConfiguration config, List titles, List vectors) { if (config.datasetFile.contains("fvecs")) { log.info("Seems like an fvecs base file. Reading ..."); FBIvecsReader.readFvecs(config.datasetFile, config.numDocs, vectors); @@ -167,19 +171,19 @@ public static void preCheck(BenchmarkConfiguration config) { /** * Adds recall values to the metrics map - * + * * @param queryResults * @param metrics */ - public static void calculateRecallAccuracy(List queryResults, Map metrics, - boolean useCuVS) { + public static void calculateRecallAccuracy( + List queryResults, Map metrics, boolean useCuVS) { double totalRecall = 0; for (QueryResult result : queryResults) { totalRecall += result.getRecall(); } - double percentRecallAccuracy = (totalRecall / (double)queryResults.size()) * 100.0; + double percentRecallAccuracy = (totalRecall / (double) queryResults.size()) * 100.0; metrics.put((useCuVS ? "cuvs" : "hnsw") + "-recall-accuracy", percentRecallAccuracy); } } diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/VectorProvider.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/VectorProvider.java index 7162ff0..4696b42 100644 --- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/VectorProvider.java +++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/VectorProvider.java @@ -1,23 +1,23 @@ -package com.searchscale.lucene.cuvs.benchmarks; - -import java.io.IOException; - -/** - * Interface for providing vectors from various sources (MapDB, direct file streaming, etc.) - */ -public interface VectorProvider { - /** - * Get the vector at the specified index - */ - float[] get(int index) throws IOException; - - /** - * Get the total number of vectors - */ - int size(); - - /** - * Close any resources - */ - void close() throws IOException; -} \ No newline at end of file +package com.searchscale.lucene.cuvs.benchmarks; + +import java.io.IOException; + +/** + * Interface for providing vectors from various sources (MapDB, direct file streaming, etc.) + */ +public interface VectorProvider { + /** + * Get the vector at the specified index + */ + float[] get(int index) throws IOException; + + /** + * Get the total number of vectors + */ + int size(); + + /** + * Close any resources + */ + void close() throws IOException; +} diff --git a/sweeps.json b/sweeps.json index 23608e4..1a1b037 100644 --- a/sweeps.json +++ b/sweeps.json @@ -14,14 +14,17 @@ "cleanIndexDirectory": true, "saveResultsOnDisk": true, "forceMerge": 0, - "enableTieredMerge": true + "enableTieredMerge": true, + "enableIndexWriterInfoStream": false, + "ramBufferSizeMB": 32767 }, "algorithms": { "LUCENE_HNSW": { "hnswIndexDirPath": "hnswIndex", "hnswMaxConn": [16, 32, 64, 96, 128], "hnswBeamWidth": [64, 128, 256], - "numIndexThreads": 32 + "numIndexThreads": 32, + "hnswMergeThreads": 1 }, "CAGRA_HNSW": { "cuvsIndexDirPath": "cuvsIndex", diff --git a/trials/0/sweeps_trial_cag-hnsw.json b/trials/0/sweeps_trial_cag-hnsw.json index c629dba..73e5dc6 100644 --- a/trials/0/sweeps_trial_cag-hnsw.json +++ b/trials/0/sweeps_trial_cag-hnsw.json @@ -14,16 +14,18 @@ "cleanIndexDirectory": true, "saveResultsOnDisk": true, "forceMerge": 0, - "enableTieredMerge": true + "enableTieredMerge": true, + "enableIndexWriterInfoStream": false, + "ramBufferSizeMB": 10240 }, "algorithms": { "CAGRA_HNSW": { "cuvsIndexDirPath": "cuvsIndex", - "cagraGraphDegree": [64], - "cagraIntermediateGraphDegree": [128], - "cagraHnswLayers": [2], - "cuvsWriterThreads": 32, - "numIndexThreads": 32, + "cagraGraphDegree": [32], + "cagraIntermediateGraphDegree": [96], + "cagraHnswLayers": [1], + "cuvsWriterThreads": 14, + "numIndexThreads": 14, "hnswMaxConn": 5, "hnswBeamWidth": 5 } diff --git a/trials/0/sweeps_trial_luc-hnsw.json b/trials/0/sweeps_trial_luc-hnsw.json index 4a2693e..3964481 100644 --- a/trials/0/sweeps_trial_luc-hnsw.json +++ b/trials/0/sweeps_trial_luc-hnsw.json @@ -14,14 +14,17 @@ "cleanIndexDirectory": true, "saveResultsOnDisk": true, "forceMerge": 0, - "enableTieredMerge": true + "enableTieredMerge": false, + "enableIndexWriterInfoStream": false, + "ramBufferSizeMB": 32767 }, "algorithms": { "LUCENE_HNSW": { "hnswIndexDirPath": "hnswIndex", "hnswMaxConn": [128], "hnswBeamWidth": [256], - "numIndexThreads": 32, + "numIndexThreads": 8, + "hnswMergeThreads": 8, "cagraGraphDegree": [32], "cagraIntermediateGraphDegree": [64], "cagraHnswLayers": [2],