Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,32 @@

<build>
<plugins>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.44.5</version>
<executions>
<execution>
<goals>
<goal>apply</goal>
</goals>
<phase>validate</phase>
</execution>
</executions>
<configuration>
<java>
<includes>
<include>src/**/*.java</include>
</includes>
<googleJavaFormat>
<version>1.27.0</version>
<style>GOOGLE</style>
<reflowLongStrings>true</reflowLongStrings>
<formatJavadoc>false</formatJavadoc>
</googleJavaFormat>
</java>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,34 @@ public class BenchmarkConfiguration {
public boolean saveResultsOnDisk;
public String resultsDirectory;
public boolean hasColNames;
public String algoToRun; // keep as String
public String algoToRun; // keep as String
public String groundTruthFile;
public String cuvsIndexDirPath;
public String hnswIndexDirPath;
public boolean loadVectorsInMemory;
public boolean skipIndexing;
public int forceMerge;
public boolean enableTieredMerge;
public boolean enableIndexWriterInfoStream;
public int ramBufferSizeMB;

// Lucene HNSW parameters
public int hnswMaxConn; // 16 default (max 512)
public int hnswBeamWidth; // 100 default (max 3200)
public int hnswMaxConn; // 16 default (max 512)
public int hnswBeamWidth; // 100 default (max 3200)
public int hnswMergeThreads;

// CAGRA parameters
public int cagraIntermediateGraphDegree; // 128 default
public int cagraGraphDegree; // 64 default
public int cagraGraphDegree; // 64 default
public int cagraITopK;
public int cagraSearchWidth;
public int cagraHnswLayers; // layers in CAGRA->HNSW conversion
public int cagraHnswLayers; // layers in CAGRA->HNSW conversion
public int efSearch;

/** Returns {@code true} when the configured algorithm is Lucene's native HNSW. */
private boolean isLucene() {
  final String luceneAlgoName = "LUCENE_HNSW";
  return luceneAlgoName.equalsIgnoreCase(algoToRun);
}

/** Returns {@code true} when the configured algorithm is CAGRA (GPU graph converted to HNSW). */
private boolean isCagra() {
  final String cagraAlgoName = "CAGRA_HNSW";
  return cagraAlgoName.equalsIgnoreCase(algoToRun);
}
Expand Down Expand Up @@ -77,12 +81,22 @@ public String prettyString() {
sb.append("Has column names in the dataset file: ").append(hasColNames).append('\n');
sb.append("algoToRun {Choices: HNSW | CAGRA}: ").append(algoToRun).append('\n');
sb.append("Ground Truth file used is: ").append(groundTruthFile).append('\n');
if (cuvsIndexDirPath != null) sb.append("CuVS index directory path is: ").append(cuvsIndexDirPath).append('\n');
if (hnswIndexDirPath != null) sb.append("HNSW index directory path is: ").append(hnswIndexDirPath).append('\n');
if (cuvsIndexDirPath != null)
sb.append("CuVS index directory path is: ").append(cuvsIndexDirPath).append('\n');
if (hnswIndexDirPath != null)
sb.append("HNSW index directory path is: ").append(hnswIndexDirPath).append('\n');
sb.append("Load vectors in memory before indexing: ").append(loadVectorsInMemory).append('\n');
sb.append("Skip indexing (and use existing index for search): ").append(skipIndexing).append('\n');
sb.append("Do force merge while indexing documents [a value < 1 implies no force merge]: ").append(forceMerge).append('\n');

sb.append("Skip indexing (and use existing index for search): ")
.append(skipIndexing)
.append('\n');
sb.append("Do force merge while indexing documents [a value < 1 implies no force merge]: ")
.append(forceMerge)
.append('\n');
sb.append("Enable TieredMerge: ").append(enableTieredMerge).append('\n');
sb.append("Num merge threads: ").append(hnswMergeThreads).append('\n');
sb.append("enableIndexWriterInfoStream: ").append(enableIndexWriterInfoStream).append('\n');
sb.append("ramBufferSizeMB: ").append(ramBufferSizeMB).append('\n');

sb.append("------- algo parameters ------\n");
if (isLucene()) {
sb.append("hnswMaxConn: ").append(hnswMaxConn).append('\n');
Expand All @@ -98,7 +112,10 @@ public String prettyString() {
return sb.toString();
}

/**
 * Delegates to {@link #prettyString()} so that logging or printing a configuration
 * object emits the full human-readable summary.
 *
 * <p>Note: this span previously contained two copies of the method (diff residue of the
 * old one-liner plus the reformatted version); consolidated to a single definition.
 */
@Override
public String toString() {
  return prettyString();
}

public void debugPrintArguments() {
// keep a single source of truth for printing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.mapdb.IndexTreeList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// TODO: The three static methods share a lot of common logic; ideally they should be
// combined into one.
public class FBIvecsReader {

private static final Logger log = LoggerFactory.getLogger(FBIvecsReader.class.getName());
Expand Down Expand Up @@ -176,92 +175,91 @@ public static void readBvecs(String filePath, int numRows, List<float[]> vectors
}

// New method to read .fbin files (format: num_vectors, dimension, then vector data)
// Corrected readFbin method for Wiki-88M .fbin files
public static void readFbin(String filePath, int numRows, List<float[]> vectors) {
log.info("Reading {} from file: {}", numRows, filePath);

try (InputStream is = new FileInputStream(filePath)) {
// Read num_vectors (first 4 bytes, little endian)
byte[] numVecBytes = is.readNBytes(4);
ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN);
int numVectors = numVecBuffer.getInt();

// Read dimension (next 4 bytes, little endian)
byte[] dimBytes = is.readNBytes(4);
ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN);
int dimension = dimBuffer.getInt();

log.info("File header - total vectors: {}, dimension: {}", numVectors, dimension);

float[] row = new float[dimension];
int count = 0;

while (is.available() != 0) {
byte[] vectorBytes = is.readNBytes(dimension * 4);
if (vectorBytes.length != dimension * 4) break;
ByteBuffer bb = ByteBuffer.wrap(vectorBytes).order(ByteOrder.LITTLE_ENDIAN);
for (int i = 0; i < dimension; i++) row[i] = bb.getFloat();
vectors.add(row.clone());
count++;
if (numRows != -1 && count == numRows) break;
if (count % 1000 == 0) System.out.print(".");
// Corrected readFbin method for Wiki-88M .fbin files
public static void readFbin(String filePath, int numRows, List<float[]> vectors) {
log.info("Reading {} from file: {}", numRows, filePath);

try (InputStream is = new FileInputStream(filePath)) {
// Read num_vectors (first 4 bytes, little endian)
byte[] numVecBytes = is.readNBytes(4);
ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN);
int numVectors = numVecBuffer.getInt();

// Read dimension (next 4 bytes, little endian)
byte[] dimBytes = is.readNBytes(4);
ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN);
int dimension = dimBuffer.getInt();

log.info("File header - total vectors: {}, dimension: {}", numVectors, dimension);

float[] row = new float[dimension];
int count = 0;

while (is.available() != 0) {
byte[] vectorBytes = is.readNBytes(dimension * 4);
if (vectorBytes.length != dimension * 4) break;
ByteBuffer bb = ByteBuffer.wrap(vectorBytes).order(ByteOrder.LITTLE_ENDIAN);
for (int i = 0; i < dimension; i++) row[i] = bb.getFloat();
vectors.add(row.clone());
count++;
if (numRows != -1 && count == numRows) break;
if (count % 1000 == 0) System.out.print(".");
}
System.out.println();
log.info("Reading complete. Read {} vectors out of {} in file.", count, numVectors);
} catch (Exception e) {
log.error("Error reading fbin file", e);
}
System.out.println();
log.info("Reading complete. Read {} vectors out of {} in file.", count, numVectors);
} catch (Exception e) {
log.error("Error reading fbin file", e);
}
}

// Fixed method to read .ibin files (ground truth neighbors)
public static ArrayList<int[]> readIbin(String filePath, int numRows) {
log.info("Reading {} from file: {}", numRows, filePath);
ArrayList<int[]> vectors = new ArrayList<int[]>();
// Fixed method to read .ibin files (ground truth neighbors)
public static ArrayList<int[]> readIbin(String filePath, int numRows) {
log.info("Reading {} from file: {}", numRows, filePath);
ArrayList<int[]> vectors = new ArrayList<int[]>();

try {
InputStream is = new FileInputStream(filePath);
try {
InputStream is = new FileInputStream(filePath);

// For .ibin ground truth files: Read num_vectors first, then dimension
byte[] numVecBytes = is.readNBytes(4);
ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN);
int numVectors = numVecBuffer.getInt();
// For .ibin ground truth files: Read num_vectors first, then dimension
byte[] numVecBytes = is.readNBytes(4);
ByteBuffer numVecBuffer = ByteBuffer.wrap(numVecBytes).order(ByteOrder.LITTLE_ENDIAN);
int numVectors = numVecBuffer.getInt();

byte[] dimBytes = is.readNBytes(4);
ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN);
int dimension = dimBuffer.getInt();
byte[] dimBytes = is.readNBytes(4);
ByteBuffer dimBuffer = ByteBuffer.wrap(dimBytes).order(ByteOrder.LITTLE_ENDIAN);
int dimension = dimBuffer.getInt();

log.info("Ground truth file - total vectors: {}, dimension: {}", numVectors, dimension);
log.info("Ground truth file - total vectors: {}, dimension: {}", numVectors, dimension);

int count = 0;
while (is.available() != 0 && (numRows == -1 || count < numRows)) {
// Read dimension * 4 bytes (int values)
byte[] vectorBytes = is.readNBytes(dimension * 4);
if (vectorBytes.length != dimension * 4) {
break; // End of file
}
int count = 0;
while (is.available() != 0 && (numRows == -1 || count < numRows)) {
// Read dimension * 4 bytes (int values)
byte[] vectorBytes = is.readNBytes(dimension * 4);
if (vectorBytes.length != dimension * 4) {
break; // End of file
}

ByteBuffer bb = ByteBuffer.wrap(vectorBytes);
bb.order(ByteOrder.LITTLE_ENDIAN);
ByteBuffer bb = ByteBuffer.wrap(vectorBytes);
bb.order(ByteOrder.LITTLE_ENDIAN);

int[] row = new int[dimension];
for (int i = 0; i < dimension; i++) {
row[i] = bb.getInt();
}
int[] row = new int[dimension];
for (int i = 0; i < dimension; i++) {
row[i] = bb.getInt();
}

vectors.add(row);
count++;
vectors.add(row);
count++;

if (count % 1000 == 0) {
System.out.print(".");
if (count % 1000 == 0) {
System.out.print(".");
}
}
System.out.println();
is.close();
log.info("Reading complete. Read {} vectors out of {} total.", count, numVectors);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println();
is.close();
log.info("Reading complete. Read {} vectors out of {} total.", count, numVectors);
} catch (Exception e) {
e.printStackTrace();
return vectors;
}
return vectors;
}

}
Loading