Switching to a fixed CFS threshold #15295
Changes from all commits
ca79b46
127204b
98ea06e
f14f19c
b26495e
9fae54a
18d2582
7af6917
lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java

```diff
@@ -17,6 +17,8 @@
 package org.apache.lucene.codecs;
 
 import java.io.IOException;
+import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
```
```diff
@@ -34,6 +36,152 @@ protected CompoundFormat() {}
   // TODO: this is very minimal. If we need more methods,
   // we can add 'producer' classes.
 
+  /** Default document count threshold for using compound files with LogDocMergePolicy */
+  static final int DEFAULT_CFS_THRESHOLD_DOC_SIZE = 65536; // docs
+
+  /** Default byte size threshold for using compound files with other merge policies (64MB) */
+  static final long DEFAULT_CFS_THRESHOLD_BYTE_SIZE = 64L * 1024 * 1024; // 64MB
+
+  /** Default maximum segment size allowed for compound files (no limit) */
+  static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE;
+
+  /** Document count threshold for LogDocMergePolicy */
+  private int cfsThresholdDocSize = DEFAULT_CFS_THRESHOLD_DOC_SIZE;
+
+  /** Byte size threshold for other merge policies */
+  private long cfsThresholdByteSize = DEFAULT_CFS_THRESHOLD_BYTE_SIZE;
+
+  /** Whether compound files should be used at all */
+  private boolean shouldUseCompoundFile = true;
+
+  /** Maximum segment size that can be stored as compound file */
+  private long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE;
+
+  /**
+   * Sets the document count threshold for using compound files with LogDocMergePolicy. Segments
+   * with document count less than or equal to this threshold will use compound files.
+   *
+   * @param threshold the document count threshold
+   */
+  public void setCfsThresholdDocSize(int threshold) {
+    this.cfsThresholdDocSize = threshold;
+  }
+
+  /**
+   * Sets the byte size threshold for using compound files with merge policies other than
+   * LogDocMergePolicy. Segments with size less than or equal to this threshold will use compound
+   * files.
+   *
+   * @param thresholdBytes the byte size threshold in bytes
+   */
+  public void setCfsThresholdByteSize(long thresholdBytes) {
+    this.cfsThresholdByteSize = thresholdBytes;
+  }
+
+  /**
+   * Returns the current document count threshold for compound files.
+   *
+   * @return the document count threshold
+   */
+  public int getCfsThresholdDocSize() {
+    return this.cfsThresholdDocSize;
+  }
+
+  /**
+   * Returns the current byte size threshold for compound files.
+   *
+   * @return the byte size threshold in bytes
+   */
+  public long getCfsThresholdByteSize() {
+    return this.cfsThresholdByteSize;
+  }
+
+  /**
+   * Enables or disables the use of compound files entirely. When disabled, no segments will use
+   * compound files regardless of other settings.
+   *
+   * @param useCompoundFile true to enable compound files, false to disable
+   */
+  public void setShouldUseCompoundFile(boolean useCompoundFile) {
+    this.shouldUseCompoundFile = useCompoundFile;
+  }
+
+  /**
+   * Returns whether compound files are enabled.
+   *
+   * @return true if compound files are enabled, false otherwise
+   */
+  public boolean getShouldUseCompoundFile() {
+    return this.shouldUseCompoundFile;
+  }
+
+  /**
+   * Returns the largest size allowed for a compound file segment in megabytes. Segments larger
+   * than this size will not use compound files even if otherwise eligible.
+   *
+   * @return the maximum compound file segment size in MB
+   */
+  public double getMaxCFSSegmentSizeMB() {
+    return maxCFSSegmentSize / 1024. / 1024.;
+  }
+
+  /**
+   * Sets the maximum size limit for compound file segments in megabytes. If a merged segment will
+   * be larger than this value, it will be left as a non-compound file even if compound files are
+   * enabled. Set this to Double.POSITIVE_INFINITY (default) to always use CFS when other
+   * conditions are met.
+   *
+   * @param v the maximum segment size in MB (must be >= 0)
+   * @throws IllegalArgumentException if v is negative
+   */
+  public void setMaxCFSSegmentSizeMB(double v) {
+    if (v < 0.0) {
+      throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")");
+    }
+    v *= 1024 * 1024; // Convert MB to bytes
+    this.maxCFSSegmentSize = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
+  }
+
+  /**
+   * Determines whether a segment should use the compound file format based on its size and merge
+   * policy.
+   *
+   * <p>The decision logic is as follows:
+   *
+   * <ol>
+   *   <li>If compound files are disabled globally, return false
+   *   <li>If segment size exceeds the maximum CFS segment size, return false
+   *   <li>For LogDocMergePolicy: use CFS if document count ≤ document threshold
+   *   <li>For other merge policies: use CFS if byte size ≤ byte threshold
+   * </ol>
+   *
+   * @param mergedInfoSize the size of the segment (document count for LogDocMergePolicy, bytes
+   *     for others)
+   * @param mergePolicy the merge policy being used
+   * @return true if the segment should use compound file format, false otherwise
+   * @throws IOException if an I/O error occurs
+   */
+  public boolean useCompoundFile(long mergedInfoSize, MergePolicy mergePolicy) throws IOException {
+    // Check if compound files are globally disabled
+    if (this.shouldUseCompoundFile == false) {
+      return false;
+    }
+
+    // Check if segment exceeds maximum allowed size for CFS
+    if (mergedInfoSize > maxCFSSegmentSize) {
+      return false;
+    }
+
+    // Apply appropriate threshold based on merge policy type
+    if (mergePolicy instanceof LogDocMergePolicy) {
```
Contributor:

It would be great if we can avoid customizing it for specific policies; otherwise it might be tricky to maintain in the future if, e.g., there is another policy that is based on doc size, not bytes. Maybe we can add an enum and a method to MergePolicy which returns its unit ( Or do we want to always choose compound format based on size in bytes even for LogDocMergePolicy? In this case we might be able to use

Contributor (Author):

+1 to the idea of Enums. I will wait if anyone else has other suggestions here, but having an enum makes most sense to me.
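For reference, a minimal self-contained sketch of the enum idea floated above; `SizeUnit`, `SizedMergePolicy`, and `sizeUnit()` are hypothetical names invented for illustration, not existing Lucene API:

```java
// Self-contained sketch only: SizeUnit, SizedMergePolicy and sizeUnit() are hypothetical
// names for illustration, not existing Lucene API.
public class CfsThresholdByUnitSketch {

  /** Unit in which a merge policy expresses segment size. */
  enum SizeUnit {
    DOCS,
    BYTES
  }

  /** Stand-in for a MergePolicy that reports which unit it sizes segments in. */
  interface SizedMergePolicy {
    SizeUnit sizeUnit();
  }

  static final int CFS_THRESHOLD_DOC_SIZE = 65536; // docs
  static final long CFS_THRESHOLD_BYTE_SIZE = 64L * 1024 * 1024; // 64MB

  /** Picks the threshold by unit, with no instanceof checks against concrete policies. */
  static boolean useCompoundFile(long mergedInfoSize, SizedMergePolicy policy) {
    long threshold =
        policy.sizeUnit() == SizeUnit.DOCS ? CFS_THRESHOLD_DOC_SIZE : CFS_THRESHOLD_BYTE_SIZE;
    return mergedInfoSize <= threshold;
  }

  public static void main(String[] args) {
    SizedMergePolicy docSized = () -> SizeUnit.DOCS;
    SizedMergePolicy byteSized = () -> SizeUnit.BYTES;
    System.out.println(useCompoundFile(50_000, docSized)); // true: 50,000 docs <= 65,536
    System.out.println(useCompoundFile(128L * 1024 * 1024, byteSized)); // false: 128MB > 64MB
  }
}
```

A real version would presumably hang the unit off MergePolicy itself, so that CompoundFormat.useCompoundFile could branch on the reported unit rather than on `instanceof LogDocMergePolicy`.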
```diff
+      // For LogDocMergePolicy, mergedInfoSize represents document count
+      return mergedInfoSize <= this.cfsThresholdDocSize;
+    } else {
+      // For other policies, mergedInfoSize represents byte size
+      return mergedInfoSize <= this.cfsThresholdByteSize;
+    }
+  }
+
   /** Returns a Directory view (read-only) for the compound files in this segment */
   public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si)
       throws IOException;
```
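For context, here is a rough sketch of how the knobs added above might be exercised by a caller. It assumes this PR's branch (the threshold setters and `useCompoundFile(long, MergePolicy)` do not exist in released Lucene), and uses `Codec.getDefault().compoundFormat()` simply as one way to obtain a concrete `CompoundFormat`:

```java
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.TieredMergePolicy;

public class CompoundFormatThresholdExample {
  public static void main(String[] args) throws IOException {
    // One way to obtain a concrete CompoundFormat: ask the default codec for it.
    CompoundFormat cfs = Codec.getDefault().compoundFormat();

    // Tune the fixed thresholds introduced by this PR (branch-only API).
    cfs.setCfsThresholdByteSize(32L * 1024 * 1024); // 32MB instead of the 64MB default
    cfs.setCfsThresholdDocSize(10_000); // applies when LogDocMergePolicy is used
    cfs.setMaxCFSSegmentSizeMB(512); // never build a compound file above 512MB

    // Ask whether a 20MB merged segment should be written as a compound file.
    MergePolicy mergePolicy = new TieredMergePolicy();
    boolean useCfs = cfs.useCompoundFile(20L * 1024 * 1024, mergePolicy);
    System.out.println("use compound file: " + useCfs); // true: 20MB <= 32MB byte threshold
  }
}
```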
lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java

```diff
@@ -368,9 +368,8 @@ public InfoStream getInfoStream() {
    *
    * <p>Use <code>false</code> for batch indexing with very large ram buffer settings.
    *
-   * <p><b>Note: To control compound file usage during segment merges see {@link
-   * MergePolicy#setNoCFSRatio(double)} and {@link MergePolicy#setMaxCFSSegmentSizeMB(double)}. This
-   * setting only applies to newly created segments.</b>
+   * <p><b>Note: To control compound file usage during segment merges. More here:
+   * lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java</b>
```
Contributor:

Can we use the
```diff
    */
   public LiveIndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
     this.useCompoundFile = useCompoundFile;
```
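For completeness, a small usage sketch of the setting this javadoc documents, using standard Lucene API (the index path and 1GB RAM buffer are illustrative values only, and StandardAnalyzer assumes the analysis-common module is on the classpath):

```java
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class BatchIndexingConfigExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(Paths.get("/tmp/batch-index")); // illustrative path
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    // As the javadoc above suggests: for batch indexing with a very large RAM buffer,
    // skip the compound file for newly flushed segments.
    config.setUseCompoundFile(false);
    config.setRAMBufferSizeMB(1024); // illustrative value
    try (IndexWriter writer = new IndexWriter(dir, config)) {
      // ... add documents ...
    }
  }
}
```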