From a1c583d6a73b81afbda5e61ca48b1ec4856cd578 Mon Sep 17 00:00:00 2001 From: Binlong Gao Date: Fri, 5 Sep 2025 17:35:56 +0800 Subject: [PATCH 1/4] FirstPassGroupingCollector supports ignoring docs without group field Signed-off-by: Binlong Gao --- .../grouping/FirstPassGroupingCollector.java | 42 +++++++++- .../search/grouping/GroupingSearch.java | 15 +++- .../grouping/BaseGroupSelectorTestCase.java | 43 ++++++++++ .../lucene/search/grouping/TestGrouping.java | 79 +++++++++++++++++++ 4 files changed, 175 insertions(+), 4 deletions(-) diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index 399ff885b3ad..5c1397127758 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.mutable.MutableValue; /** * FirstPassGroupingCollector is the first of two passes necessary to collect grouped hits. This @@ -44,6 +45,7 @@ public class FirstPassGroupingCollector extends SimpleCollector { private final GroupSelector groupSelector; + private final boolean ignoreDocsWithoutGroupField; private final FieldComparator[] comparators; private final LeafFieldComparator[] leafComparators; @@ -74,7 +76,28 @@ public class FirstPassGroupingCollector extends SimpleCollector { @SuppressWarnings({"unchecked", "rawtypes"}) public FirstPassGroupingCollector( GroupSelector groupSelector, Sort groupSort, int topNGroups) { + this(groupSelector, groupSort, topNGroups, false); + } + + /** + * Create the first pass collector with ignoreDocsWithoutGroupField + * + * @param groupSelector a GroupSelector used to defined groups + * @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each + * group according to groupSort, determines how that group sorts against other groups. This + * must be non-null, ie, if you want to groupSort by relevance use Sort.RELEVANCE. + * @param topNGroups How many top groups to keep. + * @param ignoreDocsWithoutGroupField if true, ignore documents that don't have the group field + * instead of putting them in a null group + */ + @SuppressWarnings({"unchecked", "rawtypes"}) + public FirstPassGroupingCollector( + GroupSelector groupSelector, + Sort groupSort, + int topNGroups, + boolean ignoreDocsWithoutGroupField) { this.groupSelector = groupSelector; + this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField; if (topNGroups < 1) { throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); } @@ -198,12 +221,14 @@ public void collect(int doc) throws IOException { return; } - // TODO: should we add option to mean "ignore docs that - // don't have the group field" (instead of stuffing them - // under null group)? groupSelector.advanceTo(doc); T groupValue = groupSelector.currentValue(); + // Skip documents without group field if option is enabled + if (ignoreDocsWithoutGroupField && isNullGroupValue(groupValue)) { + return; + } + final CollectedSearchGroup group = groupMap.get(groupValue); if (group == null) { @@ -363,4 +388,15 @@ protected void doSetNextReader(LeafReaderContext readerContext) throws IOExcepti public GroupSelector getGroupSelector() { return groupSelector; } + + private boolean isNullGroupValue(T groupValue) { + if (groupValue == null) { + return true; + } + // For ValueSourceGroupSelector, check if MutableValue exists + if (groupValue instanceof MutableValue) { + return !((MutableValue) groupValue).exists(); + } + return false; + } } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java index ce97e2a87055..91bcbf56da84 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java @@ -56,6 +56,7 @@ public class GroupingSearch { private boolean cacheScores; private boolean allGroups; private boolean allGroupHeads; + private boolean ignoreDocsWithoutGroupField; private Collection matchingGroups; private Bits matchingGroupHeads; @@ -138,7 +139,7 @@ protected TopGroups groupByFieldOrFunction( int topN = groupOffset + groupLimit; final FirstPassGroupingCollector firstPassCollector = - new FirstPassGroupingCollector(grouper, groupSort, topN); + new FirstPassGroupingCollector(grouper, groupSort, topN, ignoreDocsWithoutGroupField); final AllGroupsCollector allGroupsCollector = allGroups ? new AllGroupsCollector(grouper) : null; final AllGroupHeadsCollector allGroupHeadsCollector = @@ -358,4 +359,16 @@ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) { public Bits getAllGroupHeads() { return matchingGroupHeads; } + + /** + * Whether to ignore documents that don't have the group field instead of putting them in a null + * group. + * + * @param ignoreDocsWithoutGroupField Whether to ignore documents without group field + * @return this + */ + public GroupingSearch setIgnoreDocsWithoutGroupField(boolean ignoreDocsWithoutGroupField) { + this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField; + return this; + } } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java index c64ffbf52f0b..687c7080dba2 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java @@ -379,6 +379,49 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException { } } + public void testIgnoreDocsWithoutGroupField() throws IOException { + Shard shard = new Shard(); + + // Add documents with group field + Document doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + addGroupField(doc, 1); + shard.writer.addDocument(doc); + + doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + addGroupField(doc, 2); + shard.writer.addDocument(doc); + + // Add document without group field + doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + shard.writer.addDocument(doc); + + IndexSearcher searcher = shard.getIndexSearcher(); + Query query = new TermQuery(new Term("text", "foo")); + + // Test default behavior (include null group) + GroupingSearch grouping1 = new GroupingSearch(getGroupSelector()); + TopGroups groups1 = grouping1.search(searcher, query, 0, 10); + int defaultGroupCount = groups1.groups.length; + + // Test ignoring docs without group field + GroupingSearch grouping2 = new GroupingSearch(getGroupSelector()); + grouping2.setIgnoreDocsWithoutGroupField(true); + TopGroups groups2 = grouping2.search(searcher, query, 0, 10); + int ignoreGroupCount = groups2.groups.length; + + assertTrue( + "Expected ignoreGroupCount <= defaultGroupCount, got " + + ignoreGroupCount + + " vs " + + defaultGroupCount, + ignoreGroupCount <= defaultGroupCount); + + shard.close(); + } + private void assertSortsBefore(GroupDocs first, GroupDocs second) { Object[] groupSortValues = second.groupSortValues(); Object[] prevSortValues = first.groupSortValues(); diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index a351c6e02137..592e6ce92b43 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -49,6 +49,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -193,6 +194,84 @@ public void testBasic() throws Exception { dir.close(); } + public void testIgnoreDocsWithoutGroupField() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = + new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random()))); + + String groupField = "group"; + // Add documents with group field + Document doc = new Document(); + addGroupField(doc, groupField, "group1"); + // doc.add(new SortedDocValuesField("group", new BytesRef("group1"))); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + doc = new Document(); + addGroupField(doc, groupField, "group2"); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + // Add document without group field + doc = new Document(); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + DirectoryReader reader = w.getReader(); + w.close(); + + IndexSearcher searcher = newSearcher(reader); + + // Test default behavior (include null group) + FirstPassGroupingCollector collector1 = + new FirstPassGroupingCollector<>(new TermGroupSelector(groupField), Sort.RELEVANCE, 10); + searcher.search(new MatchAllDocsQuery(), collector1); + Collection> groups1 = collector1.getTopGroups(0); + + assertEquals(3, groups1.size()); // Should include null group + + // Test ignoring docs without group field + FirstPassGroupingCollector collector2 = + new FirstPassGroupingCollector<>( + new TermGroupSelector(groupField), Sort.RELEVANCE, 10, true); + searcher.search(new MatchAllDocsQuery(), collector2); + Collection> groups2 = collector2.getTopGroups(0); + + assertEquals(2, groups2.size()); // Should exclude null group + + reader.close(); + dir.close(); + } + + public void testAllDocsWithoutGroupField() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = + new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random()))); + + // Add documents without group field + for (int i = 0; i < 5; i++) { + Document doc = new Document(); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + } + + DirectoryReader reader = w.getReader(); + w.close(); + + IndexSearcher searcher = newSearcher(reader); + + // Test ignoring docs without group field when all docs lack the field + FirstPassGroupingCollector collector = + new FirstPassGroupingCollector<>(new TermGroupSelector("group"), Sort.RELEVANCE, 10, true); + searcher.search(new MatchAllDocsQuery(), collector); + Collection> groups = collector.getTopGroups(0); + + assertNull(groups); // Should return null when no groups found + + reader.close(); + dir.close(); + } + private void addGroupField(Document doc, String groupField, String value) { doc.add(new SortedDocValuesField(groupField, new BytesRef(value))); } From 8b278a211f57390b27f66d3a34a80c74c60f1dc5 Mon Sep 17 00:00:00 2001 From: Binlong Gao Date: Tue, 9 Sep 2025 16:51:30 +0800 Subject: [PATCH 2/4] Add change log Signed-off-by: Binlong Gao --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ec8b1c2586a5..e4cdfa380783 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -315,6 +315,8 @@ Optimizations and uses an optimized branchless approach. Any subclasses that have implemented the optimized method need to remove it as it will disappear in Lucene 11. (Uwe Schindler) +* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field (Binlong Gao) + Changes in Runtime Behavior --------------------- * GITHUB#14823: Decrease TieredMergePolicy's default number of segments per From 12e8a6707c454c6431f662fd0d364d71dd2e6929 Mon Sep 17 00:00:00 2001 From: Binlong Gao Date: Tue, 9 Sep 2025 20:47:46 +0800 Subject: [PATCH 3/4] Optimize some code Signed-off-by: Binlong Gao --- lucene/CHANGES.txt | 4 ++-- .../lucene/search/grouping/FirstPassGroupingCollector.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e4cdfa380783..df2da2076a2e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -134,6 +134,8 @@ Optimizations * GITHUB#15151: Use `SimScorer#score` bulk API to compute impact scores per block of postings. (Adrien Grand) +* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field (Binlong Gao) + Bug Fixes --------------------- * GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException @@ -315,8 +317,6 @@ Optimizations and uses an optimized branchless approach. Any subclasses that have implemented the optimized method need to remove it as it will disappear in Lucene 11. (Uwe Schindler) -* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field (Binlong Gao) - Changes in Runtime Behavior --------------------- * GITHUB#14823: Decrease TieredMergePolicy's default number of segments per diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index 5c1397127758..0ff86f84b3f3 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -394,8 +394,8 @@ private boolean isNullGroupValue(T groupValue) { return true; } // For ValueSourceGroupSelector, check if MutableValue exists - if (groupValue instanceof MutableValue) { - return !((MutableValue) groupValue).exists(); + if (groupValue instanceof MutableValue mutable) { + return mutable.exists() == false; } return false; } From d70487a00986b08cc0de8e6cdb0ec303a0f1e459 Mon Sep 17 00:00:00 2001 From: Binlong Gao Date: Fri, 19 Sep 2025 15:48:33 +0800 Subject: [PATCH 4/4] Use the return value of GroupSelector.advanceTo() to check null group value Signed-off-by: Binlong Gao --- .../grouping/DoubleRangeGroupSelector.java | 3 +-- .../grouping/FirstPassGroupingCollector.java | 16 ++-------------- .../search/grouping/LongRangeGroupSelector.java | 3 +-- .../grouping/ValueSourceGroupSelector.java | 13 +++++++++++-- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java index bacf9a66fdbc..bf2b5c4ccea1 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java @@ -33,7 +33,7 @@ public class DoubleRangeGroupSelector extends GroupSelector { private final DoubleRangeFactory rangeFactory; private Set inSecondPass; - private boolean includeEmpty = true; + private boolean includeEmpty; private boolean positioned; private DoubleRange current; @@ -88,7 +88,6 @@ public DoubleRange copyValue() throws IOException { @Override public void setGroups(Collection> searchGroups) { inSecondPass = new HashSet<>(); - includeEmpty = false; for (SearchGroup group : searchGroups) { if (group.groupValue == null) { includeEmpty = true; diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index 0ff86f84b3f3..d1a89e7a0f05 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -32,7 +32,6 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.util.CollectionUtil; -import org.apache.lucene.util.mutable.MutableValue; /** * FirstPassGroupingCollector is the first of two passes necessary to collect grouped hits. This @@ -221,11 +220,11 @@ public void collect(int doc) throws IOException { return; } - groupSelector.advanceTo(doc); + GroupSelector.State state = groupSelector.advanceTo(doc); T groupValue = groupSelector.currentValue(); // Skip documents without group field if option is enabled - if (ignoreDocsWithoutGroupField && isNullGroupValue(groupValue)) { + if (ignoreDocsWithoutGroupField && state == GroupSelector.State.SKIP) { return; } @@ -388,15 +387,4 @@ protected void doSetNextReader(LeafReaderContext readerContext) throws IOExcepti public GroupSelector getGroupSelector() { return groupSelector; } - - private boolean isNullGroupValue(T groupValue) { - if (groupValue == null) { - return true; - } - // For ValueSourceGroupSelector, check if MutableValue exists - if (groupValue instanceof MutableValue mutable) { - return mutable.exists() == false; - } - return false; - } } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/LongRangeGroupSelector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/LongRangeGroupSelector.java index ba8623d71283..81763aaa18e7 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/LongRangeGroupSelector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/LongRangeGroupSelector.java @@ -34,7 +34,7 @@ public class LongRangeGroupSelector extends GroupSelector { private final LongRangeFactory rangeFactory; private Set inSecondPass; - private boolean includeEmpty = true; + private boolean includeEmpty; private boolean positioned; private LongRange current; @@ -89,7 +89,6 @@ public LongRange copyValue() throws IOException { @Override public void setGroups(Collection> searchGroups) { inSecondPass = new HashSet<>(); - includeEmpty = false; for (SearchGroup group : searchGroups) { if (group.groupValue == null) { includeEmpty = true; diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/ValueSourceGroupSelector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/ValueSourceGroupSelector.java index 63e95d8bccbc..2f131b8aa8f2 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/ValueSourceGroupSelector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/ValueSourceGroupSelector.java @@ -35,6 +35,7 @@ public class ValueSourceGroupSelector extends GroupSelector { private final Map context; private Set secondPassGroups; + private boolean includeEmpty; /** * Create a new ValueSourceGroupSelector @@ -61,8 +62,12 @@ public void setScorer(Scorable scorer) throws IOException {} @Override public State advanceTo(int doc) throws IOException { this.filler.fillValue(doc); + MutableValue value = filler.getValue(); + if (value.exists() == false) { + return includeEmpty ? State.ACCEPT : State.SKIP; + } if (secondPassGroups != null) { - if (secondPassGroups.contains(filler.getValue()) == false) { + if (secondPassGroups.contains(value) == false) { return State.SKIP; } } @@ -83,7 +88,11 @@ public MutableValue copyValue() { public void setGroups(Collection> searchGroups) { secondPassGroups = new HashSet<>(); for (SearchGroup group : searchGroups) { - secondPassGroups.add(group.groupValue); + if (!group.groupValue.exists()) { + includeEmpty = true; + } else { + secondPassGroups.add(group.groupValue); + } } } }