diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index abb1b5a0b509..a13919276562 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,6 +138,8 @@ Optimizations * GITHUB#15151: Use `SimScorer#score` bulk API to compute impact scores per block of postings. (Adrien Grand) +* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field (Binlong Gao) + * GITHUB#15160: Increased the size used for blocks of postings from 128 to 256. This gives a noticeable speedup to many queries. (Adrien Grand) diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java index 399ff885b3ad..0ff86f84b3f3 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.mutable.MutableValue; /** * FirstPassGroupingCollector is the first of two passes necessary to collect grouped hits. 
This @@ -44,6 +45,7 @@ public class FirstPassGroupingCollector extends SimpleCollector { private final GroupSelector groupSelector; + private final boolean ignoreDocsWithoutGroupField; private final FieldComparator[] comparators; private final LeafFieldComparator[] leafComparators; @@ -74,7 +76,28 @@ public class FirstPassGroupingCollector extends SimpleCollector { @SuppressWarnings({"unchecked", "rawtypes"}) public FirstPassGroupingCollector( GroupSelector groupSelector, Sort groupSort, int topNGroups) { + this(groupSelector, groupSort, topNGroups, false); + } + + /** + * Create the first pass collector with ignoreDocsWithoutGroupField + * + * @param groupSelector a GroupSelector used to define groups + * @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each + * group according to groupSort, determines how that group sorts against other groups. This + * must be non-null, ie, if you want to groupSort by relevance use Sort.RELEVANCE. + * @param topNGroups How many top groups to keep. + * @param ignoreDocsWithoutGroupField if true, ignore documents that don't have the group field + * instead of putting them in a null group + */ + @SuppressWarnings({"unchecked", "rawtypes"}) + public FirstPassGroupingCollector( + GroupSelector groupSelector, + Sort groupSort, + int topNGroups, + boolean ignoreDocsWithoutGroupField) { this.groupSelector = groupSelector; + this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField; if (topNGroups < 1) { throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); } @@ -198,12 +221,14 @@ public void collect(int doc) throws IOException { return; } - // TODO: should we add option to mean "ignore docs that - // don't have the group field" (instead of stuffing them - // under null group)?
groupSelector.advanceTo(doc); T groupValue = groupSelector.currentValue(); + // Skip documents without group field if option is enabled + if (ignoreDocsWithoutGroupField && isNullGroupValue(groupValue)) { + return; + } + final CollectedSearchGroup group = groupMap.get(groupValue); if (group == null) { @@ -363,4 +388,15 @@ protected void doSetNextReader(LeafReaderContext readerContext) throws IOExcepti public GroupSelector getGroupSelector() { return groupSelector; } + + private boolean isNullGroupValue(T groupValue) { + if (groupValue == null) { + return true; + } + // For ValueSourceGroupSelector, check if MutableValue exists + if (groupValue instanceof MutableValue mutable) { + return mutable.exists() == false; + } + return false; + } } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java index ce97e2a87055..91bcbf56da84 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java @@ -56,6 +56,7 @@ public class GroupingSearch { private boolean cacheScores; private boolean allGroups; private boolean allGroupHeads; + private boolean ignoreDocsWithoutGroupField; private Collection matchingGroups; private Bits matchingGroupHeads; @@ -138,7 +139,7 @@ protected TopGroups groupByFieldOrFunction( int topN = groupOffset + groupLimit; final FirstPassGroupingCollector firstPassCollector = - new FirstPassGroupingCollector(grouper, groupSort, topN); + new FirstPassGroupingCollector(grouper, groupSort, topN, ignoreDocsWithoutGroupField); final AllGroupsCollector allGroupsCollector = allGroups ? 
new AllGroupsCollector(grouper) : null; final AllGroupHeadsCollector allGroupHeadsCollector = @@ -358,4 +359,16 @@ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) { public Bits getAllGroupHeads() { return matchingGroupHeads; } + + /** + * Whether to ignore documents that don't have the group field instead of putting them in a null + * group. + * + * @param ignoreDocsWithoutGroupField Whether to ignore documents without group field + * @return this + */ + public GroupingSearch setIgnoreDocsWithoutGroupField(boolean ignoreDocsWithoutGroupField) { + this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField; + return this; + } } diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java index c64ffbf52f0b..687c7080dba2 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/BaseGroupSelectorTestCase.java @@ -379,6 +379,49 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException { } } + public void testIgnoreDocsWithoutGroupField() throws IOException { + Shard shard = new Shard(); + + // Add documents with group field + Document doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + addGroupField(doc, 1); + shard.writer.addDocument(doc); + + doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + addGroupField(doc, 2); + shard.writer.addDocument(doc); + + // Add document without group field + doc = new Document(); + doc.add(new TextField("text", "foo", Field.Store.NO)); + shard.writer.addDocument(doc); + + IndexSearcher searcher = shard.getIndexSearcher(); + Query query = new TermQuery(new Term("text", "foo")); + + // Test default behavior (include null group) + GroupingSearch grouping1 = new GroupingSearch(getGroupSelector()); + TopGroups 
groups1 = grouping1.search(searcher, query, 0, 10); + int defaultGroupCount = groups1.groups.length; + + // Test ignoring docs without group field + GroupingSearch grouping2 = new GroupingSearch(getGroupSelector()); + grouping2.setIgnoreDocsWithoutGroupField(true); + TopGroups groups2 = grouping2.search(searcher, query, 0, 10); + int ignoreGroupCount = groups2.groups.length; + + assertTrue( + "Expected ignoreGroupCount <= defaultGroupCount, got " + + ignoreGroupCount + + " vs " + + defaultGroupCount, + ignoreGroupCount <= defaultGroupCount); + + shard.close(); + } + private void assertSortsBefore(GroupDocs first, GroupDocs second) { Object[] groupSortValues = second.groupSortValues(); Object[] prevSortValues = first.groupSortValues(); diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index a351c6e02137..592e6ce92b43 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -49,6 +49,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -193,6 +194,84 @@ public void testBasic() throws Exception { dir.close(); } + public void testIgnoreDocsWithoutGroupField() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = + new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random()))); + + String groupField = "group"; + // Add documents with group field + Document doc = new Document(); + addGroupField(doc, groupField, "group1"); + // doc.add(new SortedDocValuesField("group", new BytesRef("group1"))); + doc.add(new 
TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + doc = new Document(); + addGroupField(doc, groupField, "group2"); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + // Add document without group field + doc = new Document(); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + + DirectoryReader reader = w.getReader(); + w.close(); + + IndexSearcher searcher = newSearcher(reader); + + // Test default behavior (include null group) + FirstPassGroupingCollector<BytesRef> collector1 = + new FirstPassGroupingCollector<>(new TermGroupSelector(groupField), Sort.RELEVANCE, 10); + searcher.search(new MatchAllDocsQuery(), collector1); + Collection<SearchGroup<BytesRef>> groups1 = collector1.getTopGroups(0); + + assertEquals(3, groups1.size()); // Should include null group + + // Test ignoring docs without group field + FirstPassGroupingCollector<BytesRef> collector2 = + new FirstPassGroupingCollector<>( + new TermGroupSelector(groupField), Sort.RELEVANCE, 10, true); + searcher.search(new MatchAllDocsQuery(), collector2); + Collection<SearchGroup<BytesRef>> groups2 = collector2.getTopGroups(0); + + assertEquals(2, groups2.size()); // Should exclude null group + + reader.close(); + dir.close(); + } + + public void testAllDocsWithoutGroupField() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = + new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random()))); + + // Add documents without group field + for (int i = 0; i < 5; i++) { + Document doc = new Document(); + doc.add(new TextField("content", "test", Field.Store.YES)); + w.addDocument(doc); + } + + DirectoryReader reader = w.getReader(); + w.close(); + + IndexSearcher searcher = newSearcher(reader); + + // Test ignoring docs without group field when all docs lack the field + FirstPassGroupingCollector<BytesRef> collector = + new FirstPassGroupingCollector<>(new TermGroupSelector("group"), Sort.RELEVANCE, 10, true); + searcher.search(new
MatchAllDocsQuery(), collector); + Collection<SearchGroup<BytesRef>> groups = collector.getTopGroups(0); + + assertNull(groups); // Should return null when no groups found + + reader.close(); + dir.close(); + } + private void addGroupField(Document doc, String groupField, String value) { doc.add(new SortedDocValuesField(groupField, new BytesRef(value))); }