Skip to content
Merged
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,8 @@ Optimizations
and uses an optimized branchless approach. Any subclasses that have implemented the optimized method
need to remove it as it will disappear in Lucene 11. (Uwe Schindler)

* GITHUB#15167: FirstPassGroupingCollector supports ignoring docs without group field (Binlong Gao)

Changes in Runtime Behavior
---------------------
* GITHUB#14823: Decrease TieredMergePolicy's default number of segments per
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.mutable.MutableValue;

/**
* FirstPassGroupingCollector is the first of two passes necessary to collect grouped hits. This
Expand All @@ -44,6 +45,7 @@
public class FirstPassGroupingCollector<T> extends SimpleCollector {

private final GroupSelector<T> groupSelector;
private final boolean ignoreDocsWithoutGroupField;

private final FieldComparator<?>[] comparators;
private final LeafFieldComparator[] leafComparators;
Expand Down Expand Up @@ -74,7 +76,28 @@ public class FirstPassGroupingCollector<T> extends SimpleCollector {
@SuppressWarnings({"unchecked", "rawtypes"})
public FirstPassGroupingCollector(
GroupSelector<T> groupSelector, Sort groupSort, int topNGroups) {
// Delegate to the four-argument constructor with ignoreDocsWithoutGroupField = false, so
// documents that lack the group field are not skipped by that option.
this(groupSelector, groupSort, topNGroups, false);
}

/**
* Create the first pass collector with ignoreDocsWithoutGroupField
*
* @param groupSelector a GroupSelector used to define groups
* @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each
* group according to groupSort, determines how that group sorts against other groups. This
* must be non-null, i.e., if you want to groupSort by relevance use Sort.RELEVANCE.
* @param topNGroups How many top groups to keep.
* @param ignoreDocsWithoutGroupField if true, ignore documents that don't have the group field
* instead of putting them in a null group
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public FirstPassGroupingCollector(
GroupSelector<T> groupSelector,
Sort groupSort,
int topNGroups,
boolean ignoreDocsWithoutGroupField) {
this.groupSelector = groupSelector;
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
if (topNGroups < 1) {
throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
}
Expand Down Expand Up @@ -198,12 +221,14 @@ public void collect(int doc) throws IOException {
return;
}

// TODO: should we add option to mean "ignore docs that
// don't have the group field" (instead of stuffing them
// under null group)?
groupSelector.advanceTo(doc);
T groupValue = groupSelector.currentValue();

// Skip documents without group field if option is enabled
if (ignoreDocsWithoutGroupField && isNullGroupValue(groupValue)) {
return;
}

final CollectedSearchGroup<T> group = groupMap.get(groupValue);

if (group == null) {
Expand Down Expand Up @@ -363,4 +388,15 @@ protected void doSetNextReader(LeafReaderContext readerContext) throws IOExcepti
public GroupSelector<T> getGroupSelector() {
return groupSelector;
}

/**
 * Reports whether a group value stands for "this document has no group field".
 *
 * <p>That is the case for a {@code null} value, and for a {@link MutableValue} whose {@code
 * exists()} returns false (the representation used by ValueSourceGroupSelector). Every other
 * value is treated as a real group.
 *
 * @param groupValue the value produced by the group selector for the current document
 * @return true if the value denotes a missing group field
 */
private boolean isNullGroupValue(T groupValue) {
  // instanceof is false for null, so the null case is decided solely by the first operand below.
  final boolean absentMutableValue =
      groupValue instanceof MutableValue && !((MutableValue) groupValue).exists();
  return groupValue == null || absentMutableValue;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class GroupingSearch {
private boolean cacheScores;
private boolean allGroups;
private boolean allGroupHeads;
private boolean ignoreDocsWithoutGroupField;

private Collection<?> matchingGroups;
private Bits matchingGroupHeads;
Expand Down Expand Up @@ -138,7 +139,7 @@ protected TopGroups groupByFieldOrFunction(
int topN = groupOffset + groupLimit;

final FirstPassGroupingCollector firstPassCollector =
new FirstPassGroupingCollector(grouper, groupSort, topN);
new FirstPassGroupingCollector(grouper, groupSort, topN, ignoreDocsWithoutGroupField);
final AllGroupsCollector allGroupsCollector =
allGroups ? new AllGroupsCollector(grouper) : null;
final AllGroupHeadsCollector allGroupHeadsCollector =
Expand Down Expand Up @@ -358,4 +359,16 @@ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
public Bits getAllGroupHeads() {
return matchingGroupHeads;
}

/**
 * Sets whether documents that don't have the group field are ignored instead of being put in a
 * null group. The option is off (false) unless this method is called with true.
 *
 * <p>The value is handed to the {@link FirstPassGroupingCollector} created when grouping by field
 * or function. NOTE(review): the AllGroupsCollector created alongside it is constructed without
 * this flag, so results derived from it may still account for the null group; confirm whether
 * that is intended.
 *
 * @param ignoreDocsWithoutGroupField Whether to ignore documents without group field
 * @return <code>this</code>
 */
public GroupingSearch setIgnoreDocsWithoutGroupField(boolean ignoreDocsWithoutGroupField) {
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,49 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException {
}
}

/**
 * Indexes two matching documents that carry a group field and one that does not, then runs the
 * same query through a default GroupingSearch and through one with
 * setIgnoreDocsWithoutGroupField(true). The check is that enabling the option never yields more
 * groups than the default configuration.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  Shard shard = new Shard();

  // Two documents with a group field (ids 1 and 2), indexed in that order.
  for (int groupId = 1; groupId <= 2; groupId++) {
    Document grouped = new Document();
    grouped.add(new TextField("text", "foo", Field.Store.NO));
    addGroupField(grouped, groupId);
    shard.writer.addDocument(grouped);
  }

  // One matching document that has no group field at all.
  Document ungrouped = new Document();
  ungrouped.add(new TextField("text", "foo", Field.Store.NO));
  shard.writer.addDocument(ungrouped);

  IndexSearcher searcher = shard.getIndexSearcher();
  Query query = new TermQuery(new Term("text", "foo"));

  // Baseline: the option is left at its default.
  GroupingSearch defaultSearch = new GroupingSearch(getGroupSelector());
  TopGroups<T> defaultResult = defaultSearch.search(searcher, query, 0, 10);
  int defaultGroupCount = defaultResult.groups.length;

  // Same search with documents lacking the group field ignored.
  GroupingSearch ignoringSearch =
      new GroupingSearch(getGroupSelector()).setIgnoreDocsWithoutGroupField(true);
  TopGroups<T> ignoringResult = ignoringSearch.search(searcher, query, 0, 10);
  int ignoreGroupCount = ignoringResult.groups.length;

  String failureMessage =
      "Expected ignoreGroupCount <= defaultGroupCount, got "
          + ignoreGroupCount
          + " vs "
          + defaultGroupCount;
  assertTrue(failureMessage, !(ignoreGroupCount > defaultGroupCount));

  shard.close();
}

private void assertSortsBefore(GroupDocs<T> first, GroupDocs<T> second) {
Object[] groupSortValues = second.groupSortValues();
Object[] prevSortValues = first.groupSortValues();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
Expand Down Expand Up @@ -193,6 +194,84 @@ public void testBasic() throws Exception {
dir.close();
}

/**
 * Indexes two documents with distinct group values and one document without the group field,
 * then compares FirstPassGroupingCollector with and without the ignoreDocsWithoutGroupField
 * flag: three top groups are expected by default, two when the flag is set.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));

  String groupField = "group";

  // One document per group value, each also carrying the shared content field.
  for (String groupName : new String[] {"group1", "group2"}) {
    Document grouped = new Document();
    addGroupField(grouped, groupField, groupName);
    grouped.add(new TextField("content", "test", Field.Store.YES));
    w.addDocument(grouped);
  }

  // A third document that has content but no group field.
  Document ungrouped = new Document();
  ungrouped.add(new TextField("content", "test", Field.Store.YES));
  w.addDocument(ungrouped);

  DirectoryReader reader = w.getReader();
  w.close();

  IndexSearcher searcher = newSearcher(reader);

  // Default behavior: the document without a group value forms its own (null) group.
  FirstPassGroupingCollector<BytesRef> defaultCollector =
      new FirstPassGroupingCollector<>(new TermGroupSelector(groupField), Sort.RELEVANCE, 10);
  searcher.search(new MatchAllDocsQuery(), defaultCollector);
  Collection<SearchGroup<BytesRef>> defaultGroups = defaultCollector.getTopGroups(0);

  assertEquals(3, defaultGroups.size());

  // With the flag set, the document without a group value is dropped.
  FirstPassGroupingCollector<BytesRef> ignoringCollector =
      new FirstPassGroupingCollector<>(
          new TermGroupSelector(groupField), Sort.RELEVANCE, 10, true);
  searcher.search(new MatchAllDocsQuery(), ignoringCollector);
  Collection<SearchGroup<BytesRef>> remainingGroups = ignoringCollector.getTopGroups(0);

  assertEquals(2, remainingGroups.size());

  reader.close();
  dir.close();
}

/**
 * Indexes five documents that all lack the group field and checks that a
 * FirstPassGroupingCollector created with ignoreDocsWithoutGroupField = true returns null from
 * getTopGroups(0).
 */
public void testAllDocsWithoutGroupField() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));

  // Five documents, none of which has the "group" field.
  int remaining = 5;
  while (remaining > 0) {
    Document ungrouped = new Document();
    ungrouped.add(new TextField("content", "test", Field.Store.YES));
    w.addDocument(ungrouped);
    remaining--;
  }

  DirectoryReader reader = w.getReader();
  w.close();

  IndexSearcher searcher = newSearcher(reader);

  // Every document is ignored, so the collector never sees a group.
  TermGroupSelector selector = new TermGroupSelector("group");
  FirstPassGroupingCollector<BytesRef> ignoringCollector =
      new FirstPassGroupingCollector<>(selector, Sort.RELEVANCE, 10, true);
  searcher.search(new MatchAllDocsQuery(), ignoringCollector);
  Collection<SearchGroup<BytesRef>> topGroups = ignoringCollector.getTopGroups(0);

  assertNull(topGroups);

  reader.close();
  dir.close();
}

/** Adds a SortedDocValuesField named {@code groupField} holding {@code value} to the document. */
private void addGroupField(Document doc, String groupField, String value) {
  final BytesRef groupBytes = new BytesRef(value);
  final SortedDocValuesField groupDocValues = new SortedDocValuesField(groupField, groupBytes);
  doc.add(groupDocValues);
}
Expand Down
Loading