Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ Optimizations
* GITHUB#15151: Use `SimScorer#score` bulk API to compute impact scores per
block of postings. (Adrien Grand)

* GITHUB#15167: FirstPassGroupingCollector and GroupingSearch can optionally ignore documents
without the group field instead of putting them in a null group. (Binlong Gao)

* GITHUB#15160: Increased the size used for blocks of postings from 128 to 256.
This gives a noticeable speedup to many queries. (Adrien Grand)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.mutable.MutableValue;

/**
* FirstPassGroupingCollector is the first of two passes necessary to collect grouped hits. This
Expand All @@ -44,6 +45,7 @@
public class FirstPassGroupingCollector<T> extends SimpleCollector {

private final GroupSelector<T> groupSelector;
private final boolean ignoreDocsWithoutGroupField;

private final FieldComparator<?>[] comparators;
private final LeafFieldComparator[] leafComparators;
Expand Down Expand Up @@ -74,7 +76,28 @@ public class FirstPassGroupingCollector<T> extends SimpleCollector {
public FirstPassGroupingCollector(
    GroupSelector<T> groupSelector, Sort groupSort, int topNGroups) {
  // Passing false for ignoreDocsWithoutGroupField means documents that don't have the group
  // field are not ignored. No @SuppressWarnings is needed on this constructor: it only
  // delegates and performs no unchecked or raw-typed operation itself.
  this(groupSelector, groupSort, topNGroups, false);
}

/**
* Creates the first pass collector, optionally ignoring documents that have no group field.
*
* @param groupSelector a GroupSelector used to define groups
* @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each
* group according to groupSort, determines how that group sorts against other groups. This
* must be non-null, i.e., if you want to groupSort by relevance use Sort.RELEVANCE.
* @param topNGroups How many top groups to keep.
* @param ignoreDocsWithoutGroupField if true, ignore documents that don't have the group field
* instead of putting them in a null group
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public FirstPassGroupingCollector(
GroupSelector<T> groupSelector,
Sort groupSort,
int topNGroups,
boolean ignoreDocsWithoutGroupField) {
this.groupSelector = groupSelector;
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
if (topNGroups < 1) {
throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
}
Expand Down Expand Up @@ -198,12 +221,14 @@ public void collect(int doc) throws IOException {
return;
}

// TODO: should we add option to mean "ignore docs that
// don't have the group field" (instead of stuffing them
// under null group)?
groupSelector.advanceTo(doc);
T groupValue = groupSelector.currentValue();

// Skip documents without group field if option is enabled
if (ignoreDocsWithoutGroupField && isNullGroupValue(groupValue)) {
return;
}

final CollectedSearchGroup<T> group = groupMap.get(groupValue);

if (group == null) {
Expand Down Expand Up @@ -363,4 +388,15 @@ protected void doSetNextReader(LeafReaderContext readerContext) throws IOExcepti
public GroupSelector<T> getGroupSelector() {
  // Hands back the selector this collector was constructed with.
  return this.groupSelector;
}

/**
 * Tells whether a group value stands for "no group field on this document".
 *
 * @param groupValue the group value to inspect, possibly null
 * @return true if {@code groupValue} is null, or if it is a {@link MutableValue} whose {@code
 *     exists()} reports false; false otherwise
 */
private boolean isNullGroupValue(T groupValue) {
  // The MutableValue case covers ValueSourceGroupSelector, where a missing value is signalled
  // through exists() rather than through a null reference.
  return groupValue == null
      || (groupValue instanceof MutableValue value && value.exists() == false);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class GroupingSearch {
private boolean cacheScores;
private boolean allGroups;
private boolean allGroupHeads;
private boolean ignoreDocsWithoutGroupField;

private Collection<?> matchingGroups;
private Bits matchingGroupHeads;
Expand Down Expand Up @@ -138,7 +139,7 @@ protected TopGroups groupByFieldOrFunction(
int topN = groupOffset + groupLimit;

final FirstPassGroupingCollector firstPassCollector =
new FirstPassGroupingCollector(grouper, groupSort, topN);
new FirstPassGroupingCollector(grouper, groupSort, topN, ignoreDocsWithoutGroupField);
final AllGroupsCollector allGroupsCollector =
allGroups ? new AllGroupsCollector(grouper) : null;
final AllGroupHeadsCollector allGroupHeadsCollector =
Expand Down Expand Up @@ -358,4 +359,16 @@ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
public Bits getAllGroupHeads() {
  // Exposes the matchingGroupHeads field as-is.
  return this.matchingGroupHeads;
}

/**
* Sets whether documents that don't have the group field are ignored instead of being put in a
* null group. The flag is handed to the FirstPassGroupingCollector created in
* groupByFieldOrFunction.
*
* @param ignoreDocsWithoutGroupField Whether to ignore documents without group field
* @return <code>this</code>
*/
public GroupingSearch setIgnoreDocsWithoutGroupField(boolean ignoreDocsWithoutGroupField) {
this.ignoreDocsWithoutGroupField = ignoreDocsWithoutGroupField;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,49 @@ private void indexRandomDocs(RandomIndexWriter w) throws IOException {
}
}

/**
 * Indexes two documents that carry a group field and one that does not, then checks that a
 * search with {@code setIgnoreDocsWithoutGroupField(true)} never returns more groups than the
 * same search with the default settings.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  Shard shard = new Shard();
  // Release the shard even when indexing, searching or the assertion below fails, so that a
  // test failure is not accompanied by leaked resources.
  try {
    // Add documents with group field
    for (int groupId = 1; groupId <= 2; groupId++) {
      Document grouped = new Document();
      grouped.add(new TextField("text", "foo", Field.Store.NO));
      addGroupField(grouped, groupId);
      shard.writer.addDocument(grouped);
    }

    // Add document without group field
    Document ungrouped = new Document();
    ungrouped.add(new TextField("text", "foo", Field.Store.NO));
    shard.writer.addDocument(ungrouped);

    IndexSearcher searcher = shard.getIndexSearcher();
    Query query = new TermQuery(new Term("text", "foo"));

    // Test default behavior (include null group)
    GroupingSearch grouping1 = new GroupingSearch(getGroupSelector());
    TopGroups<T> groups1 = grouping1.search(searcher, query, 0, 10);
    int defaultGroupCount = groups1.groups.length;

    // Test ignoring docs without group field
    GroupingSearch grouping2 = new GroupingSearch(getGroupSelector());
    grouping2.setIgnoreDocsWithoutGroupField(true);
    TopGroups<T> groups2 = grouping2.search(searcher, query, 0, 10);
    int ignoreGroupCount = groups2.groups.length;

    assertTrue(
        "Expected ignoreGroupCount <= defaultGroupCount, got "
            + ignoreGroupCount
            + " vs "
            + defaultGroupCount,
        ignoreGroupCount <= defaultGroupCount);
  } finally {
    shard.close();
  }
}

private void assertSortsBefore(GroupDocs<T> first, GroupDocs<T> second) {
Object[] groupSortValues = second.groupSortValues();
Object[] prevSortValues = first.groupSortValues();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
Expand Down Expand Up @@ -193,6 +194,84 @@ public void testBasic() throws Exception {
dir.close();
}

/**
 * Indexes two documents with distinct group values plus one document without the group field,
 * then checks that the first pass collector reports three groups by default (the extra one
 * being the null group) and only two groups when ignoreDocsWithoutGroupField is true.
 */
public void testIgnoreDocsWithoutGroupField() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));

  String groupField = "group";
  // Add documents with group field
  Document doc = new Document();
  addGroupField(doc, groupField, "group1");
  doc.add(new TextField("content", "test", Field.Store.YES));
  w.addDocument(doc);

  doc = new Document();
  addGroupField(doc, groupField, "group2");
  doc.add(new TextField("content", "test", Field.Store.YES));
  w.addDocument(doc);

  // Add document without group field
  doc = new Document();
  doc.add(new TextField("content", "test", Field.Store.YES));
  w.addDocument(doc);

  DirectoryReader reader = w.getReader();
  w.close();

  // Resources are closed in reverse order (reader, then dir), also when an assertion fails.
  try (dir;
      reader) {
    IndexSearcher searcher = newSearcher(reader);

    // Test default behavior (include null group)
    FirstPassGroupingCollector<BytesRef> collector1 =
        new FirstPassGroupingCollector<>(new TermGroupSelector(groupField), Sort.RELEVANCE, 10);
    searcher.search(new MatchAllDocsQuery(), collector1);
    Collection<SearchGroup<BytesRef>> groups1 = collector1.getTopGroups(0);

    assertEquals(3, groups1.size()); // Should include null group

    // Test ignoring docs without group field
    FirstPassGroupingCollector<BytesRef> collector2 =
        new FirstPassGroupingCollector<>(
            new TermGroupSelector(groupField), Sort.RELEVANCE, 10, true);
    searcher.search(new MatchAllDocsQuery(), collector2);
    Collection<SearchGroup<BytesRef>> groups2 = collector2.getTopGroups(0);

    assertEquals(2, groups2.size()); // Should exclude null group
  }
}

/**
 * Indexes five documents that all lack the group field and checks that a first pass collector
 * created with ignoreDocsWithoutGroupField set to true returns null from getTopGroups(0).
 */
public void testAllDocsWithoutGroupField() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));

  // Add documents without group field
  for (int i = 0; i < 5; i++) {
    Document doc = new Document();
    doc.add(new TextField("content", "test", Field.Store.YES));
    w.addDocument(doc);
  }

  DirectoryReader reader = w.getReader();
  w.close();

  // Resources are closed in reverse order (reader, then dir), also when the assertion fails.
  try (dir;
      reader) {
    IndexSearcher searcher = newSearcher(reader);

    // Test ignoring docs without group field when all docs lack the field
    FirstPassGroupingCollector<BytesRef> collector =
        new FirstPassGroupingCollector<>(
            new TermGroupSelector("group"), Sort.RELEVANCE, 10, true);
    searcher.search(new MatchAllDocsQuery(), collector);
    Collection<SearchGroup<BytesRef>> groups = collector.getTopGroups(0);

    assertNull(groups); // Should return null when no groups found
  }
}

/** Adds a SortedDocValuesField named {@code groupField} holding {@code value} to {@code doc}. */
private void addGroupField(Document doc, String groupField, String value) {
  BytesRef groupBytes = new BytesRef(value);
  doc.add(new SortedDocValuesField(groupField, groupBytes));
}
Expand Down
Loading