Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
653d21e
First take at exhaustive TAAT
elshize Dec 20, 2018
8f0b54a
Fetch an entire block at a time for TAAT
elshize Dec 22, 2018
9f4e325
Add prefetching for TAAT
elshize Dec 22, 2018
c28e5d2
Return buffer references from posting lists instead of moved buffers.
elshize Dec 22, 2018
5d5519b
Blocked accumulator array for TAAT
elshize Dec 22, 2018
886e580
TAAT maxscore and lazy accumulator
elshize Dec 27, 2018
b0e2824
TAAT MaxScore and Blocked Accumulator
elshize Dec 28, 2018
7737d1f
Remove heap stuff
elshize Dec 28, 2018
0664cba
TAAT optimizations
elshize Dec 29, 2018
9ae6b1a
Vectorize lookup traversal.
elshize Dec 29, 2018
26a05f1
Simple but effective ranked_or with taat
amallia Jan 15, 2019
41ca198
Remove OpenMP
elshize Jan 15, 2019
200be05
Remove OpenMP
elshize Jan 15, 2019
c3400d1
Use template rather than std::function for faster processing
elshize Jan 15, 2019
87252f3
Lazy accumulator fixed
elshize Jan 15, 2019
b9dd1fd
Fix block traversal issue
elshize Jan 16, 2019
5c539b4
Removed ds2i namespace
amallia Jan 16, 2019
0c36632
Moved algos
amallia Jan 16, 2019
6e96d16
Merge with master
amallia Jan 16, 2019
f2c93f9
code cleanup
amallia Jan 16, 2019
7ca53d5
Added comment [skip ci]
amallia Jan 16, 2019
30782ac
Improved queries interface
amallia Jan 16, 2019
3649a7d
removed unused lambdas
amallia Jan 16, 2019
81db028
Removed buffers
amallia Jan 16, 2019
09fbc02
Merge branch 'master' into taat
elshize Jan 17, 2019
f1a8b64
Simplified Maxscore
amallia Jan 19, 2019
eeacec4
Merge branch 'master' into taat
amallia Jan 21, 2019
d08a063
Delete exhaustive_taat_query.hpp
amallia Jan 21, 2019
f254277
Update test_ranked_queries.cpp
amallia Jan 21, 2019
b92e93d
Update test_ranked_queries.cpp
amallia Jan 21, 2019
c8c1b35
Faster MaxScore
amallia Jan 22, 2019
e7cd360
Merge branch 'master' into taat
amallia Jan 22, 2019
e60142b
Added extra check
amallia Jan 22, 2019
a1d25d4
Merge branch 'master' into taat
elshize Jan 25, 2019
2e12eff
Merge remote-tracking branch 'origin/master' into taat
elshize Jan 25, 2019
18be6a0
Use int for 8 bits
elshize Jan 25, 2019
48a0f80
Bug fix: reset counter
elshize Jan 26, 2019
abf75fc
Merge branch 'master' into taat
amallia Jan 27, 2019
cb67ba9
Merge branch 'master' into taat
amallia Jan 29, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions include/pisa/accumulator/blocked_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#pragma once

namespace pisa {

/// Array of per-document score accumulators that additionally records, for
/// every block of `block_size` consecutive documents, the largest score that
/// has been written to that block.
///
/// The per-block maxima let `aggregate()` skip whole blocks whose best score
/// would not enter the top-k queue.
///
/// \tparam block_size  number of consecutive documents per block; must be > 0
template <int block_size>
struct Blocked_Accumulator {

    /// Write-through handle to a single document's accumulator, as returned by
    /// `operator[]`. Every write also raises the maximum of the owning block.
    struct Proxy_Element {
        std::ptrdiff_t document;
        std::vector<float> &accumulators;
        std::vector<float> &accumulators_max;

        /// Overwrites the document's score. The block maximum is only ever
        /// raised here, never lowered, so it remains an upper bound.
        Proxy_Element &operator=(float score) {
            accumulators[document] = score;
            auto &block_max = accumulators_max[document / block_size];
            if (score > block_max) {
                block_max = score;
            }
            return *this;
        }

        /// Adds `delta` to the document's score and raises the block maximum
        /// if the new score exceeds it.
        Proxy_Element &operator+=(float delta) {
            accumulators[document] += delta;
            auto const &score = accumulators[document];
            auto &block_max = accumulators_max[document / block_size];
            if (score > block_max) {
                block_max = score;
            }
            return *this;
        }

        /// Reads the document's current score.
        operator float() const { return accumulators[document]; }
    };

    using reference = Proxy_Element;

    static_assert(block_size > 0, "must be positive");

    /// Number of blocks needed to cover `size` documents (rounded up).
    [[nodiscard]] constexpr static auto calc_block_count(std::size_t size) noexcept -> std::size_t {
        return (size + block_size - 1) / block_size;
    }

    /// Allocates one accumulator per document and one maximum per block.
    Blocked_Accumulator(std::size_t size)
        : m_size(size),
          m_block_count(calc_block_count(size)),
          m_accumulators(size),
          m_accumulators_max(m_block_count) {}

    /// Resets the accumulator for a new query.
    ///
    /// Both the scores and the per-block maxima are zeroed: maxima left over
    /// from a previous query would otherwise keep `aggregate()` from skipping
    /// blocks that received no (or only low) scores in the current query.
    void init() {
        std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0F);
        std::fill(m_accumulators_max.begin(), m_accumulators_max.end(), 0.0F);
    }

    /// Returns a proxy through which the score of `document` is read or written.
    [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element {
        return {document, m_accumulators, m_accumulators_max};
    }

    /// Adds `score_delta` to the score of `document`, updating the block maximum.
    void accumulate(std::ptrdiff_t const document, float score_delta) {
        m_accumulators[document] += score_delta;
        auto const &score = m_accumulators[document];
        auto &block_max = m_accumulators_max[document / block_size];
        if (score > block_max) {
            block_max = score;
        }
    }

    /// Inserts accumulated scores into `topk`, visiting only those blocks
    /// whose recorded maximum would enter the queue.
    void aggregate(topk_queue &topk) {
        for (size_t block = 0; block < m_block_count; ++block) {
            if (not topk.would_enter(m_accumulators_max[block])) {
                continue;
            }
            uint32_t doc = block * block_size;
            // The last block may be partial: clamp to the number of documents.
            uint32_t end = std::min((block + 1) * block_size, m_accumulators.size());
            for (; doc < end; ++doc) {
                topk.insert(m_accumulators[doc], doc);
            }
        }
    }

    /// Number of documents this accumulator was constructed for.
    [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; }

   private:
    std::size_t m_size;
    std::size_t m_block_count;
    std::vector<float> m_accumulators;
    std::vector<float> m_accumulators_max;
};

} // pisa
145 changes: 145 additions & 0 deletions include/pisa/query/algorithm/maxscore_taat_query.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#pragma once

#include "topk_queue.hpp"
#include "util/intrinsics.hpp"

#include "accumulator/blocked_accumulator.hpp"
#include "accumulator/lazy_accumulator.hpp"
#include "accumulator/simple_accumulator.hpp"

namespace pisa {

/// Computes one weight per entry returned by `query_freqs(terms)`, in that
/// order: the scorer's query-term weight multiplied by
/// `wdata.max_term_weight(term)`.
///
/// \param index  inverted index; used for the posting list size of each term
///               and for the total number of documents
/// \param wdata  source of the per-term maximum weights
/// \param terms  query term identifiers
/// \return       vector of weights, one per (term, frequency) pair
template <typename Index, typename WandType>
[[nodiscard]] auto max_weights(Index const &index, WandType const &wdata, term_id_vec terms) {
    // TODO(michal): parametrize scorer_type; didn't do that because this might mean some more
    //               complex refactoring I want to avoid for now.
    using scorer_type = bm25;

    auto query_term_freqs = query_freqs(terms);
    std::vector<float> max_weights;
    max_weights.reserve(query_term_freqs.size());

    // `first` is the term identifier, `second` is passed to the scorer as the
    // term's frequency argument.
    for (auto const &term : query_term_freqs) {
        auto list = index[term.first];
        auto q_weight = scorer_type::query_term_weight(term.second, list.size(), index.num_docs());
        max_weights.push_back(q_weight * wdata.max_term_weight(term.first));
    }
    return max_weights;
}

/// Returns the index permutation that orders `container` according to
/// `sort_function`: element `i` of the result is the index in `container` of
/// the element that belongs at position `i`. `container` is not modified.
template <typename Container, typename Function>
std::vector<std::size_t> sort_permutation(Container const &container, Function sort_function) {
    std::vector<std::size_t> order(container.size());
    std::iota(order.begin(), order.end(), 0);

    // Compare indices by the elements they refer to.
    auto const by_element = [&container, &sort_function](std::size_t lhs, std::size_t rhs) {
        return sort_function(container[lhs], container[rhs]);
    };
    std::sort(order.begin(), order.end(), by_element);
    return order;
}

/// Reorders `container` in place so that afterwards position `i` holds the
/// element that was originally at position `p[i]`.
///
/// The permutation is applied cycle by cycle using swaps; `p` must contain
/// one index per element of `container`.
template <typename Container>
void apply_permutation(Container &container, const std::vector<std::size_t> &p) {
    std::vector<bool> visited(container.size());
    for (std::size_t start = 0; start < container.size(); ++start) {
        if (not visited[start]) {
            visited[start] = true;
            // Walk the cycle that begins at `start`, pulling each element into
            // the slot vacated by the previous swap.
            std::size_t slot = start;
            for (std::size_t next = p[start]; next != start; next = p[next]) {
                std::swap(container[slot], container[next]);
                visited[next] = true;
                slot = next;
            }
        }
    }
}

/// Sorts `key_container` according to `sort_function` and applies the very
/// same reordering to each of `containers`, so that elements that shared an
/// index before the call still share an index afterwards.
///
/// The key container itself is permuted as well; leaving it in its original
/// order would desynchronize it from the containers sorted along with it.
/// Do not pass `key_container` again among `containers`, or it would be
/// permuted twice.
///
/// \param key_container  container whose elements define the order
/// \param sort_function  strict weak ordering on elements of `key_container`
/// \param containers     containers of the same length to reorder in lockstep
template <typename Container, typename... Containers, typename Function>
void sort_many(Container &key_container, Function sort_function, Containers &... containers) {
    // Compute the permutation before anything is moved.
    auto const permutation = sort_permutation(key_container, sort_function);
    apply_permutation(key_container, permutation);
    (apply_permutation(containers, permutation), ...);
}

/// Term-at-a-time top-k query processor that uses per-term maximum weights to
/// limit how many accumulators are created and updated.
///
/// \tparam Index     inverted index type
/// \tparam WandType  provider of per-term maximum weights
/// \tparam Acc       accumulator type; must provide `reference`, `init()`,
///                   `size()`, `operator[]`, `accumulate()` and `aggregate()`
template <typename Index, typename WandType, typename Acc = Simple_Accumulator>
class maxscore_taat_query {
    using accumulator_reference = typename Acc::reference;
    using score_function_type = Score_Function<bm25, WandType>;

   public:
    /// \param index  index to query; must outlive this object (held by reference)
    /// \param wdata  max-weight data; must outlive this object (held by reference)
    /// \param k      number of results to retrieve
    maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k)
        : m_index(index), m_wdata(wdata), m_k(k), m_topk(k), m_accumulators(index.num_docs()) {}

    /// Runs the query for `terms` and returns the number of results collected;
    /// the results themselves are available through `topk()`.
    /// Returns 0 when there are no cursors for the query.
    uint64_t operator()(term_id_vec terms) {
        m_topk.clear();
        auto cws = query::cursors_with_scores(m_index, m_wdata, terms);
        // Work directly on the returned vectors instead of copying them.
        auto &cursors = cws.first;
        auto &score_functions = cws.second;
        auto term_max_weights = max_weights(m_index, m_wdata, terms);
        if (cursors.empty()) {
            return 0;
        }

        // Order terms by decreasing max weight. The weights must be permuted
        // together with the cursors and scorers, because the loop below
        // subtracts `term_max_weights[term]` for the term at `cursors[term]`.
        auto const order = sort_permutation(term_max_weights,
                                            [](auto lhs, auto rhs) { return lhs > rhs; });
        apply_permutation(term_max_weights, order);
        apply_permutation(cursors, order);
        apply_permutation(score_functions, order);

        // Sum of the max weights of all terms not yet fully processed
        // (the current term included).
        float nonessential_sum =
            std::accumulate(term_max_weights.begin(), term_max_weights.end(), 0.0);
        m_accumulators.init();
        uint32_t term = 0;

        // Phase 1: process terms while a document scoring `nonessential_sum`
        // could still enter the top-k built from the previous term.
        for (; term < cursors.size(); ++term) {
            if (not m_topk.would_enter(nonessential_sum)) {
                break;
            }
            m_topk.clear();
            auto cursor = cursors[term];
            auto score = score_functions[term];
            // TODO(antonio): basically here we can do a bit better.
            // before scoring a document, we read its accumulator value and check if the sum of
            // the accumulator value and the upper bound of the maxscores of the missing terms
            // (current included) is greater than the threshold. If it is we score and add it to the
            // accumulator, we go to the next document otherwise.
            for (; cursor.docid() < m_accumulators.size(); cursor.next()) {
                if (m_topk.would_enter(nonessential_sum + m_accumulators[cursor.docid()])) {
                    m_accumulators.accumulate(cursor.docid(),
                                              score(cursor.docid(), cursor.freq()));
                    m_topk.insert(m_accumulators[cursor.docid()]);
                }
            }
            nonessential_sum -= term_max_weights[term];
        }

        // Phase 2: for the remaining terms only documents that already have a
        // positive accumulator are updated; no new accumulators are started.
        for (; term < cursors.size(); ++term) {
            auto cursor = cursors[term];
            auto score = score_functions[term];
            for (; cursor.docid() < m_accumulators.size(); cursor.next()) {
                accumulator_reference accumulator = m_accumulators[cursor.docid()];
                if (accumulator > 0) {
                    accumulator += score(cursor.docid(), cursor.freq());
                }
            }
        }

        // The queue so far only served as a threshold estimate; rebuild it
        // from the final accumulator values.
        m_topk.clear();
        m_accumulators.aggregate(m_topk);
        m_topk.finalize();
        return m_topk.topk().size();
    }

    /// Results of the most recent query as (score, document) pairs.
    std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

   private:
    Index const &m_index;
    WandType const &m_wdata;
    uint64_t m_k;
    topk_queue m_topk;
    Acc m_accumulators;
};

template <typename Acc, typename Index, typename WandType>
[[nodiscard]] auto make_maxscore_taat_query(Index const &index, WandType const &wdata, uint64_t k) {
return maxscore_taat_query<Index, WandType, Acc>(index, wdata, k);
}

}; // namespace pisa
1 change: 1 addition & 0 deletions include/pisa/query/queries.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,4 @@ template <typename Index, typename WandType>
#include "algorithm/ranked_or_query.hpp"
#include "algorithm/wand_query.hpp"
#include "algorithm/ranked_or_taat_query.hpp"
#include "algorithm/maxscore_taat_query.hpp"
8 changes: 8 additions & 0 deletions src/queries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ void perftest(const std::string &index_filename,
} else if (t == "ranked_or_taat_lazy" && wand_data_filename) {
query_fun =
pisa::make_ranked_or_taat_query<pisa::Lazy_Accumulator<4>>(index, wdata, k);
} else if (t == "ranked_or_taat_blocked" && wand_data_filename) {
query_fun =
pisa::make_ranked_or_taat_query<pisa::Blocked_Accumulator<1024>>(index, wdata, k);
} else if (t == "maxscore_taat" && wand_data_filename) {
query_fun = pisa::make_maxscore_taat_query<pisa::Simple_Accumulator>(index, wdata, k);
} else if (t == "maxscore_taat_blocked" && wand_data_filename) {
query_fun =
pisa::make_maxscore_taat_query<pisa::Blocked_Accumulator<1024>>(index, wdata, k);
} else {
spdlog::error("Unsupported query type: {}", t);
break;
Expand Down
30 changes: 29 additions & 1 deletion test/test_ranked_queries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,27 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "block_max_maxscore")
test_against_or(bmm_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat")
{
    // Exercise the MaxScore TAAT algorithm with the simple accumulator. This
    // case previously instantiated ranked_or_taat_query, which left
    // maxscore_taat_query<..., Simple_Accumulator> untested.
    pisa::maxscore_taat_query<index_type, WandType, pisa::Simple_Accumulator> maxscore_taat_q(
        index, wdata, 10);
    test_against_or(maxscore_taat_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_lazy")
{
    // Ranked-OR TAAT over Lazy_Accumulator<8>, run through the shared
    // test_against_or check.
    using query_type =
        pisa::ranked_or_taat_query<index_type, WandType, pisa::Lazy_Accumulator<8>>;
    query_type lazy_taat_q(index, wdata, 10);
    test_against_or(lazy_taat_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "maxscore_taat_blocked")
{
    // MaxScore TAAT over Blocked_Accumulator<1024> with k = 10, run through
    // the shared test_against_or check.
    using query_type =
        pisa::maxscore_taat_query<index_type, WandType, pisa::Blocked_Accumulator<1024>>;
    query_type blocked_maxscore_q(index, wdata, 10);
    test_against_or(blocked_maxscore_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat")
{

Expand All @@ -108,7 +129,14 @@ TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat")
test_against_or(ranked_or_taat_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_lazy")
TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_blocked")
{
    // Ranked-OR TAAT over Blocked_Accumulator<1024>, run through the shared
    // test_against_or check.
    using query_type =
        pisa::ranked_or_taat_query<index_type, WandType, pisa::Blocked_Accumulator<1024>>;
    query_type blocked_taat_q(index, wdata, 10);
    test_against_or(blocked_taat_q);
}

TEST_CASE_METHOD(pisa::test::index_initialization, "ranked_or_taat_query_lazy")
{
pisa::ranked_or_taat_query<index_type, WandType, pisa::Lazy_Accumulator<8>> ranked_or_taat_q(
index, wdata, 10);
Expand Down