A single-header C++ parser for the TREC Web format.
[[nodiscard]] auto read_record(std::istream&) -> Result;
Reads a record from the current position in the stream.
[[nodiscard]] auto read_subsequent_record(std::istream&) -> Result;
Same as read_record, but skips any junk before the first
valid beginning of a record (<DOC> tag).
Result is an alias for std::variant<Record, Error>.
For convenience, a match function is provided to enable basic pattern matching.
The following example shows how to iterate over a TREC collection
and retrieve the total size of all valid records.
#include <iostream>
#include <trecpp/trecpp.hpp>
using trecpp::match;
using trecpp::Result;
using trecpp::Error;
using trecpp::Record;
/// Sums the content length of every valid record read from a TREC stream.
///
/// @param is  Input stream positioned somewhere in a TREC collection.
/// @return    Total of `content_length()` over all successfully parsed records.
std::size_t total_content_size(std::ifstream& is)
{
    std::size_t total{0u};
    while (not is.eof())
    {
        total += match(
            trecpp::read_subsequent_record(is), // will skip to first valid beginning
            [](Record const &rec) -> std::size_t {
                return rec.content_length();
            },
            [](Error const &error) -> std::size_t {
                // Failed reads are logged and contribute nothing to the total.
                std::clog << "Error: " << error << '\n';
                return 0u;
            });
    }
    return total;
}

Alternatively, you can use holds_record and std::get<T>(std::variant)
functions to access returned data:
#include <iostream>
#include <trecpp/trecpp.hpp>
using trecpp::holds_record;
std::size_t total_content_size(std::ifstream& is)
{
std::size_t total{0u};
while (not is.eof())
{
auto record = trecpp::read_subsequent_record(in);
if (holds_record(record)) {
total += std::get<Record>(record).content_length();
}
else {
// You actually don't need to cast in order to print
std::clog << "Warn: " << record << '\n';
}
}
return total;
}