diff --git a/.gitignore b/.gitignore
index a9d37c5..bc5c4e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
target
Cargo.lock
+
+**/*.iml
+.idea/
+*.reject
+.DS_Store
+
diff --git a/Cargo.toml b/Cargo.toml
index 6ced680..48b6003 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,29 +1,25 @@
[package]
-name = "noise"
-version = "0.1.0"
-authors = ["Volker Mische "]
+name = "noise_search"
+version = "0.5.0"
+authors = ["Damien Katz ", "Volker Mische "]
repository = "https://github.com/pipedown/noise.git"
-homepage = "https://github.com/pipedown/noise.git"
+homepage = "http://noisesearch.org"
license = "MIT OR Apache-2.0"
readme = "README.md"
description = "Nested Object Inverted Search Engine"
build = "build.rs"
+[lib]
+name = "noise_search"
+crate-type = ["lib"]
+
[dependencies]
-capnp = "0.7.4"
-rustc-serialize= "0.3.19"
+rustc-serialize = "0.3.19"
stemmer = "0.3.2"
unicode-normalization = "0.1.2"
+unicode-segmentation = "0.1.2"
+rocksdb = "0.6.0"
+varint = "0.9.0"
+uuid = { version = "0.3", features = ["v4"] }
-[dependencies.rocksdb]
-git = "https://github.com/vmx/rust-rocksdb.git"
-branch = "vmx"
-
-[dependencies.unicode-segmentation]
-git = "https://github.com/vmx/unicode-segmentation.git"
-branch = "unicode-word-indices"
-
-
-[build-dependencies]
-capnpc = "0.7.2"
diff --git a/README.md b/README.md
index 13f557f..386c1f9 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@ nature of JSON, and will allow:
* Case sensitive exact word and sentence match
* Arbitrary boolean nesting
* Greater than/less Than matching
+
+[Query Language Reference here](https://github.com/pipedown/noise/blob/master/query_language_reference.md)

Installation
@@ -21,6 +23,7 @@ Installation

### Dependencies

* [RocksDB](http://rocksdb.org/)
+ * [capnp-tool](https://capnproto.org/capnp-tool.html)

### Build
diff --git a/build.rs b/build.rs
index fa6c048..46a4178 100644
--- a/build.rs
+++ b/build.rs
@@ -1,5 +1,5 @@
-extern crate capnpc;

fn main() {
-    ::capnpc::compile("capnp", &["capnp/records.capnp"]).unwrap();
+    println!("cargo:rerun-if-changed=src/");
+    println!("cargo:rerun-if-changed=tests/");
}
diff --git a/capnp/main.rs b/capnp/main.rs
deleted file mode 100644
index e30c81f..0000000
--- a/capnp/main.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-//extern crate capnp;
-//
-//pub mod records_capnp {
-//    include!(concat!(env!("OUT_DIR"), "/records_capnp.rs"));
-//}
-//
-//
-//pub mod records {
-//    use records_capnp::{header};
-//    //use capnp::serialize_packed;
-//}
diff --git a/capnp/records.capnp b/capnp/records.capnp
deleted file mode 100644
index caaa5f1..0000000
--- a/capnp/records.capnp
+++ /dev/null
@@ -1,21 +0,0 @@
-@0x9266127bb5310c6c;
-
-struct Header {
-  version @0 :UInt64;
-  highSeq @1 :UInt64;
-}
-
-struct Payload {
-  struct ArrayoffsetsToWordinfo {
-    struct Wordinfo {
-      stemmedOffset @0 :UInt64;
-      suffixOffset @1 :UInt64;
-      suffixText @2 :Text;
-    }
-
-    arrayoffsets @0 :List(UInt64);
-    wordinfos @1 :List(Wordinfo);
-  }
-
-  arrayoffsetsToWordinfos @0 :List(ArrayoffsetsToWordinfo);
-}
diff --git a/query_language_reference.md b/query_language_reference.md
new file mode 100644
index 0000000..e264efe
--- /dev/null
+++ b/query_language_reference.md
@@ -0,0 +1,781 @@
+# Noise Query Language
+
+The Noise query language is an expressive example-based syntax for finding documents, formatting and returning specific information in the documents, performing relevancy scoring, ordering and aggregations.
+
+## Find Clause
+
+All queries have a `find` clause followed by an example-based query syntax.
+It is a combination of expressions that consist of three parts: the key to query, an operator, and the value to match.
+
+This query will return the `_id` of every document containing `{"foo": "bar",...}`:
+
+```
+find {foo: == "bar"}
+```
+
+This query will match all documents in the index and return their `_id`.
+
+```
+find {}
+```
+
+To match on multiple fields, or even nested fields, simply construct the same JSON structure in query form.
+
+Match on two fields:
+
+```
+find {foo: == "bar", fizz: == "buzz"}
+```
+
+Match on fields, one nested within another:
+
+```
+find {foo: == "bar", fizz: {fazz: == "buzz"}}
+```
+
+### Word Match Operator
+
+`~=` is the full text match operator. Use it to find a word in a text field.
+
+```
+find {body: ~= "word"}
+```
+
+Put multiple words in the quoted string to find a phrase in the field.
+
+```
+find {body: ~= "a multi word sentence"}
+```
+
+To find words that are within a specified distance of each other, put the maximum word distance in the operator. This example will return results where each word is within 50 words of the others.
+
+```
+find {body: ~50= "bitcoin gold price"}
+```
+
+### Comparison Operators
+
+Noise supports the following comparison operators:
+
+|Operator|Description|Types
+---------|-----------|-----
+|`==`|Equality|Strings, Numbers, true, false, null
+|`>`|Greater Than|Numbers
+|`<`|Less Than|Numbers
+|`>=`|Greater Than or Equal|Numbers
+|`<=`|Less Than or Equal|Numbers
+
+Noise does not do type conversions between datatypes. Strings only compare with strings, numbers only compare with numbers, etc.
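+Because there is no coercion, a number in a query only matches a number in a document. For example, given a hypothetical document `{"age": 5}`, the first query below would match it while the second would not, since the string `"5"` is never converted to a number:
+
+```
+find {age: == 5}
+find {age: == "5"}
+```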
+### Finding Things in Arrays
+
+Let's say you have a document like this with text in an array:
+
+```
+{"foo": ["bar", "baz"]}
+```
+
+To find an element with the value `"baz"` in the array, use syntax like this:
+
+```
+find {foo:[ == "baz"]}
+```
+
+If objects are nested in an array, like this:
+
+```
+{"foo": [{"fiz": "bar"}, {"fiz": "baz"}]}
+```
+
+To find a `{"fiz": "baz"}` in the array, use syntax like this:
+
+```
+find {foo: [{fiz: == "baz"}]}
+```
+
+### Boolean Logic and Parens
+
+Noise has full support for boolean logic using `&&` (logical AND) and `||` (logical OR) operators and nesting logic with parens.
+
+The comma `,` in objects is actually the same as the `&&` operator. They can be used interchangeably for whichever is more readable.
+
+Find a doc with `"foo"` or `"bar"` in the `body`:
+
+```
+find {body: ~= "foo" || body: ~= "bar"}
+```
+
+Find a doc that has `"foo"` or `"bar"` and has `"baz"` or `"biz"` in the `body`:
+
+```
+find {(body: ~= "foo" || body: ~= "bar") &&
+      (body: ~= "baz" || body: ~= "biz")}
+```
+
+The fields can be nested as well. Find a doc where the nested field `fiz` contains either `"baz"` or `"biz"`.
+
+```
+find {foo: {fiz: ~= "baz" || fiz: ~= "biz"}}
+```
+
+
+### Not Operator
+
+Use the `!` (logical NOT) to exclude matching criteria.
+
+Find docs where `foo` has value `"bar"` and `fab` does not have value `"baz"`:
+
+```
+find {foo: == "bar", fab: != "baz"}
+```
+
+You can use logical not with parentheses to negate everything enclosed. This example finds docs where `foo` has value `"bar"` and `fab` does not have value `"baz"` or `"biz"`:
+
+```
+find {foo: == "bar", !(fab: == "baz" || fab: == "biz")}
+```
+
+You cannot have every clause negated, as that is a very resource intensive operation. Queries need at least one non-negated clause.
+
+Illegal:
+
+```
+find {foo: !~= "bar" && foo: !~= "baz"}
+```
+
+Illegal:
+
+```
+find {!(foo: ~= "bar" && foo: ~= "baz")}
+```
+
+Also double negation is not allowed.
+
+Illegal:
+
+```
+find {foo: ~= "waz" && !(foo: ~= "bar" && foo: !~= "baz")}
+```
+
+Workarounds for this limitation are:
+
+ - Do a `find {}` and filter out the results in your application
+ - Add a field that will always match and use it in your condition. Example:
+   ```
+   find {alwaystrue: == true && foo: !~= "bar"}
+   ```
+
+### Relevancy Scoring and Boosting
+
+Relevancy scoring uses a combination of the boolean model and a Term Frequency/Inverse Document Frequency (TF/IDF) scoring system, very similar to Lucene and Elasticsearch. The details of the scoring model are beyond the scope of this document.
+
+To return results in relevancy score order (most relevant first), simply use the order clause with the `score()` function.
+
+```
+find {subject: ~= "hammer" || body: ~= "hammer"}
+order score() desc
+```
+
+But if you want matches in `subject` fields to score higher than in `body` fields, you can boost the score with the `^` operator. It is a multiplier of the scores of the associated clauses.
+
+This boosts `subject` matches by 2x:
+
+```
+find {subject: ~= "hammer"^2 || body: ~= "hammer"}
+order score() desc
+```
+
+You can also boost everything in parentheses, objects or arrays:
+
+```
+find {(subject: ~= "hammer" || subject: ~= "nails")^2 ||
+      body: ~= "hammer" || body: ~= "nails"}
+order score() desc
+```
+Another way to express the same thing:
+
+```
+find {subject: ~= "hammer" || subject: ~= "nails"}^2 ||
+     {body: ~= "hammer" || body: ~= "nails"}
+order score() desc
+```
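+As a concrete illustration of boosting (the numbers below are taken from `repl-tests/scoring.noise` in this change set, which returns `score()` directly), a `^2` boost exactly doubles the score a clause contributes:
+
+```Thrift
+find {bar: ~= "quick brown fox"}
+return score()
+// [0.05966803431510925]
+
+find {bar: ~= "quick brown fox"^2}
+return score()
+// [0.1193360686302185]
+```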
+// [{ +// "_id": "example", +// "foo": "bar", +// "baz": {"biz": "bar"}, +// "faz": [ +// {"fiz": 213}, +// {"biz": 5463}, +// {"biz": 73} +// ] +// }] +``` + +To return a specific field, place the field name after the dot: + +```Thrift +find {foo: == "bar"} +return .baz +// [{"biz": "bar"}] +``` + +To return a nested field, use another dot: + +```Thrift +find {foo: == "bar"} +return .baz.biz +// ["bar"] +``` + +To return an array element, use the array notation: + +```Thrift +find {foo: == "bar"} +return .faz[1] +// [{"biz": 5463}] +``` + +To return an object field nested in the array, add a dot after the array notation: + +```Thrift +find {foo: == "bar"} +return .faz[1].biz +// [5463] +``` + +To return multiple values, embed the return paths in other JSON structures. + +For each match this example returns 2 values inside an array: + +```Thrift +find {foo: == "bar"} +return [.baz, .faz] +// [[ +// {"biz": "bar"}, +// [{"fiz": 213}, {"biz": 5463}, {"biz": 73}] +// ]] +``` + +For each match this example return 2 values inside an object: + +```Thrift +find {foo: == "bar"} +return {baz: .baz, faz: .faz} +// [{ +// "baz": {"biz": "bar"}, +// "faz": [{"fiz": 213}, {"biz": 5463}, {"biz": 73}] +// }] +``` + +### Missing Values + +Sometimes you'll want to return a field that doesn't exist on a matching document. When that happens, `null` is returned. + +If you'd like a different value to be returned, use the `default=` option, like this: + +```Thrift +find {foo: == "bar"} +return .hammer default=0 +// [0] +``` + +Each returned value can have a default as well. + +```Thrift +find {foo: == "bar"} +return {baz: .baz default=0, hammer: .hammer default=1} +// [{ +// "baz": {"biz": "bar"}, +// "hammer": 1 +// }] +``` + + + +### Return a Field from All Objects Inside an Array + +If want to return a nested field inside an array, but for each object in the array, use the `[]` with no index. + +This will return each biz field as an array of values: + +```Thrift +find {foo: == "bar"} +return .faz[].biz +// [[5463, 73]] +``` + +### Bind Variables: Return Only Matched Array Elements + +If you are searching for nested values or objects nested in arrays, and you want to return only the match objects, use the bind syntax before the array in the query. The bound value is always an array, as multiple elements might match. 
+
+### Bind Variables: Return Only Matched Array Elements
+
+If you are searching for nested values or objects nested in arrays, and you want to return only the matched objects, use the bind syntax before the array in the query. The bound value is always an array, as multiple elements might match.
+
+Say you have a document like this:
+
+```json
+{
+  "_id": "a",
+  "foo": [
+    {"fiz": "bar", "val": 4}, {"fiz": "baz", "val": 7}
+  ],
+  "bar": [
+    {"fiz": "baz", "val": 9}
+  ]
+}
+
+```
+
+To return the object where `{"fiz": "bar", ...}` matches (but not the others), use a bind variable (`var::[...]`), like this:
+
+```Thrift
+find {foo: x::[{fiz: == "bar"}]}
+return x
+// [[{"fiz": "bar", "val": 4}]]
+```
+
+If instead you want to return the `val` field, add `.val` to the bind variable like this:
+
+```Thrift
+find {foo: x::[{fiz: == "bar"}]}
+return x.val
+// [[4]]
+```
+
+You can have any number of bind variables:
+
+```Thrift
+find {foo: x::[{fiz: == "bar"}], foo: y::[{fiz: == "baz"}]}
+return [x.val, y.val]
+// [[[4], [7]]]
+```
+
+The same query as the previous one, but returning an object:
+
+```Thrift
+find {foo: x::[{fiz: == "bar"}], foo: y::[{fiz: == "baz"}]}
+return {x: x.val, y: y.val}
+// [{"x": [4], "y": [7]}]
+```
+
+You can reuse bind variables in different clauses and they'll be combined:
+
+```Thrift
+find {foo: x::[{fiz: == "baz"}] || bar: x::[{fiz: == "baz"}]}
+return {x: x.val}
+// [{"x": [7, 9]}]
+```
+
+## Limit Clause
+
+To limit the number of results, use a limit clause at the end of the query.
+
+This limits the results to the first 10 found:
+
+```Thrift
+find {foo: == "bar"}
+return .baz
+limit 10
+```
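+The limit clause composes with the order and return clauses and is applied to the final, ordered result set. For example, `repl-tests/limit.noise` in this change set indexes five documents whose `A` values are 6, 6, 4, 4 and 1; ordering ascending and limiting to 3 returns the first three ordered values:
+
+```Thrift
+find {A: >= 1}
+order .A
+return .A
+limit 3
+// [1, 4, 4]
+```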
+
+## Grouping and Aggregation
+
+Noise includes ways to group rows together and aggregate values.
+
+For values you want to group together, use the `group(...)` function in the `return` clause.
+
+For values that are grouped together you can then perform aggregations on other values and return that aggregation. If a group function is used, all other fields must also be grouped or aggregated.
+
+The aggregation functions available are:
+
+|Function | Description|
+---------------|-------------
+|`array(...)`|Returns all values in the group as values in an array.|
+|`array_flat(...)`|Returns all values in the group as values in an array. However, if an array is encountered, it extracts all the values inside the array (and further nested arrays) and returns them as a single flat array.|
+|`avg(...)`|Averages numeric values in the group. If numeric values are in arrays, it extracts the values from the arrays. Even if arrays are nested in arrays, it extracts through all levels of nested arrays and averages them.|
+|`count()`| Returns the count of the grouped rows for each grouping. |
+|`concat(... [sep="..."])`| Returns all the strings in the group as a single concatenated string. Other value types are ignored. Use the optional `sep="..."` to specify a separator between string values.|
+|`max(...)`|Returns the maximum value in the group. See type ordering below to see how different types are considered. |
+|`max_array(...)`|Returns the maximum value in the group; if an array is encountered, the values inside the array are extracted and considered.|
+|`min(...)`|Returns the minimum value in the group. See type ordering below to see how different types are considered.|
+|`min_array(...)`|Returns the minimum value in the group; if an array is encountered, the values inside the array are extracted and considered.|
+|`sum(...)`|Sums numeric values in the group. If numeric values are in arrays, it extracts the values from the arrays. Even if arrays are nested in arrays, it extracts through all levels of nested arrays and sums them.|
+
+To perform grouping and/or aggregation, each field returned needs either a grouping or an aggregate function. It's an error to use one on some returned fields but not others.
+
+Groupings are ordered first on the leftmost `group(...)` function, then on the next leftmost, etc.
+
+You do not need to use `group(...)` to perform aggregates. If you have no `group(...)` defined, then all rows are aggregated into a single row.
+
+
+
+### Max/Min Type Ordering
+The ordering of types for `max(...)` and `min(...)` is as follows:
+
+null < false < true < number < string < array < object
+
+
+## Group/Aggregate Examples
+
+
+Let's say we have documents like this:
+
+```json
+{"foo":"group1", "baz": "a", "bar": 1}
+{"foo":"group1", "baz": "b", "bar": 2}
+{"foo":"group1", "baz": "c", "bar": 3}
+{"foo":"group1", "baz": "a", "bar": 1}
+{"foo":"group1", "baz": "b", "bar": 2}
+{"foo":"group1", "baz": "c", "bar": 3}
+{"foo":"group1", "baz": "a", "bar": 1}
+{"foo":"group1", "baz": "b", "bar": 2}
+{"foo":"group1", "baz": "c", "bar": 3}
+{"foo":"group1", "baz": "a", "bar": 1}
+{"foo":"group1", "baz": "b", "bar": 2}
+{"foo":"group1", "baz": "c", "bar": 3}
+{"foo":"group2", "baz": "a", "bar": "a"}
+{"foo":"group2", "baz": "a", "bar": "b"}
+{"foo":"group2", "baz": "b", "bar": "a"}
+{"foo":"group2", "baz": "b", "bar": "b"}
+{"foo":"group2", "baz": "a", "bar": "a"}
+{"foo":"group2", "baz": "a", "bar": "c"}
+{"foo":"group2", "baz": "b", "bar": "d"}
+{"foo":"group2", "baz": "b", "bar": "e"}
+{"foo":"group2", "baz": "a", "bar": "f"}
+{"foo":"group3", "baz": "a", "bar": "a"}
+{"foo":"group3", "bar": "b"}
+{"foo":"group3", "baz": "b", "bar": "a"}
+{"foo":"group3", "baz": "b", "bar": "b"}
+{"foo":"group3", "baz": "a", "bar": "a"}
+{"foo":"group3", "baz": "a" }
+{"foo":"group3", "baz": "b", "bar": "d"}
+{"foo":"group3", "baz": "b", "bar": "e"}
+{"foo":"group3", "baz": "a", "bar": "f"}
+```
+
+### Count
+
+Query:
+```
+find {foo: == "group1"}
+return {baz: group(.baz), count: count()}
+```
+Results:
+
+```json
+{"baz":"a","count":4}
+{"baz":"b","count":4}
+{"baz":"c","count":4}
+
+```
+
+### Sum
+
+Query:
+
+```
+find {foo: == "group1"}
+return {baz: group(.baz), bar: sum(.bar)}
+```
+
+Results:
+
+```json
+{"baz":"a","bar":4}
+{"baz":"b","bar":8}
+{"baz":"c","bar":12}
+
+```
+
+### Avg
+
+Query:
+
+```
+find {foo: == "group1"}
+return {avg: avg(.bar)}
+```
+
+Results:
+
+```json
+{"avg":2}
+```
+
+### Concat
+
+Query:
+
+```
+find {foo: == "group1"}
+return {baz: group(.baz), concat: concat(.baz sep="|")}
+```
+
+Results:
+
+```json
+{"baz":"a","concat":"a|a|a|a"}
+{"baz":"b","concat":"b|b|b|b"}
+{"baz":"c","concat":"c|c|c|c"}
+```
+
+### Max
+
+Query:
+
+```
+find {foo: == "group1"}
+return {max: max(.bar)}
+```
+Results:
+
+```json
+{"max":3}
+```
+
+Query:
+
+```
+find {foo: == "group1"}
+return {max: max(.baz)}
+```
+
+Results:
+
+```json
+{"max":"c"}
+```
+
+### Min
+
+Query:
+
+```
+find {foo: == "group1"}
+return {min: min(.bar)}
+```
+
+Results:
+
+```json
+{"min":1}
+```
+
+### Group Ordering
+
+Query:
+
+```
+find {foo: == "group2"}
+return [group(.baz order=asc), group(.bar order=desc), count()]
+```
+
+Results:
+
+```json
+["a","f",1]
+["a","c",1]
+["a","b",1]
+["a","a",2]
+["b","e",1]
+["b","d",1]
+["b","b",1]
+["b","a",1]
+```
+
+### Default Values
+
+Query:
+
+```
+find {foo: =="group2"}
+return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()];
+```
+
+Results:
+
+```json
+["a","f",1]
+["a","c",1]
+["a","b",1]
+["a","a",2]
+["b","e",1]
+["b","d",1]
+["b","b",1]
+["b","a",1]
+```
+
+### Arrays
+
+When performing aggregations on arrays, some functions will extract values out of the arrays (and
arrays nested in arrays). + +We have documents like this: + +```json +{"foo":"array1", "baz": ["a","b",["c","d",["e"]]]} +{"foo":"array1", "baz": ["f","g",["h","i"],"j"]} +{"foo":"array2", "baz": [1,2,[3,4,[5]]]} +{"foo":"array2", "baz": [6,7,[8,9],10]}; +``` + +Query: + +``` +find {foo: == "array1"} +return array(.baz) +``` + +Results: + +```json +[["f","g",["h","i"],"j"],["a","b",["c","d",["e"]]]] +``` + +Query: + +``` +find {foo: == "array1"} +return array_flat(.baz) +``` + +Results: + +```json +["f","g","h","i","j","a","b","c","d","e"] +``` + +Query: + +``` +find {foo: == "array1"} +return max(.baz) +``` + +Results: + +```json +["f","g",["h","i"],"j"] +``` + +Query: + +``` +find {foo: == "array1"} +return max_array(.baz) +``` + +Results: + +```json +"j" +``` + +Query: + +``` +find {foo: == "array1"} +return min_array(.baz) +``` + +Results: + +```json +"a" +``` + +Query: + +``` +find {foo: =="array2"} +return avg(.baz) +``` + +Results: + +```json +5.5 +``` + +Query: + +``` + +find {foo: =="array2"} +return sum(.baz) +``` + +Results: + +```json +55 +``` diff --git a/repl-tests/bind_var.noise b/repl-tests/bind_var.noise new file mode 100644 index 0000000..59e970f --- /dev/null +++ b/repl-tests/bind_var.noise @@ -0,0 +1,60 @@ +# Bind Variables testing. Feature is not complete + +drop target/tests/querytestbindvar; +create target/tests/querytestbindvar; + +add {"_id":"1", "bar": [{"a":"foo","v":1},{"a":"bar","v":2}]}; +"1" + + +find {bar: x::[{a: =="foo"}]} +return x ; +[ +[{"a":"foo","v":1}] +] + +find {bar: x::[{a: =="foo"}]} +return x.v ; +[ +[1] +] + +find {bar: x::[{a: =="foo" || a: =="bar"}]} +return x.v ; +[ +[1,2] +] + +find {bar: x::[{a: =="foo" || a: =="baz"}]} +return x.v ; +[ +[1] +] + +find {bar: x::[{a: =="foof" || a: =="bar"}]} +return x.v ; +[ +[2] +] + +find {bar: x::[{a: =="foo"}] || bar: x::[{a: =="bar"}]} +return x.v ; +[ +[1,2] +] + +find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="bar"}]} +return [x.v, y.v] ; +[ +[[1],[2]] +] + +find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="baz"}]} +return [x.v, y.v default=0] ; +[ +[[1],[0]] +] + +find {bar: x::[{a: =="foo"}] && bar: y::[{a: =="baz"}]} +return [x.v, y.v] ; +[] diff --git a/repl-tests/collation.noise b/repl-tests/collation.noise new file mode 100644 index 0000000..1488f4c --- /dev/null +++ b/repl-tests/collation.noise @@ -0,0 +1,141 @@ +# order expressions. 
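+# Documents containing every JSON type are added and then ordered ascending; the expected collation order is null < false < true < numbers < strings < arrays < objects.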
+ +drop target/tests/querytestjsoncollation; +create target/tests/querytestjsoncollation; + +add {"_id":"1", "foo":"coll", "bar": {}}; +"1" +add {"_id":"2", "foo":"coll", "bar": {"foo":"bar"}}; +"2" +add {"_id":"3", "foo":"coll", "bar": {"foo":"baz"}}; +"3" +add {"_id":"4", "foo":"coll", "bar": {"foo":"baz","bar":"baz"}}; +"4" +add {"_id":"5", "foo":"coll", "bar": {"foo":"baz","bar":"bar"}}; +"5" +add {"_id":"6", "foo":"coll", "bar": 1}; +"6" +add {"_id":"7", "foo":"coll", "bar": 1.00001}; +"7" +add {"_id":"8", "foo":"coll", "bar": 2.00001}; +"8" +add {"_id":"9", "foo":"coll", "bar": true}; +"9" +add {"_id":"10", "foo":"coll", "bar": false}; +"10" +add {"_id":"11", "foo":"coll", "bar": null}; +"11" +add {"_id":"12", "foo":"coll", "bar": []}; +"12" +add {"_id":"13", "foo":"coll", "bar": [true]}; +"13" +add {"_id":"14", "foo":"coll", "bar": [null]}; +"14" +add {"_id":"15", "foo":"coll", "bar": "string"}; +"15" +add {"_id":"16", "foo":"coll", "bar": "string2"}; +"16" +add {"_id":"17", "foo":"coll", "bar": "string3"}; +"17" + +find {foo: =="coll"} +order .bar asc +return .bar ; +[ +null, +false, +true, +1, +1.00001, +2.00001, +"string", +"string2", +"string3", +[], +[null], +[true], +{}, +{"bar":"bar","foo":"baz"}, +{"bar":"baz","foo":"baz"}, +{"foo":"bar"}, +{"foo":"baz"} +] + +find {foo: =="coll"} +order .bar asc +return .bar +limit 5; +[ +null, +false, +true, +1, +1.00001 +] + +find {foo: =="coll"} +order .bar asc +return .bar +limit 1; +[ +null +] + +add {"_id":"20", "foo":"coll2", "bar":[1,1,1]}; +"20" +add {"_id":"21", "foo":"coll2", "bar":[1,1,2]}; +"21" +add {"_id":"22", "foo":"coll2", "bar":[1,2,2]}; +"22" +add {"_id":"23", "foo":"coll2", "bar":[2,2,2]}; +"23" +add {"_id":"24", "foo":"coll2", "bar":[2,1,1]}; +"24" +add {"_id":"25", "foo":"coll2", "bar":[2,1,2]}; +"25" +add {"_id":"26", "foo":"coll2", "bar":[2,3,2]}; +"26" +add {"_id":"27", "foo":"coll2", "bar":[3,4,3]}; +"27" +add {"_id":"28", "foo":"coll2", "bar":[5,4,3]}; +"28" +add {"_id":"29", "foo":"coll2", "bar":[5,5,5]}; +"29" + +find {foo: =="coll2"} +order .bar[0] asc, .bar[1] desc, .bar[2] desc +return [.bar[0], .bar[1], .bar[2]] ; +[ +[1,2,2], +[1,1,2], +[1,1,1], +[2,3,2], +[2,2,2], +[2,1,2], +[2,1,1], +[3,4,3], +[5,5,5], +[5,4,3] +] + +find {foo: =="coll2"} +order .bar[0] asc, .bar[1] desc, .bar[2] desc +return [.bar[2], .bar[1], .bar[0]] ; +[ +[2,2,1], +[2,1,1], +[1,1,1], +[2,3,2], +[2,2,2], +[2,1,2], +[1,1,2], +[3,4,3], +[5,5,5], +[3,4,5] +] + +find {foo: =="group2"} +order .baz asc, .bar desc +return [.baz, .bar] +limit 2; +[] diff --git a/repl-tests/deletion_updates.noise b/repl-tests/deletion_updates.noise new file mode 100644 index 0000000..94a3a78 --- /dev/null +++ b/repl-tests/deletion_updates.noise @@ -0,0 +1,53 @@ +# update and deletion tests + +# add before opening +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +Write error: Index isn't open. 
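+# the index must be created and opened before any writes are accepted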
+ +drop target/tests/updatedeletion; +create target/tests/updatedeletion; + +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +"1" +add {"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}; +"2" +add {"_id":"3", "A":"Multi word sentence"}; +"3" +add {"_id":"4", "A":"%&%}{}@);€"}; +"4" +add {"_id":"5", "A":"word"}; +"5" + +# delete before committing +del 5; +Write error: Attempt to delete doc with same _id added earlier + +# delete what doesn't exist +del 6; +not found + +commit; + +del 5; +ok + +add {"_id":"5", "A":"word"}; +Write error: Attempt to insert multiple docs with same _id + +# add again without committing +add {"_id":"5", "A":"word"}; +Write error: Attempt to insert multiple docs with same _id + +commit; + +# add existing document +add {"_id":"5", "A":"wassup!"}; +"5" + +find {A: == "wassup!"}; +[ +"5" +] + +find {A: == "word"}; +[] diff --git a/repl-tests/group.noise b/repl-tests/group.noise new file mode 100644 index 0000000..57a4a05 --- /dev/null +++ b/repl-tests/group.noise @@ -0,0 +1,248 @@ +# Group and aggregate tests + +drop target/tests/querytestgroup; +create target/tests/querytestgroup; + +add {"_id":"1", "foo":"group", "baz": "a", "bar": 1}; +"1" +add {"_id":"2", "foo":"group", "baz": "b", "bar": 2}; +"2" +add {"_id":"3", "foo":"group", "baz": "c", "bar": 3}; +"3" +add {"_id":"4", "foo":"group", "baz": "a", "bar": 1}; +"4" +add {"_id":"5", "foo":"group", "baz": "b", "bar": 2}; +"5" +add {"_id":"6", "foo":"group", "baz": "c", "bar": 3}; +"6" +add {"_id":"7", "foo":"group", "baz": "a", "bar": 1}; +"7" +add {"_id":"8", "foo":"group", "baz": "b", "bar": 2}; +"8" +add {"_id":"9", "foo":"group", "baz": "c", "bar": 3}; +"9" +add {"_id":"10", "foo":"group", "baz": "a", "bar": 1}; +"10" +add {"_id":"11", "foo":"group", "baz": "b", "bar": 2}; +"11" +add {"_id":"12", "foo":"group", "baz": "c", "bar": 3}; +"12" + +find {foo: =="group"} +return {baz: group(.baz), bar: sum(.bar)}; +[ +{"baz":"a","bar":4}, +{"baz":"b","bar":8}, +{"baz":"c","bar":12} +] + +find {foo: =="group"} +return {bar: sum(.bar)}; +[ +{"bar":24} +] + +find {foo: =="group"} +return {bar: avg(.bar)}; +[ +{"bar":2} +] + +find {foo: =="group"} +return {baz: group(.baz), concat: concat(.baz sep="|")}; +[ +{"baz":"a","concat":"a|a|a|a"}, +{"baz":"b","concat":"b|b|b|b"}, +{"baz":"c","concat":"c|c|c|c"} +] + +find {foo: =="group"} +return {baz: group(.baz), array: array(.baz)}; +[ +{"baz":"a","array":["a","a","a","a"]}, +{"baz":"b","array":["b","b","b","b"]}, +{"baz":"c","array":["c","c","c","c"]} +] + +find {foo: =="group"} +return {baz: group(.baz), count: count()}; +[ +{"baz":"a","count":4}, +{"baz":"b","count":4}, +{"baz":"c","count":4} +] + +find {foo: =="group"} +return {max: max(.bar)}; +[ +{"max":3} +] + +find {foo: =="group"} +return {min: min(.bar)}; +[ +{"min":1} +] + +find {foo: =="group"} +return {max: max(.baz)}; +[ +{"max":"c"} +] + +add {"_id":"10", "foo":"group2", "baz": "a", "bar": "a"}; +"10" +add {"_id":"11", "foo":"group2", "baz": "a", "bar": "b"}; +"11" +add {"_id":"12", "foo":"group2", "baz": "b", "bar": "a"}; +"12" +add {"_id":"13", "foo":"group2", "baz": "b", "bar": "b"}; +"13" +add {"_id":"14", "foo":"group2", "baz": "a", "bar": "a"}; +"14" +add {"_id":"15", "foo":"group2", "baz": "a", "bar": "c"}; +"15" +add {"_id":"16", "foo":"group2", "baz": "b", "bar": "d"}; +"16" +add {"_id":"17", "foo":"group2", "baz": "b", "bar": "e"}; +"17" +add {"_id":"18", "foo":"group2", "baz": "a", "bar": "f"}; +"18" + +find {foo: =="group2"} +return [group(.baz order=asc), group(.bar 
order=desc), count()]; +[ +["a","f",1], +["a","c",1], +["a","b",1], +["a","a",2], +["b","e",1], +["b","d",1], +["b","b",1], +["b","a",1] +] + +find {foo: =="group2"} +return [group(.baz order=asc), group(.bar order=desc), count()] +limit 2; +[ +["a","f",1], +["a","c",1] +] + +add {"_id":"1", "foo":"group3", "baz": "a", "bar": "a"}; +"1" +add {"_id":"2", "foo":"group3", "bar": "b"}; +"2" +add {"_id":"3", "foo":"group3", "baz": "b", "bar": "a"}; +"3" +add {"_id":"4", "foo":"group3", "baz": "b", "bar": "b"}; +"4" +add {"_id":"5", "foo":"group3", "baz": "a", "bar": "a"}; +"5" +add {"_id":"6", "foo":"group3", "baz": "a" }; +"6" +add {"_id":"7", "foo":"group3", "baz": "b", "bar": "d"}; +"7" +add {"_id":"8", "foo":"group3", "baz": "b", "bar": "e"}; +"8" +add {"_id":"9", "foo":"group3", "baz": "a", "bar": "f"}; +"9" + +find {foo: =="group2"} +return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()]; +[ +["a","f",1], +["a","c",1], +["a","b",1], +["a","a",2], +["b","e",1], +["b","d",1], +["b","b",1], +["b","a",1] +] + +add {"_id":"1", "foo":"array", "baz": ["a","b",["c","d",["e"]]]}; +"1" +add {"_id":"2", "foo":"array", "baz": ["f","g",["h","i"],"j"]}; +"2" + +find {foo: =="array"} +return array(.baz); +[ +[["f","g",["h","i"],"j"],["a","b",["c","d",["e"]]]] +] + +find {foo: =="array"} +return array_flat(.baz); +[ +["f","g","h","i","j","a","b","c","d","e"] +] + +find {foo: =="array"} +return max(.baz); +[ +["f","g",["h","i"],"j"] +] + +find {foo: =="array"} +return max_array(.baz); +[ +"j" +] + +find {foo: =="array"} +return min_array(.baz); +[ +"a" +] + +add {"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}; +"1" +add {"_id":"2", "foo":"array", "baz": [6,7,[8,9],10]}; +"2" + + +find {foo: =="array"} +return avg(.baz); +[ +5.5 +] + +find {foo: =="array"} +return sum(.baz); +[ +55 +] + +add {"_id":"1", "foo":"array", "baz": []}; +"1" +add {"_id":"2", "foo":"array", "baz": []}; +"2" + +commit; + +find {foo: =="array"} +return avg(.baz); +[ +null +] + +find {foo: =="array"} +return sum(.baz); +[ +0 +] + +find {foo: =="array"} +return min_array(.baz); +[ +[] +] + +find {foo: =="array"} +return max_array(.baz); +[ +[] +] + diff --git a/repl-tests/limit.noise b/repl-tests/limit.noise new file mode 100644 index 0000000..15c7b66 --- /dev/null +++ b/repl-tests/limit.noise @@ -0,0 +1,132 @@ +# limit clause tests + +drop target/tests/querytestlimit; +create target/tests/querytestlimit; + + +add {"_id":"1", "A": 6}; +"1" +add {"_id":"2", "A": 6}; +"2" +add {"_id":"3", "A": 4}; +"3" +add {"_id":"4", "A": 4}; +"4" +add {"_id":"5", "A": 1}; +"5" + +# "limit" tests with find clause only + +find {A: >= 1}; +[ +"1", +"2", +"3", +"4", +"5" +] + +find {A: >= 1} +limit 1; +[ +"1" +] + +find {A: >= 1} +limit 3; +[ +"1", +"2", +"3" +] + +find {A: < 5}; +[ +"3", +"4", +"5" +] + +find {A: < 5} +limit 2; +[ +"3", +"4" +] + +# "limit" tests with ordering + +find {A: > 3} +order .A; +[ +"4", +"3", +"2", +"1" +] + +find {A: > 3} +order .A +limit 1; +[ +"3" +] + +# "limit" tests with return + +find {A: >= 1} +return .; +[ +{"A":6,"_id":"1"}, +{"A":6,"_id":"2"}, +{"A":4,"_id":"3"}, +{"A":4,"_id":"4"}, +{"A":1,"_id":"5"} +] + +find {A: >= 1} +return . 
+limit 1; +[ +{"A":6,"_id":"1"} +] + +find {A: >= 1} +return .A; +[ +6, +6, +4, +4, +1 +] + +find {A: >= 1} +return .A +limit 1; +[ +6 +] + +# "limit" tests with return and ordering + +find {A: >= 1} +order .A +return .A; +[ +1, +4, +4, +6, +6 +] + +find {A: >= 1} +order .A +return .A +limit 3; +[ +1, +4, +4 +] diff --git a/repl-tests/not.noise b/repl-tests/not.noise new file mode 100644 index 0000000..3569a82 --- /dev/null +++ b/repl-tests/not.noise @@ -0,0 +1,168 @@ +# Logical not tests + +drop target/tests/querytestnot; +create target/tests/querytestnot; + + +add {"_id":"1", "alwaystrue": true, "bar": "fox"}; +"1" +add {"_id":"2", "alwaystrue": true, "bar": "quick fox"}; +"2" +add {"_id":"3", "alwaystrue": true, "bar": "quick brown fox"}; +"3" +add {"_id":"4", "alwaystrue": true, "bar": ["fox"]}; +"4" +add {"_id":"5", "alwaystrue": true, "bar": ["quick fox"]}; +"5" +add {"_id":"6", "alwaystrue": true, "bar": ["quick brown fox"]}; +"6" +add {"_id":"7", "alwaystrue": true, "baz": ["fox"]}; +"7" +add {"_id":"8", "alwaystrue": true, "baz": ["quick","fox"]}; +"8" +add {"_id":"9", "alwaystrue": true, "baz": ["quick","brown","fox"]}; +"9" +add {"_id":"10", "alwaystrue": true, "baz": [["quick"],["brown"],["fox"]]}; +"10" +add {"_id":"11", "alwaystrue": true, "baz": [["brown"],["fox"]]}; +"11" +add {"_id":"12", "alwaystrue": true, "baz": [["fox"]]}; +"12" + +find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")} +return ._id ; +[ +"1" +] + +find {(bar: ~="fox" || bar: ~="brown") && !(bar: ~="quick")} +return ._id ; +[ +"1" +] + +find {bar: ~="fox" || bar: ~="brown"} && !{bar: ~="quick"} +return ._id ; +[ +"1" +] + +find {bar: [(~="fox" || ~="brown") && !~="quick"]} +return ._id ; +[ +"4" +] + +find {bar: [(~="fox" || ~="brown") && !(~="quick")]} +return ._id ; +[ +"4" +] + +find {bar: [~="fox" || ~="brown"] && bar: ![~="quick"]} +return ._id ; +[ +"4" +] + +find {baz: [(~="fox" || ~="brown") && !~="quick"]} +return ._id ; +[ +"7", +"8", +"9" +] + +find {baz: [(~="fox" || ~="brown") && !(~="quick")]} +return ._id ; +[ +"7", +"8", +"9" +] + +find {baz: [~="fox" || ~="brown"] && baz: ![~="quick"]} +return ._id ; +[ +"7" +] + +find {baz: [~="fox" || ~="brown"] && baz: [!~="fox"]} +return ._id ; +[ +"8", +"9" +] + +find {baz: [~="fox" || ~="brown"] && baz: [!="fox"]} +return ._id ; +[ +"8", +"9" +] + +# not a field that doesn't exist. +find {baz: [~="fox" || ~="brown"] && missing: ![~="fox"]} +return ._id ; +[ +"7", +"8", +"9" +] + +find {baz: [[~="brown"]] || baz: [[!~="fox"]]} +return ._id ; +[ +"1", +"2", +"3", +"4", +"5", +"6", +"7", +"8", +"9", +"10", +"11" +] + +find {baz: [[~="brown"]] && baz: [[!~="fox"]]} +return ._id ; +[ +"10", +"11" +] + +find {_id: == "12" && baz: [[!="fox"]]} +return ._id ; +[] + +# Test for unallowable expressions + +find !{baz: [~="fox"]} +return ._id ; +Parse error: query cannot be made up of only logical not. Must have at least one match clause not negated. + +find {!(bar: ~="quick" || bar: [~="quick"] || baz: [~="quick"] || baz: [[~="quick"]])} +return ._id ; +Parse error: query cannot be made up of only logical not. Must have at least one match clause not negated. + +find !{baz: ~="fox"} && !{baz: =="foo"} +return ._id ; +Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed. + +find {foo: =="bar"} && !{baz: !~="fox"} +return ._id ; +Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed. 
+ +# Workaround for unallowable expressions + +find {!(bar: ~="quick" || bar: [~="quick"] || baz: [~="quick"] || baz: [[~="quick"]]), alwaystrue: == true} +return ._id ; +[ +"1", +"4", +"7", +"11", +"12" +] diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise new file mode 100644 index 0000000..39106f5 --- /dev/null +++ b/repl-tests/query_basic.noise @@ -0,0 +1,336 @@ +# Some basic tests + +drop target/tests/querytestdbbasic1; +create target/tests/querytestdbbasic1; + + +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +"1" +add {"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}; +"2" +add {"_id":"3", "A":"Multi word sentence"}; +"3" +add {"_id":"4", "A":"%&%}{}@);€"}; +"4" +add {"_id":"5", "A":"{}€52 deeply \\n\\v "}; +"5" +add {"_id":"6", "A":[{"B":"B3"},{"B": "B3"}]}; +"6" +add {"_id":"7", "A":[{"B":"B3"},{"B": "B4"}]}; +"7" +add {"_id":"8", "A":["A1", "A1"]}; +"8" +add {"_id":"9", "A":["A1", "A2"]}; +"9" +add {"_id":"10", "A":"a bunch of words in this sentence"}; +"10" +add {"_id":"11", "A":""}; +"11" +add {"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}; +"12" +add {"_id":"13", "A":["foo",1,true,false,null,{},[]]}; +"13" +add {"_id":"14", "A":{"B":true}}; +"14" + + +# Exact match object fields in arrays + +find {A:[{B: =="B2", C: [{D: =="D"} ]}]}; +[ +"2" +] + +find {A:[{B: == "B2", C: == "C2"}]}; +[ +"1" +] + +find {A:[{B: == "B2", C: == "C8"}]}; +[] + +find {A:[{B: == "b1", C: == "C2"}]}; +[ +"1", +"2" +] + +# exact match stuff in fields + +find {A: == "Multi word sentence"}; +[ +"3" +] + +find {A: == "%&%}{}@);€"}; +[ +"4" +] + +find {A: == "{}€52 deeply \\n\\v "}; +[ +"5" +] + +find {A:[{C: == "C2"}]}; +[ +"1", +"2" +] + +find {A:[{B: == "B3" || B: == "B4"}]}; +[ +"6", +"7" +] + +# exact match strings in arrays + + + +find {A:[ == "A1" || == "A2"]}; +[ +"8", +"9" +] + +find {A:[ == "A1" && == "A" || == "A1"]}; +[ +"8", +"9" +] + +find {A:[=="A" || == "A1" && == "A"]}; +[] + +# full text search fields + + + +find {A: ~= "Multi"}; +[ +"3" +] + +# phrase match + +find {A: ~= "multi word"}; +[ +"3" +] + +find {A: ~= "word sentence"}; +[ +"3" +] + +find {A: ~= "sentence word"}; +[] + +# proximity match. Number indicates how many word away terms can be. 
+ +find {A: ~1= "multi sentence"}; +[ +"3" +] + +find {A: ~4= "a sentence"}; +[] + +find {A: ~5= "a sentence"}; +[ +"10" +] + +find {A: ~4= "a bunch of words sentence"}; +[] + +find {A: ~5= "a bunch of words sentence"}; +[ +"10" +] + +find {A: ~10= "a bunch of words sentence"}; +[ +"10" +] + +find {A: == ""}; +[ +"11" +] + +# test return json elements + +find {A:[ == "1"]} +return .A ; +[ +["1","2","3","4","5","6","7","8","9","10","11","12"] +] + +find {A:[ == "2"]} +return .A[0] ; +[ +"1" +] + +find {A:[ == "2"]} +return [.A[0], ._id] ; +[ +["1","12"] +] + +find {A:[ == "2"]} +return {foo:.A[0], bar: ._id} ; +[ +{"foo":"1","bar":"12"} +] + +find {A:[ == "foo"]} +return .A ; +[ +["foo",1,true,false,null,{},[]] +] + +# returning null when missing + +find {A:[ == "foo"]} +return .B ; +[ +null +] + +# returning default values when missing + +find {A:[ == "foo"]} +return .B default={foo:"foo"}; +[ +{"foo":"foo"} +] + +find {A:[ == "foo"]} +return .B default={}; +[ +{} +] + +find {A:[ == "foo"]} +return {foo: .B default={bar:"bar"}}; +[ +{"foo":{"bar":"bar"}} +] + +find {A:[ == "foo"]} +return .B default=0; +[ +0 +] + +find {A:[ == "foo"]} +return .B default=1; +[ +1 +] + +find {A:[ == "foo"]} +return .B default=-1; +[ +-1 +] + +# return every kind of element + +find {A:[ == "foo"]} +return {"a":"a","b":1.123,"true":true,"false":false,"null":null,array:[],object:{}}; +[ +{"a":"a","b":1.123,"true":true,"false":false,"null":null,"array":[],"object":{}} +] + +find {_id: =="14"} return .; +[ +{"A":{"B":true},"_id":"14"} +] + +# return everying in deeply nested arrays + +add {"_id":"15", "a":[{"b":[{"c":1},{"c":2},{"c":3}]},{"b":[{"c":4},{"c":5},{"c":6}]}]}; +"15" + +find {"_id": =="15"} +return .a[].b[].c; +[ +[[1,2,3],[4,5,6]] +] + +# check what happens when only some key paths exist + +add {"_id":"16", "type": "nested", "a":[{"b":[{"b":1},{"c":2},{"b":3}]},{"b":[{"c":4},{"c":5},{"c":6}]}]}; +"16" +find {"_id": =="16"} +return .a[].b[].c; +[ +[[2],[4,5,6]] +] + +# prefix bug + +add {"_id":"1", "prefix": true, "pre": "foo"}; +"1" + +find {prefix: == true} +return .pre; +[ +"foo" +] + +find {prefix: == true} +return .pref; +[ +null +] + +# all docs + +find {}; +[ +"1", +"10", +"11", +"12", +"13", +"14", +"15", +"16", +"2", +"3", +"4", +"5", +"6", +"7", +"8", +"9" +] + +find {} +order score() +return [._id, score()]; +[ +["9",1], +["8",1], +["7",1], +["6",1], +["5",1], +["4",1], +["3",1], +["2",1], +["16",1], +["15",1], +["14",1], +["13",1], +["12",1], +["11",1], +["10",1], +["1",1] +] + + diff --git a/repl-tests/ranges.noise b/repl-tests/ranges.noise new file mode 100644 index 0000000..ba75e81 --- /dev/null +++ b/repl-tests/ranges.noise @@ -0,0 +1,260 @@ +# Test for less and greater than + +drop target/tests/querytestranges; +create target/tests/querytestranges; + + +add {"_id":"one", "A":12}; +"one" +add {"_id":"two", "A":12}; +"two" +add {"_id":"three", "numberarray": [30, 60, 90]}; +"three" +add {"_id":"four", "A":-3}; +"four" +add {"_id":"five", "A":35}; +"five" +add {"_id":"six", "A":true}; +"six" +add {"_id":"seven", "A":false}; +"seven" +add {"_id":"eight", "A":null}; +"eight" +add {"_id":"nine", "boolarray":[true, true]}; +"nine" +add {"_id":"ten", "boolarray":[false, true]}; +"ten" + + +# Exact match number + +find {A: ==12}; +[ +"one", +"two" +] + +find {numberarray: [==60]}; +[ +"three" +] + +del one; +ok +find {A: ==12}; +[ +"two" +] + + +# Greater than (equal) on number + +find {A: >20}; +[ +"five" +] + +find {A: >-5}; +[ +"two", +"four", +"five" +] + +find {numberarray: [>40]}; +[ 
+"three" +] + +find {A: >35}; +[] + +find {A: >=35}; +[ +"five" +] + + +# Less than (equal) on number + +find {A: <-1}; +[ +"four" +] + +find {A: <20}; +[ +"two", +"four" +] + +find {numberarray: [<70]}; +[ +"three" +] + +find {A: <-3}; +[] + +find {A: <=-3}; +[ +"four" +] + + +# Range on number + +find {A: >10, A: <20}; +[ +"two" +] + +find {A: >-10, A: <20}; +[ +"two", +"four" +] + + +# Exact match boolean + +find {A: ==true}; +[ +"six" +] + +find {A: ==false}; +[ +"seven" +] + +find {A: ==null}; +[ +"eight" +] + +find {boolarray: [==true]}; +[ +"nine", +"ten" +] + +# scoring + +find {A: ==true} +return score(); +[ +1 +] + +find {A: ==false} +return score(); +[ +1 +] + +find {A: ==null} +return score(); +[ +1 +] + +find {boolarray: [==true]} +return score(); +[ +1, +1 +] + + + +find {A: >10, A: <20} +return score(); +[ +1 +] + +find {A: >-10, A: <20} +return score(); +[ +1, +1 +] + +find {A: <-1} +return score(); +[ +1 +] + +find {A: <20} +return score(); +[ +1, +1 +] + +find {numberarray: [<70]} +return score(); +[ +1 +] + +find {A: <-3} +return score(); +[] + +find {A: <=-3} +return score(); +[ +1 +] + + + +find {A: >20} +return score(); +[ +1 +] + +find {A: >-5} +return score(); +[ +1, +1, +1 +] + +find {numberarray: [>40]} +return score(); +[ +1 +] + +find {A: >35} +return score(); +[] + +find {A: >=35 || NotAField: ==50} +return score(); +[ +0.25 +] + +find {A: >=35 && NotAField: ==50} +return score(); +[] + + +find {A: ==12} +return score(); +[ +1 +] + +find {numberarray: [==60]} +return score(); +[ +1 +] diff --git a/repl-tests/scoring.noise b/repl-tests/scoring.noise new file mode 100644 index 0000000..74622de --- /dev/null +++ b/repl-tests/scoring.noise @@ -0,0 +1,197 @@ +# Relevancy Scoring tests + +drop target/tests/querytestscore; +create target/tests/querytestscore; + + +add {"_id":"1", "bar": "fox"}; +"1" +add {"_id":"2", "bar": "quick fox"}; +"2" +add {"_id":"3", "bar": "quick brown fox"}; +"3" + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +order score() desc +return ._id ; +[ +"3", +"2", +"1" +] + +find {bar: ~="quick brown fox"} +order score() desc +return ._id ; +[ +"3" +] + +find {bar: ~="quick brown fox"} +return score() ; +[ +0.05966803431510925 +] + +find {bar: ~="quick brown fox"^2} +return score() ; +[ +0.1193360686302185 +] + +find {bar: =="quick brown fox"} +return score() ; +[ +1 +] + +find {bar: =="quick brown fox"^2} +return score() ; +[ +1 +] + +find {bar: ~2="quick brown fox"} +return score() ; +[ +0.0916677787899971 +] + +find {bar: ~2="quick brown fox"^2} +return score() ; +[ +0.1833355575799942 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find ({bar: ~="fox" || bar: ~="brown" || bar: ~="quick"})^2 +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"}^2 +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar: ~="fox"^2 || (bar: ~="brown" || bar: ~="quick")^2 } +order score() desc +return score() ; +[ 
+1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar: ~="fox"}^2 || {bar: ~="brown" || bar: ~="quick"}^2 +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +add {"_id":"4", "bar": ["fox"]}; +"4" +add {"_id":"5", "bar": ["quick fox"]}; +"5" +add {"_id":"6", "bar": ["quick brown fox"]}; +"6" + +find {bar:[ ~="fox" || ~="brown" || ~="quick"]} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar:[~="fox" || ~="brown" || ~="quick"]^2} +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar:[ ~="fox" || ~="brown" || ~="quick"]} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar:[~="fox"]^2 || bar:[~="brown" || ~="quick"]^2} +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar:[ ~="fox" || ~="brown" || ~="quick"]} +order score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar:[~="fox"]^2 || (bar:[~="brown"] || bar:[~="quick"])^2} +order score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] diff --git a/src/aggregates.rs b/src/aggregates.rs new file mode 100644 index 0000000..8a7e962 --- /dev/null +++ b/src/aggregates.rs @@ -0,0 +1,312 @@ + +use std::cmp::Ordering; + +use json_value::JsonValue; + +#[derive(PartialEq, Eq, Clone)] +pub enum AggregateFun { + GroupAsc, + GroupDesc, + Sum, + Max, + MaxArray, + Min, + MinArray, + Array, + ArrayFlat, + Concat, + Avg, + Count, +} + +pub struct AggregateFunImpls { + // Initalizes for a computing the aggregate action (optional) + pub init: Option JsonValue>, + + // The actual aggregate action function + pub action: fn(&mut JsonValue, JsonValue, &JsonValue), + + // extracts the final aggregate value (optional) + pub extract: Option, +} + +impl AggregateFun { + pub fn get_fun_impls(&self) -> AggregateFunImpls { + match self { + &AggregateFun::GroupAsc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::GroupDesc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::Sum => { + AggregateFunImpls { + init: Some(AggregateFun::sum_init), + action: AggregateFun::sum, + extract: None, + } + } + &AggregateFun::Max => { + AggregateFunImpls { + init: None, + action: AggregateFun::max, + extract: None, + } + } + &AggregateFun::Min => { + AggregateFunImpls { + init: None, + action: AggregateFun::min, + extract: None, + } + } + &AggregateFun::MaxArray => { + AggregateFunImpls { + init: Some(AggregateFun::max_array_init), + action: AggregateFun::max_array, + extract: None, + } + } + &AggregateFun::MinArray => { + AggregateFunImpls { + init: Some(AggregateFun::min_array_init), + action: AggregateFun::min_array, + extract: None, + } + } + &AggregateFun::Array => { + AggregateFunImpls { + init: Some(AggregateFun::array_init), + action: AggregateFun::array, + extract: None, + } + } + &AggregateFun::ArrayFlat => { + AggregateFunImpls { + init: Some(AggregateFun::array_flat_init), + action: AggregateFun::array_flat, + extract: None, + } + } + &AggregateFun::Concat => { + AggregateFunImpls { + init: Some(AggregateFun::concat_init), + 
action: AggregateFun::concat, + extract: None, + } + } + &AggregateFun::Avg => { + AggregateFunImpls { + init: Some(AggregateFun::avg_init), + action: AggregateFun::avg, + extract: Some(AggregateFun::avg_final), + } + } + &AggregateFun::Count => { + AggregateFunImpls { + init: Some(AggregateFun::count_init), + action: AggregateFun::count, + extract: None, + } + } + } + } + + fn sum_init(existing: JsonValue) -> JsonValue { + let mut base = JsonValue::Number(0.0); + AggregateFun::sum(&mut base, existing, &JsonValue::Null); + base + } + + fn sum(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + match new { + JsonValue::Number(new) => { + if let &mut JsonValue::Number(ref mut existing) = existing { + *existing += new; + } + } + JsonValue::Array(vec) => { + for v in vec { + AggregateFun::sum(existing, v, _user_arg); + } + } + _ => (), + } + } + + fn max(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing < new { + *existing = new + } + } + + fn min(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing > new { + *existing = new + } + } + + fn max_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. + let mut val = JsonValue::Array(vec![]); + AggregateFun::max_array(&mut val, existing, &JsonValue::Null); + val + } + + fn max_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::max_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Less { + *existing = new; + } + } + } + + fn min_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. 
+ let mut val = JsonValue::Array(vec![]); + AggregateFun::min_array(&mut val, existing, &JsonValue::Null); + val + } + + fn min_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::min_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Greater { + *existing = new; + } + } + } + + fn array_init(existing: JsonValue) -> JsonValue { + JsonValue::Array(vec![existing]) + } + + fn array(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + + fn array_flat_init(existing: JsonValue) -> JsonValue { + let mut new = JsonValue::Array(vec![]); + AggregateFun::array_flat(&mut new, existing, &JsonValue::Null); + new + } + + fn array_flat(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::array_flat(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + } + + fn concat_init(existing: JsonValue) -> JsonValue { + if let JsonValue::String(_) = existing { + existing + } else { + JsonValue::String(String::new()) + } + } + + fn concat(existing: &mut JsonValue, new: JsonValue, user_arg: &JsonValue) { + if let &mut JsonValue::String(ref mut existing) = existing { + if let JsonValue::String(new) = new { + if let &JsonValue::String(ref user_arg) = user_arg { + existing.push_str(&user_arg); + existing.push_str(&new); + } + } + } + } + + fn avg_init(existing: JsonValue) -> JsonValue { + if let JsonValue::Number(_) = existing { + JsonValue::Array(vec![existing, JsonValue::Number(1.0)]) + } else if let JsonValue::Array(_) = existing { + let mut avg = JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]); + AggregateFun::avg(&mut avg, existing, &JsonValue::Null); + avg + } else { + JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]) + } + } + + fn avg(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Number(new) = new { + if let &mut JsonValue::Array(ref mut array) = existing { + let mut avg = if let &JsonValue::Number(ref avg) = &array[0] { + *avg + } else { + // can't happen but compiler need this here + 1.0 + }; + + let mut count = if let &JsonValue::Number(ref count) = &array[1] { + *count + } else { + // can't happen but compiler need this here + 1.0 + }; + + avg = (avg * count + new) / (count + 1.0); + count += 1.0; + array[0] = JsonValue::Number(avg); + array[1] = JsonValue::Number(count); + } + } else if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::avg(existing, v, _user_arg); + } + } + } + + fn avg_final(existing: &mut JsonValue) { + let json = if let &mut JsonValue::Array(ref mut array) = existing { + if let &JsonValue::Number(ref avg) = &array[0] { + if let &JsonValue::Number(ref count) = &array[1] { + if *count == 0.0 { + JsonValue::Null + } else { + JsonValue::Number(*avg) + } + } else { + // can't happen but compiler need this here + JsonValue::Null + } + } else { + // can't happen but compiler need this here + JsonValue::Null + } + } else { + // can't happen but compiler need this here + JsonValue::Null + }; + *existing = json + } + + fn count_init(_existing: JsonValue) -> JsonValue { + JsonValue::Number(1.0) + } + + fn 
count(existing: &mut JsonValue, _: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Number(ref mut num) = existing { + *num += 1.0; + } + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..cb59532 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,85 @@ +extern crate rocksdb; + +use std::{error, fmt}; +use std::num::ParseIntError; +use std::num::ParseFloatError; +use std::io; + + +#[derive(Debug)] +pub enum Error { + Parse(String), + Shred(String), + Rocks(rocksdb::Error), + Write(String), + Io(io::Error), +} + +impl PartialEq for Error { + fn eq(&self, other: &Error) -> bool { + self == other + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Parse(ref description) => description, + Error::Shred(ref description) => description, + // XXX vmx 2016-11-07: It should be fixed on the RocksDB wrapper + // that it has the std::error:Error implemented and hence + // and err.description() + Error::Rocks(_) => "This is an rocksdb error", + Error::Write(ref description) => description, + Error::Io(ref err) => err.description(), + } + } + + fn cause(&self) -> Option<&error::Error> { + match *self { + Error::Parse(_) => None, + Error::Shred(_) => None, + // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be + // patched to be based on the std::error::Error trait + Error::Rocks(_) => None, + Error::Write(_) => None, + Error::Io(ref err) => Some(err as &error::Error), + } + } +} + +impl From for Error { + fn from(err: rocksdb::Error) -> Error { + Error::Rocks(err) + } +} + +impl From for Error { + fn from(err: ParseIntError) -> Error { + Error::Parse(err.to_string()) + } +} + +impl From for Error { + fn from(err: ParseFloatError) -> Error { + Error::Parse(err.to_string()) + } +} + +impl From for Error { + fn from(err: io::Error) -> Error { + Error::Io(err) + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Parse(ref err) => write!(f, "Parse error: {}", err), + Error::Shred(ref err) => write!(f, "Shred error: {}", err), + Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), + Error::Write(ref err) => write!(f, "Write error: {}", err), + Error::Io(ref err) => write!(f, "Io error: {}", err), + } + } +} diff --git a/src/filters.rs b/src/filters.rs new file mode 100644 index 0000000..83f19d4 --- /dev/null +++ b/src/filters.rs @@ -0,0 +1,1139 @@ +use std::{mem, str}; +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::collections::HashSet; + +use error::Error; +use key_builder::KeyBuilder; +use query::{DocResult, QueryScoringInfo}; +use json_value::JsonValue; +use snapshot::{Snapshot, DocResultIterator, Scorer, JsonFetcher, AllDocsIterator}; +use rocksdb::{self, DBIterator, IteratorMode}; + +pub trait QueryRuntimeFilter { + fn first_result(&mut self, start: &DocResult) -> Option; + fn next_result(&mut self) -> Option; + fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo); + + /// returns error is a double negation is detected + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error>; + + /// return true if filter or all subfilters are NotFilters + fn is_all_not(&self) -> bool; +} + + +#[derive(PartialEq)] +pub enum RangeOperator { + Inclusive(f64), + Exclusive(f64), + // For booleans and null only exact match makes sense, hence no inclusive/exclusive + // boundaries are needed + True, + False, + Null, +} + + + +pub struct AllDocsFilter { + iter: AllDocsIterator, +} + +impl AllDocsFilter { + 
pub fn new(snapshot: &Snapshot) -> AllDocsFilter { + AllDocsFilter { iter: snapshot.new_all_docs_iterator() } + } +} + +impl QueryRuntimeFilter for AllDocsFilter { + fn first_result(&mut self, _start: &DocResult) -> Option { + self.next_result() + } + + fn next_result(&mut self) -> Option { + if let Some(mut dr) = self.iter.next() { + dr.add_score(1, 1.0); + Some(dr) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + +pub struct StemmedWordFilter { + iter: DocResultIterator, + scorer: Scorer, +} + +impl StemmedWordFilter { + pub fn new(snapshot: &Snapshot, + stemmed_word: &str, + kb: &KeyBuilder, + boost: f32) + -> StemmedWordFilter { + StemmedWordFilter { + iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), + scorer: snapshot.new_scorer(stemmed_word, kb, boost), + } + } +} + +impl QueryRuntimeFilter for StemmedWordFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + self.iter.advance_gte(start); + self.next_result() + } + + fn next_result(&mut self) -> Option { + if let Some((mut dr, pos)) = self.iter.next() { + if self.scorer.should_score() { + let count = pos.positions().len(); + self.scorer.add_match_score(count as u32, &mut dr); + } + Some(dr) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.scorer.init(&mut qsi); + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + +/// This is not a QueryRuntimeFilter but it imitates one. 
Instead of returning just a DocResult +/// it also return a vector of word positions, each being a instance of the word occurance +pub struct StemmedWordPosFilter { + iter: DocResultIterator, + scorer: Scorer, +} + +impl StemmedWordPosFilter { + pub fn new(snapshot: &Snapshot, + stemmed_word: &str, + kb: &KeyBuilder, + boost: f32) + -> StemmedWordPosFilter { + StemmedWordPosFilter { + iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), + scorer: snapshot.new_scorer(&stemmed_word, &kb, boost), + } + } + + fn first_result(&mut self, start: &DocResult) -> Option<(DocResult, Vec)> { + self.iter.advance_gte(start); + self.next_result() + } + + fn next_result(&mut self) -> Option<(DocResult, Vec)> { + if let Some((mut dr, pos)) = self.iter.next() { + let positions = pos.positions(); + if self.scorer.should_score() { + let count = positions.len(); + self.scorer.add_match_score(count as u32, &mut dr); + } + Some((dr, positions)) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.scorer.init(&mut qsi); + } +} + +pub struct StemmedPhraseFilter { + filters: Vec, +} + +impl StemmedPhraseFilter { + pub fn new(filters: Vec) -> StemmedPhraseFilter { + assert!(filters.len() > 0); + StemmedPhraseFilter { filters: filters } + } + + fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { + return None; + } + let (mut base_result, mut base_positions) = base.unwrap(); + + if matches_left == 0 { + return Some(base_result); + } + + let mut current_filter = 0; + loop { + current_filter += 1; + if current_filter == self.filters.len() { + current_filter = 0; + } + + let next = self.filters[current_filter].first_result(&base_result); + + if next.is_none() { + return None; + } + let (next_result, next_positions) = next.unwrap(); + + if base_result == next_result { + let mut new_positions = Vec::new(); + for &pos in next_positions.iter() { + if let Ok(_) = base_positions.binary_search(&(pos.saturating_sub(1))) { + new_positions.push(pos); + } + } + if new_positions.len() > 0 { + // we have valus that survive! reassign back to base_positions + base_positions = new_positions; + matches_left -= 1; + + if matches_left == 0 { + return Some(base_result); + } + } else { + // we didn't match on phrase, so get next_result from first filter + current_filter = 0; + let next = self.filters[current_filter].next_result(); + if next.is_none() { + return None; + } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; + } + } else { + // we didn't match on next_result, so get first_result at next_result on + // 1st filter. 
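
The adjacency check above keeps a position of the following term only when the preceding term occurred exactly one word earlier. A minimal standalone sketch of that test over plain sorted position vectors (the helper name is hypothetical, not part of this patch):

```
/// Keep only positions of the next word that directly follow a surviving
/// position of the previous word. Both slices are sorted word offsets
/// within a single field.
fn surviving_positions(prev: &[u32], next: &[u32]) -> Vec<u32> {
    next.iter()
        .cloned()
        .filter(|&pos| prev.binary_search(&pos.saturating_sub(1)).is_ok())
        .collect()
}

fn main() {
    // "quick" seen at positions 3 and 17; "brown" seen at 4, 9 and 18.
    // Only 4 and 18 continue the phrase "quick brown".
    assert_eq!(surviving_positions(&[3, 17], &[4, 9, 18]), vec![4, 18]);
}
```
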
+ current_filter = 0; + let next = self.filters[current_filter].first_result(&next_result); + if next.is_none() { + return None; + } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; + } + } + } +} + + +impl QueryRuntimeFilter for StemmedPhraseFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[0].first_result(start); + self.result(base_result) + } + + fn next_result(&mut self) -> Option { + let base_result = self.filters[0].next_result(); + self.result(base_result) + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + + +pub struct ExactMatchFilter { + iter: DBIterator, + filter: StemmedPhraseFilter, + kb: KeyBuilder, + phrase: String, + case_sensitive: bool, + term_ordinal: Option, +} + +impl ExactMatchFilter { + pub fn new(snapshot: &Snapshot, + filter: StemmedPhraseFilter, + kb: KeyBuilder, + phrase: String, + case_sensitive: bool) + -> ExactMatchFilter { + ExactMatchFilter { + iter: snapshot.new_iterator(), + filter: filter, + kb: kb, + phrase: if case_sensitive { + phrase + } else { + phrase.to_lowercase() + }, + case_sensitive: case_sensitive, + term_ordinal: None, + } + } + + fn check_exact(&mut self, mut dr: DocResult) -> Option { + loop { + let value_key = self.kb.value_key_from_doc_result(&dr); + + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + + if let Some((key, value)) = self.iter.next() { + debug_assert!(key.starts_with(value_key.as_bytes())); // must always be true! + if let JsonValue::String(string) = JsonFetcher::bytes_to_json_value(&*value) { + let matches = if self.case_sensitive { + self.phrase == string + } else { + self.phrase == string.to_lowercase() + }; + if matches { + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } + return Some(dr); + } else { + if let Some(next) = self.filter.next_result() { + dr = next; + // continue looping + } else { + return None; + } + } + } else { + panic!("Not a string, wtf!"); + } + } else { + panic!("Couldn't find value, hulk smash!"); + } + } + } +} + +impl QueryRuntimeFilter for ExactMatchFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + if let Some(dr) = self.filter.first_result(start) { + self.check_exact(dr) + } else { + None + } + } + + fn next_result(&mut self) -> Option { + if let Some(dr) = self.filter.next_result() { + self.check_exact(dr) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + // we score these as binary. Either they have a value of 1 or nothing. 
+ self.term_ordinal = Some(qsi.num_terms); + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } +} + +pub struct RangeFilter { + iter: DBIterator, + kb: KeyBuilder, + min: Option, + max: Option, + keypath: String, + term_ordinal: Option, +} + +impl RangeFilter { + pub fn new(snapshot: &Snapshot, + kb: KeyBuilder, + min: Option, + max: Option) + -> RangeFilter { + RangeFilter { + iter: snapshot.new_iterator(), + kb: kb, + min: min, + max: max, + // The keypath we use to seek to the correct key within RocksDB + keypath: String::new(), + term_ordinal: None, + } + } +} + +impl QueryRuntimeFilter for RangeFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + let mut value_key = { + // `min` and `max` have the save type, so picking one is OK + let range_operator = self.min.as_ref().or(self.max.as_ref()).unwrap(); + match range_operator { + &RangeOperator::Inclusive(_) | + &RangeOperator::Exclusive(_) => self.kb.number_key(start.seq), + &RangeOperator::True => self.kb.bool_null_key('T', start.seq), + &RangeOperator::False => self.kb.bool_null_key('F', start.seq), + &RangeOperator::Null => self.kb.bool_null_key('N', start.seq), + } + }; + // NOTE vmx 2017-04-13: Iterating over keys is really similar to the + // `DocResultIterator` in `snapshot.rs`. It should probablly be unified. + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + KeyBuilder::truncate_to_keypathword(&mut value_key); + self.keypath = value_key; + self.next_result() + } + + fn next_result(&mut self) -> Option { + while let Some((key, value)) = self.iter.next() { + if !key.starts_with(self.keypath.as_bytes()) { + // we passed the key path we are interested in. nothing left to do + return None; + } + + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + + // The key already matched, hence it's a valid doc result. Return it. 
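
The scan above leans on the ordered key space: it seeks to the field's key path and stops as soon as a key no longer starts with that prefix. The same prefix scan, sketched over an ordered map with invented stand-in keys:

```
use std::collections::BTreeMap;

/// Sketch of the prefix scan: seek to the key path, then keep yielding entries
/// until a key no longer starts with that prefix.
fn scan_prefix<'a>(index: &'a BTreeMap<String, f64>, prefix: &str) -> Vec<(&'a str, f64)> {
    index
        .range(prefix.to_string()..)                 // seek to the first key >= prefix
        .take_while(|(k, _)| k.starts_with(prefix))  // stop once we leave the key path
        .map(|(k, v)| (k.as_str(), *v))
        .collect()
}

fn main() {
    let mut index = BTreeMap::new();
    index.insert("f.price#1,".to_string(), 9.5);
    index.insert("f.price#2,".to_string(), 12.0);
    index.insert("f.title#1,".to_string(), 1.0);
    // only the `.price` key path is visited; the `.title` key terminates the scan
    assert_eq!(scan_prefix(&index, "f.price").len(), 2);
}
```
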
+ if self.min == Some(RangeOperator::True) || self.min == Some(RangeOperator::False) || + self.min == Some(RangeOperator::Null) { + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } + return Some(dr); + } + // Else it's a range query on numbers + + let number = unsafe { + let array = *(value[..].as_ptr() as *const [_; 8]); + mem::transmute::<[u8; 8], f64>(array) + }; + + let min_condition = match self.min { + Some(RangeOperator::Inclusive(min)) => number >= min, + Some(RangeOperator::Exclusive(min)) => number > min, + // No condition was given => it always matches + None => true, + _ => panic!("Can't happen, it returns early on the other types"), + }; + let max_condition = match self.max { + Some(RangeOperator::Inclusive(max)) => number <= max, + Some(RangeOperator::Exclusive(max)) => number < max, + // No condition was given => it always matches + None => true, + _ => panic!("Can't happen, it returns early on the other types"), + }; + + if min_condition && max_condition { + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } + return Some(dr); + } + // Else: No match => KKeep looping and move on to the next key + } + None + } + + // TODO vmx 2017-04-13: Scoring is not implemented yet + fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo) { + // we score these as binary. Either they have a value of 1 or nothing. + self.term_ordinal = Some(qsi.num_terms); + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + + +pub struct DistanceFilter { + filters: Vec, + current_filter: usize, + distance: u32, +} + +impl DistanceFilter { + pub fn new(filters: Vec, distance: u32) -> DistanceFilter { + DistanceFilter { + filters: filters, + current_filter: 0, + distance: distance, + } + } + + fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { + // yes this code complex. I tried to break it up, but it wants to be like this. + + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { + return None; + } + let (mut base_result, positions) = base.unwrap(); + + // This contains tuples of word postions and the filter they came from, + // sorted by word position. + let mut base_positions: Vec<(u32, usize)> = positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + + // distance is number of words between searched words. + // add one to make calculating difference easier since abs(posa - posb) == distance + 1 + let dis = self.distance + 1; + loop { + self.current_filter += 1; + if self.current_filter == self.filters.len() { + self.current_filter = 0; + } + + let next = self.filters[self.current_filter].first_result(&base_result); + + if next.is_none() { + return None; + } + let (next_result, next_positions) = next.unwrap(); + + if base_result != next_result { + // not same field, next_result becomes base_result. + base_result = next_result; + base_positions = next_positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + + matches_left = self.filters.len() - 1; + continue; + } + // so we are in the same field. Now to check the proximity of the values from the + // next result to previous results. 
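
In the window test that follows, a position of the newly scanned term only counts when positions of the earlier terms fall within `distance` words of it. A simplified two-term sketch of that window check (hypothetical helper, sorted positions assumed):

```
/// Simplified two-term version of the proximity test: does any position of the
/// earlier term fall within `distance` words of `pos`? `earlier` is sorted.
fn within_distance(earlier: &[u32], pos: u32, distance: u32) -> bool {
    let dis = distance + 1; // same offset trick as the filter above
    let lo = pos.saturating_sub(dis);
    let first_candidate = earlier.partition_point(|&p| p < lo);
    earlier
        .get(first_candidate)
        .map_or(false, |&p| p <= pos + dis)
}

fn main() {
    let bitcoin: [u32; 3] = [2, 40, 200];
    assert!(within_distance(&bitcoin, 45, 50));   // "gold" at 45 is near 40
    assert!(!within_distance(&bitcoin, 120, 50)); // nothing within 50 words of 120
}
```
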
+ + // new_positions_map will accept positions within range of pos. But only if all + // positions that can be are within range. We use the sorted map so we can add + // the same positions multiple times and it's a noop. + let mut new_positions_map = BTreeMap::new(); + for &pos in next_positions.iter() { + // coud these lines be any longer? No they could not. + let sub = pos.saturating_sub(dis); // underflows othewises + let start = match base_positions.binary_search_by_key(&(sub), |&(pos2, _)| pos2) { + Ok(start) => start, + Err(start) => start, + }; + + let end = match base_positions.binary_search_by_key(&(pos + dis), + |&(pos2, _)| pos2) { + Ok(end) => end, + Err(end) => end, + }; + + // we now collect all the filters within the range + let mut filters_encountered = HashSet::new(); + for &(_, filter_n) in base_positions[start..end].iter() { + filters_encountered.insert(filter_n); + } + + if filters_encountered.len() == self.filters.len() - matches_left { + // we encountered all the filters we can at this stage, + // so we should add them all to the new_positions_map + for &(prev_pos, filter_n) in base_positions[start..end].iter() { + new_positions_map.insert(prev_pos, filter_n); + } + // and add the current pos + new_positions_map.insert(pos, self.current_filter); + } + } + if new_positions_map.len() > 0 { + // we have valus that survive! reassign back to positions + base_positions = new_positions_map.into_iter().collect(); + matches_left -= 1; + + if matches_left == 0 { + return Some(base_result); + } else { + continue; + } + } + // we didn't match on next_result, so get next_result on current filter + let next = self.filters[self.current_filter].next_result(); + + if next.is_none() { + return None; + } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + + matches_left = self.filters.len() - 1; + } + } +} + +impl QueryRuntimeFilter for DistanceFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[self.current_filter].first_result(start); + self.result(base_result) + } + + fn next_result(&mut self) -> Option { + let base_result = self.filters[self.current_filter].next_result(); + self.result(base_result) + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + + +pub struct AndFilter<'a> { + filters: Vec>, + current_filter: usize, + array_depth: usize, +} + +impl<'a> AndFilter<'a> { + pub fn new(filters: Vec>, array_depth: usize) -> AndFilter<'a> { + AndFilter { + filters: filters, + current_filter: 0, + array_depth: array_depth, + } + } + + fn result(&mut self, base: Option) -> Option { + let mut matches_count = self.filters.len() - 1; + + if base.is_none() { + return None; + } + let mut base_result = base.unwrap(); + + base_result.arraypath.resize(self.array_depth, 0); + + loop { + self.current_filter += 1; + if self.current_filter == self.filters.len() { + self.current_filter = 0; + } + + let next = self.filters[self.current_filter].first_result(&base_result); + + if next.is_none() { + return None; + } + let mut next_result = next.unwrap(); + + next_result.arraypath.resize(self.array_depth, 0); + + if base_result == next_result { + matches_count -= 1; + 
base_result.combine(&mut next_result); + if matches_count == 0 { + return Some(base_result); + } + } else { + base_result = next_result; + matches_count = self.filters.len() - 1; + } + } + } +} + +impl<'a> QueryRuntimeFilter for AndFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[self.current_filter].first_result(start); + self.result(base_result) + } + + fn next_result(&mut self) -> Option { + let base_result = self.filters[self.current_filter].next_result(); + self.result(base_result) + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + for f in self.filters.iter() { + try!(f.check_double_not(parent_is_neg)); + } + Ok(()) + } + + fn is_all_not(&self) -> bool { + for f in self.filters.iter() { + if !f.is_all_not() { + return false; + } + } + true + } +} + +/// Used by OrFilter to maintain a already fetched result so we don't refetch when one side isn't +/// returned to caller. Because we won't know which side gets returned until both sides are +/// fetched. +pub struct FilterWithResult<'a> { + filter: Box, + result: Option, + is_done: bool, + array_depth: usize, +} + +impl<'a> FilterWithResult<'a> { + fn prime_first_result(&mut self, start: &DocResult) { + if self.is_done { + return; + } + if self.result.is_none() { + self.result = self.filter.first_result(start); + } else if self.result + .as_ref() + .unwrap() + .less(start, self.array_depth) { + self.result = self.filter.first_result(start); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result + .as_mut() + .unwrap() + .arraypath + .resize(self.array_depth, 0); + } + } + + fn prime_next_result(&mut self) { + if self.is_done { + return; + } + if self.result.is_none() { + self.result = self.filter.next_result(); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result + .as_mut() + .unwrap() + .arraypath + .resize(self.array_depth, 0); + } + } +} + +pub struct OrFilter<'a> { + left: FilterWithResult<'a>, + right: FilterWithResult<'a>, +} + +impl<'a> OrFilter<'a> { + pub fn new(left: Box, + right: Box, + array_depth: usize) + -> OrFilter<'a> { + OrFilter { + left: FilterWithResult { + filter: left, + result: None, + array_depth: array_depth, + is_done: false, + }, + + right: FilterWithResult { + filter: right, + result: None, + array_depth: array_depth, + is_done: false, + }, + } + } + fn take_smallest(&mut self) -> Option { + if let Some(mut left) = self.left.result.take() { + // left exists + if let Some(mut right) = self.right.result.take() { + // both exist, return smallest + match left.cmp(&right) { + Ordering::Less => { + // left is smallest, return and put back right + self.right.result = Some(right); + Some(left) + } + Ordering::Greater => { + // right is smallest, return and put back left + self.left.result = Some(left); + Some(right) + } + Ordering::Equal => { + left.combine(&mut right); + self.right.result = Some(right); + Some(left) + } + } + } else { + // right doesn't exist. return left + Some(left) + } + } else { + // left doesn't exist + if self.right.result.is_some() { + // right exists. return it + self.right.result.take() + } else { + // neither exists. 
return none + None + } + } + } +} + +impl<'a> QueryRuntimeFilter for OrFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Option { + self.left.prime_first_result(start); + self.right.prime_first_result(start); + self.take_smallest() + } + + fn next_result(&mut self) -> Option { + self.left.prime_next_result(); + self.right.prime_next_result(); + self.take_smallest() + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.left.filter.prepare_relevancy_scoring(&mut qsi); + self.right.filter.prepare_relevancy_scoring(&mut qsi); + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + try!(self.left.filter.check_double_not(parent_is_neg)); + try!(self.right.filter.check_double_not(parent_is_neg)); + Ok(()) + } + + fn is_all_not(&self) -> bool { + if self.left.filter.is_all_not() && self.right.filter.is_all_not() { + true + } else { + false + } + } +} + + +pub struct NotFilter<'a> { + iter: DBIterator, + filter: Box, + last_doc_returned: Option, + kb: KeyBuilder, +} + +impl<'a> NotFilter<'a> { + pub fn new(snapshot: &Snapshot, + filter: Box, + kb: KeyBuilder) + -> NotFilter<'a> { + NotFilter { + iter: snapshot.new_iterator(), + filter: filter, + last_doc_returned: Some(DocResult::new()), + kb: kb, + } + } + + fn is_a_not_match(&mut self, dr: &DocResult) -> bool { + let ret = match dr.last_segment_array_index() { + Some(&0) => { + // if we got a (not) match on the first array element, it's always a match + // but only if the document actually exists. + true + } + Some(_) => { + // if we got a (not) match on any other element, check to make sure the key exists. + // if not, it means other elements did a regular match and skipped them, then we + // ran off the end of the array. + let value_key = self.kb.value_key_from_doc_result(&dr); + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + if let Some((key, _value)) = self.iter.next() { + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + KeyBuilder::is_keypath_prefix(&value_key, &key_str) + } else { + false + } + } + None => { + //not an array. always a (not) match. + true + } + }; + if ret { + // make sure we actually have a document. It's possible we matched a non-existent seq. 
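
The `take_smallest` logic in `OrFilter` above is a classic merge of two sorted result streams: always emit the smaller head, and on a tie combine the two sides and emit once. A sketch of that merge over plain sorted sequence numbers (the free function and item type are illustrative only):

```
use std::cmp::Ordering;
use std::iter::Peekable;

/// Merge two sorted streams of sequence numbers the way the OR filter does:
/// hand back the smaller head each time, emitting ties only once.
fn merge_or<I>(mut left: Peekable<I>, mut right: Peekable<I>) -> Vec<u64>
where
    I: Iterator<Item = u64>,
{
    let mut out = Vec::new();
    loop {
        match (left.peek().copied(), right.peek().copied()) {
            (Some(l), Some(r)) => match l.cmp(&r) {
                Ordering::Less => out.push(left.next().unwrap()),
                Ordering::Greater => out.push(right.next().unwrap()),
                Ordering::Equal => {
                    out.push(left.next().unwrap());
                    right.next(); // same doc on both sides: emit once
                }
            },
            (Some(_), None) => out.push(left.next().unwrap()),
            (None, Some(_)) => out.push(right.next().unwrap()),
            (None, None) => break,
        }
    }
    out
}

fn main() {
    let merged = merge_or(vec![1, 3, 5].into_iter().peekable(),
                          vec![2, 3, 6].into_iter().peekable());
    assert_eq!(merged, vec![1, 2, 3, 5, 6]);
}
```
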
+ let mut kb = KeyBuilder::new(); + kb.push_object_key("_id"); + let value_key = kb.value_key_from_doc_result(dr); + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + if let Some((key, _value)) = self.iter.next() { + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + value_key == key_str + } else { + false + } + } else { + false + } + } +} + +impl<'a> QueryRuntimeFilter for NotFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Option { + let mut start = start.clone_only_seq_and_arraypath(); + start.arraypath.resize(self.kb.arraypath_len(), 0); + while let Some(dr) = self.filter.first_result(&start) { + if start.less(&dr, self.kb.arraypath_len()) { + if self.is_a_not_match(&start) { + self.last_doc_returned = Some(start.clone_only_seq_and_arraypath()); + return Some(start.clone_only_seq_and_arraypath()); + } else { + start.increment_first(self.kb.arraypath_len()); + } + } else { + start.increment_last(self.kb.arraypath_len()); + } + } + self.last_doc_returned = None; + if self.is_a_not_match(&start) { + Some(start) + } else { + None + } + } + + fn next_result(&mut self) -> Option { + if let Some(mut next) = self.last_doc_returned.take() { + next.increment_last(self.kb.arraypath_len()); + self.first_result(&next) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) { + // no op + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + if parent_is_neg { + return Err(Error::Parse("Logical not (\"!\") is nested inside of another logical not. \ + This is not allowed." + .to_string())); + } + try!(self.filter.check_double_not(true)); + Ok(()) + } + + fn is_all_not(&self) -> bool { + true + } +} + +pub struct BindFilter<'a> { + bind_var_name: String, + filter: Box, + array_depth: usize, + kb: KeyBuilder, + option_next: Option, +} + +impl<'a> BindFilter<'a> { + pub fn new(bind_var_name: String, + filter: Box, + kb: KeyBuilder) + -> BindFilter { + BindFilter { + bind_var_name: bind_var_name, + filter: filter, + array_depth: kb.arraypath_len(), + kb: kb, + option_next: None, + } + } + + fn collect_results(&mut self, mut first: DocResult) -> Option { + let value_key = self.kb.value_key_from_doc_result(&first); + first.add_bind_name_result(&self.bind_var_name, value_key); + + while let Some(next) = self.filter.next_result() { + if next.seq == first.seq { + let value_key = self.kb.value_key_from_doc_result(&next); + first.add_bind_name_result(&self.bind_var_name, value_key); + } else { + self.option_next = Some(next); + return Some(first); + } + } + Some(first) + } +} + +impl<'a> QueryRuntimeFilter for BindFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Option { + let first = if let Some(next) = self.option_next.take() { + if start.less(&next, self.array_depth) { + Some(next) + } else { + self.filter.first_result(&start) + } + } else { + self.filter.first_result(&start) + }; + + if let Some(first) = first { + self.collect_results(first) + } else { + None + } + } + + fn next_result(&mut self) -> Option { + let first = if let Some(next) = self.option_next.take() { + Some(next) + } else { + self.filter.next_result() + }; + + if let Some(first) = first { + self.collect_results(first) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.filter.prepare_relevancy_scoring(&mut qsi); + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + 
self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } +} + +pub struct BoostFilter<'a> { + filter: Box, + boost: f32, +} + +impl<'a> BoostFilter<'a> { + pub fn new(filter: Box, boost: f32) -> BoostFilter { + BoostFilter { + filter: filter, + boost: boost, + } + } +} + +impl<'a> QueryRuntimeFilter for BoostFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Option { + if let Some(mut dr) = self.filter.first_result(&start) { + dr.boost_scores(self.boost); + Some(dr) + } else { + None + } + } + + fn next_result(&mut self) -> Option { + if let Some(mut dr) = self.filter.next_result() { + dr.boost_scores(self.boost); + Some(dr) + } else { + None + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.filter.prepare_relevancy_scoring(&mut qsi); + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } +} diff --git a/src/index.rs b/src/index.rs index 1e644ef..6a37c1c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,185 +1,480 @@ extern crate rocksdb; +extern crate varint; +extern crate uuid; -use std::collections::HashMap; +use std::collections::{HashSet, BTreeMap}; +use std::str; +use std::io::Cursor; +use std::mem; +use std::io::Write; +use self::uuid::{Uuid, UuidVersion}; +use std::cmp::Ordering; -use records_capnp::header; -// Needed for a trait in order to `dekete/put()` into a `rocksdb::WriteBatch` -use self::rocksdb::Writable; +use self::varint::{VarintRead, VarintWrite}; -use json_shred::{Shredder}; +use rocksdb::{MergeOperands, IteratorMode, Snapshot as RocksSnapshot, CompactionDecision}; +pub use rocksdb::WriteBatch; +use error::Error; +use json_shred::Shredder; +use key_builder::KeyBuilder; +use snapshot::Snapshot; const NOISE_HEADER_VERSION: u64 = 1; -struct Header { - version: u64, - high_seq: u64, +pub struct Index { + name: String, + high_doc_seq: u64, + pub rocks: Option, } -impl Header { - fn new() -> Header { - Header{ - version: NOISE_HEADER_VERSION, - high_seq: 0, - } - } - fn serialize(&self) -> Vec { - let mut message = ::capnp::message::Builder::new_default(); - { - let mut header = message.init_root::(); - header.set_version(self.version); - header.set_high_seq(self.high_seq); - } - let mut bytes = Vec::new(); - ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); - bytes - } +pub struct Batch { + wb: rocksdb::WriteBatch, + id_str_in_batch: HashSet, } - -pub struct Index { - read_options: rocksdb::ReadOptions, - write_options: rocksdb::WriteOptions, - high_doc_seq: u64, - pub rocks: Option, - id_str_to_id_seq: HashMap, - batch: rocksdb::WriteBatch, +impl Batch { + pub fn new() -> Batch { + Batch { + wb: rocksdb::WriteBatch::default(), + id_str_in_batch: HashSet::new(), + } + } } pub enum OpenOptions { - Create + Create, } impl Index { pub fn new() -> Index { Index { - read_options: rocksdb::ReadOptions::new(), - write_options: rocksdb::WriteOptions::new(), + name: String::new(), high_doc_seq: 0, rocks: None, - id_str_to_id_seq: HashMap::new(), - batch: rocksdb::WriteBatch::new(), } } // NOTE vmx 2016-10-13: Perhpas the name should be specified on `new()` as it is bound // to a single instance. The next question would then be if `open()` really makes sense // or if it should be combined with `new()`. 
//fn open(&mut self, name: &str, open_options: Option) -> Result { - pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), String> { - let mut rocks_options = rocksdb::Options::new(); - println!("still here1"); + pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), Error> { + let mut rocks_options = rocksdb::Options::default(); + rocks_options.set_comparator("noise_cmp", Index::compare_keys); + rocks_options.set_merge_operator("noise_merge", Index::sum_merge); + rocks_options.set_compaction_filter("noise_compact", Index::compaction_filter); + let rocks = match rocksdb::DB::open(&rocks_options, name) { Ok(rocks) => rocks, Err(error) => { match open_options { Some(OpenOptions::Create) => (), - _ => return Err(error), + _ => return Err(Error::Rocks(error)), } rocks_options.create_if_missing(true); let rocks = try!(rocksdb::DB::open(&rocks_options, name)); - let header = Header::new(); - let status = rocks.put_opt(b"HDB", &*header.serialize(), &self.write_options); - println!("put was ok? {}", status.is_ok()); + let mut bytes = Vec::with_capacity(8 * 2); + bytes + .write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)) + .unwrap(); + bytes.write(&Index::convert_u64_to_bytes(0)).unwrap(); + try!(rocks.put_opt(b"HDB", &bytes, &rocksdb::WriteOptions::new())); + rocks } }; // validate header is there - let value = try!(rocks.get_opt(b"HDB", &self.read_options)).unwrap(); - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let header = message_reader.get_root::().unwrap(); - assert_eq!(header.get_version(), NOISE_HEADER_VERSION); - self.high_doc_seq = header.get_high_seq(); + let value = try!(rocks.get(b"HDB")).unwrap(); self.rocks = Some(rocks); + assert_eq!(value.len(), 8 * 2); + // first 8 is version + assert_eq!(Index::convert_bytes_to_u64(&value[..8]), + NOISE_HEADER_VERSION); + // next 8 is high seq + self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]); + self.name = name.to_string(); Ok(()) } - // NOTE vmx 2016-10-13: As one index is tied to one database, this should be a method - // without a parameter - pub fn delete(name: &str) -> Result<(), String> { - rocksdb::DB::destroy(&rocksdb::Options::new(), name) + pub fn is_open(&self) -> bool { + self.rocks.is_some() + } + + pub fn get_name(&self) -> &str { + &self.name + } + + pub fn new_snapshot(&self) -> Snapshot { + Snapshot::new(RocksSnapshot::new(self.rocks.as_ref().unwrap())) + } + + //This deletes the Rockdbs instance from disk + pub fn drop(name: &str) -> Result<(), Error> { + let ret = try!(rocksdb::DB::destroy(&rocksdb::Options::default(), name)); + Ok(ret) } - pub fn add(&mut self, json: &str) -> Result<(), String> { + pub fn add(&mut self, json: &str, mut batch: &mut Batch) -> Result { + if !self.is_open() { + return Err(Error::Write("Index isn't open.".to_string())); + } let mut shredder = Shredder::new(); - // NOTE vmx 2016-10-13: Needed for the lifetime-checker, though not sure if it now really - // does the right thing. Does the `try!()` still return as epected? - { - let docid = try!(shredder.shred(json, self.high_doc_seq + 1)); + let (seq, docid) = if let Some(docid) = try!(shredder.shred(json)) { + // user supplied doc id, see if we have an existing one. 
+ if batch.id_str_in_batch.contains(&docid) { + // oops use trying to add some doc 2x to this batch. + return Err(Error::Write("Attempt to insert multiple docs with same _id" + .to_string())); + } + if let Some((seq, existing_key_values)) = try!(self.gather_doc_fields(&docid)) { + shredder.merge_existing_doc(existing_key_values); + (seq, docid) + } else { + // no existing document found, so we use the one supplied. + self.high_doc_seq += 1; + (self.high_doc_seq, docid) + } + } else { + // no doc id supplied in document, so we create one. + let docid = Uuid::new(UuidVersion::Random) + .unwrap() + .simple() + .to_string(); + try!(shredder.add_id(&docid)); self.high_doc_seq += 1; - self.id_str_to_id_seq.insert(format!("I{}", docid), format!("S{}", self.high_doc_seq)); + (self.high_doc_seq, docid) + }; + // now everything needs to be added to the batch, + try!(shredder.add_all_to_batch(seq, &mut batch.wb)); + batch.id_str_in_batch.insert(docid.clone()); + + Ok(docid) + } + + /// Returns Ok(true) if the document was found and deleted, Ok(false) if it could not be found + pub fn delete(&mut self, docid: &str, mut batch: &mut Batch) -> Result { + if !self.is_open() { + return Err(Error::Write("Index isn't open.".to_string())); + } + if batch.id_str_in_batch.contains(docid) { + // oops use trying to delete a doc that's in the batch. Can't happen, + return Err(Error::Write("Attempt to delete doc with same _id added earlier" + .to_string())); + } + if let Some((seq, key_values)) = try!(self.gather_doc_fields(docid)) { + let mut shredder = Shredder::new(); + try!(shredder.delete_existing_doc(docid, seq, key_values, &mut batch.wb)); + batch.id_str_in_batch.insert(docid.to_string()); + Ok(true) + } else { + Ok(false) + } + } + + fn gather_doc_fields(&self, + docid: &str) + -> Result>)>, Error> { + if let Some(seq) = try!(self.fetch_seq(&docid)) { + // collect up all the fields for the existing doc + let kb = KeyBuilder::new(); + let value_key = kb.value_key(seq); + let mut key_values = BTreeMap::new(); + + let mut iter = self.rocks + .as_ref() + .unwrap() + .iterator(IteratorMode::Start); + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + loop { + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => break, + }; + + if !key.starts_with(value_key.as_bytes()) { + break; + } + let key = unsafe { str::from_utf8_unchecked(&key) }.to_string(); + let value = value.iter().map(|i| *i).collect(); + key_values.insert(key, value); + } + return Ok(Some((seq, key_values))); + } else { + return Ok(None); } - try!(shredder.add_to_batch(&self.batch)); - Ok(()) } // Store the current batch - pub fn flush(&mut self) -> Result<(), String> { + pub fn flush(&mut self, mut batch: Batch) -> Result<(), Error> { // Flush can only be called if the index is open - // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? 
- assert!(&self.rocks.is_some()); + if !self.is_open() { + return Err(Error::Write("Index isn't open.".to_string())); + } let rocks = self.rocks.as_ref().unwrap(); - // Look up all doc ids and 'delete' from the seq_to_ids keyspace - for key in self.id_str_to_id_seq.keys() { - // TODO vmx 2016-10-17: USe multiget once the Rusts wrapper supports it - match rocks.get_opt(key.as_bytes(), &self.read_options) { - Ok(Some(seq)) => { - try!(self.batch.delete(&*seq)); - }, - _ => {} - } + let mut bytes = Vec::with_capacity(8 * 2); + bytes + .write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)) + .unwrap(); + bytes + .write(&Index::convert_u64_to_bytes(self.high_doc_seq)) + .unwrap(); + try!(batch.wb.put(b"HDB", &bytes)); + + let status = try!(rocks.write(batch.wb)); + Ok(status) + } + + pub fn all_keys(&self) -> Result, Error> { + if !self.is_open() { + return Err(Error::Write("Index isn't open.".to_string())); + } + let rocks = self.rocks.as_ref().unwrap(); + let mut results = Vec::new(); + for (key, _value) in rocks.iterator(rocksdb::IteratorMode::Start) { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push(key_string); } + Ok(results) + } - // Add the ids_to_seq keyspace entries - for (id, seq) in &self.id_str_to_id_seq { - try!(self.batch.put(id.as_bytes(), seq.as_bytes())); - try!(self.batch.put(seq.as_bytes(), id.as_bytes())); + /// Should not be used generally since it not varint. Used for header fields + /// since only one header is in the database it's not a problem with excess size. + fn convert_bytes_to_u64(bytes: &[u8]) -> u64 { + debug_assert!(bytes.len() == 8); + let mut buffer = [0; 8]; + for (n, b) in bytes.iter().enumerate() { + buffer[n] = *b; } + unsafe { mem::transmute(buffer) } + } - let mut header = Header::new(); - header.high_seq = self.high_doc_seq; - try!(self.batch.put(b"HDB", &*header.serialize())); + /// Should not be used generally since it not varint. Used for header fields + /// since only one header is in the database it's not a problem with excess size. + fn convert_u64_to_bytes(val: u64) -> [u8; 8] { + unsafe { mem::transmute(val) } + } - let status = rocks.write_opt(&self.batch, &self.write_options); - self.batch.clear(); - self.id_str_to_id_seq.clear(); - status + pub fn convert_bytes_to_i32(bytes: &[u8]) -> i32 { + let mut vec = Vec::with_capacity(bytes.len()); + vec.extend(bytes.into_iter()); + let mut read = Cursor::new(vec); + read.read_signed_varint_32().unwrap() } - pub fn fetch_id(&self, seq: u64) -> Result, String> { - // Fetching an ID is only possible if the index is open + pub fn convert_i32_to_bytes(val: i32) -> Vec { + let mut bytes = Cursor::new(Vec::new()); + assert!(bytes.write_signed_varint_32(val).is_ok()); + bytes.into_inner() + } + + pub fn fetch_seq(&self, id: &str) -> Result, Error> { + // Fetching an seq is only possible if the index is open // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? 
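
The header helpers above pack two fixed-width `u64` values, the format version followed by the highest document sequence, into the 16-byte `HDB` value. The same layout sketched with `to_ne_bytes`/`from_ne_bytes` instead of `mem::transmute` (native byte order either way):

```
use std::convert::TryInto;

/// 16-byte header: format version (8 bytes) followed by high doc sequence (8 bytes).
fn encode_header(version: u64, high_seq: u64) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(16);
    bytes.extend_from_slice(&version.to_ne_bytes());
    bytes.extend_from_slice(&high_seq.to_ne_bytes());
    bytes
}

fn decode_header(bytes: &[u8]) -> (u64, u64) {
    assert_eq!(bytes.len(), 16);
    let version = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
    let high_seq = u64::from_ne_bytes(bytes[8..].try_into().unwrap());
    (version, high_seq)
}

fn main() {
    let bytes = encode_header(1, 42);
    assert_eq!(decode_header(&bytes), (1, 42));
}
```
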
assert!(&self.rocks.is_some()); let rocks = self.rocks.as_ref().unwrap(); - let key = format!("S{}", seq); - match try!(rocks.get_opt(&key.as_bytes(), &self.read_options)) { + let key = format!("I{}", id); + match try!(rocks.get(&key.as_bytes())) { // If there is an id, it's UTF-8 - Some(id) => Ok(Some(id.to_utf8().unwrap().to_string())), - None => Ok(None) + Some(bytes) => Ok(Some(bytes.to_utf8().unwrap().parse().unwrap())), + None => Ok(None), + } + } + + fn compaction_filter(_level: u32, key: &[u8], value: &[u8]) -> CompactionDecision { + if !(key[0] as char == 'C' || key[0] as char == 'K') { + return CompactionDecision::Keep; + } + if 0 == Index::convert_bytes_to_i32(&value) { + CompactionDecision::Remove + } else { + CompactionDecision::Keep + } + } + + fn compare_keys(a: &[u8], b: &[u8]) -> Ordering { + let value_prefixes = ['W', 'f', 'T', 'F', 'N']; + if value_prefixes.contains(&(a[0] as char)) && value_prefixes.contains(&(b[0] as char)) { + let astr = unsafe { str::from_utf8_unchecked(&a) }; + let bstr = unsafe { str::from_utf8_unchecked(&b) }; + KeyBuilder::compare_keys(astr, bstr) + } else { + a.cmp(b) } } + + fn sum_merge(new_key: &[u8], + existing_val: Option<&[u8]>, + operands: &mut MergeOperands) + -> Vec { + if !(new_key[0] as char == 'C' || new_key[0] as char == 'K') { + panic!("unknown key type to merge!"); + } + + let mut count = if let Some(bytes) = existing_val { + Index::convert_bytes_to_i32(&bytes) + } else { + 0 + }; + + for bytes in operands { + count += Index::convert_bytes_to_i32(&bytes); + } + Index::convert_i32_to_bytes(count) + } } #[cfg(test)] mod tests { extern crate rocksdb; - use super::{Index, OpenOptions}; + use super::{Index, OpenOptions, Batch}; + use query::Query; + use std::str; + use snapshot::JsonFetcher; + use json_value::JsonValue; #[test] fn test_open() { + let dbname = "target/tests/firstnoisedb"; + let _ = Index::drop(dbname); + let mut index = Index::new(); //let db = super::Index::open("firstnoisedb", Option::None).unwrap(); - index.open("firstnoisedb", Some(OpenOptions::Create)).unwrap(); - index.flush().unwrap(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + index.flush(Batch::new()).unwrap(); + } + + #[test] + fn test_uuid() { + let dbname = "target/tests/testuuid"; + let _ = Index::drop(dbname); + let mut batch = Batch::new(); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let id = index.add(r#"{"foo":"bar"}"#, &mut batch).unwrap(); + + index.flush(batch).unwrap(); + + let mut results = Query::get_matches(r#"find {foo:=="bar"}"#, &index).unwrap(); + let query_id = results.get_next_id().unwrap(); + assert!(query_id.len() == 32); + assert_eq!(query_id, id); + } + + #[test] + fn test_compaction() { + let dbname = "target/tests/testcompaction"; + let _ = Index::drop(dbname); + let mut batch = Batch::new(); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let id = index.add(r#"{"foo":"bar"}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); + + let mut batch = Batch::new(); + index.delete(&id, &mut batch).unwrap(); + index.flush(batch).unwrap(); + + let rocks = index.rocks.as_mut().unwrap(); + + // apparently you need to do compaction twice when there are merges + // first one lets the merges happen, the second lets them be collected. + // this is acceptable since eventually the keys go away. + // if this test fails non-deterministically we might have a problem. 
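
The `sum_merge`/`compaction_filter` pair above amounts to reference counting for the `C` and `K` keys: each write merges a signed delta, and compaction drops a key once its accumulated count reaches zero, which is why this test compacts twice. A plain-integer sketch of that accumulation:

```
/// Counts are stored as signed deltas; the merge operator simply sums
/// the existing value with all pending operands.
fn merged_count(existing: Option<i32>, operands: &[i32]) -> i32 {
    existing.unwrap_or(0) + operands.iter().sum::<i32>()
}

fn main() {
    // two documents add the same term (+1 each), then one of them is deleted (-1)
    let count = merged_count(None, &[1, 1, -1]);
    assert_eq!(count, 1);
    // deleting the remaining document brings the count to zero, at which point
    // the compaction filter may remove the key entirely
    assert_eq!(merged_count(Some(count), &[-1]), 0);
}
```
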
+ rocks.compact_range(None, None); + rocks.compact_range(None, None); + + let mut iter = rocks.iterator(rocksdb::IteratorMode::Start); + let (key, _value) = iter.next().unwrap(); + assert!(key.starts_with(&b"HDB"[..])); + assert!(iter.next().is_none()); + } + + #[test] + fn test_updates() { + let dbname = "target/tests/testupdates"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let mut batch = Batch::new(); + let _ = index + .add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#, + &mut batch) + .unwrap(); + + index.flush(batch).unwrap(); + { + let rocks = index.rocks.as_mut().unwrap(); + + let mut results = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); + } + } + + let expected = vec![("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz$0".to_string(), JsonValue::Number(1.0)), + ("V1#.baz$1".to_string(), JsonValue::Number(2.0)), + ("V1#.baz$2$0".to_string(), JsonValue::Number(3.0)), + ("V1#.baz$2$1".to_string(), JsonValue::Number(4.0)), + ("V1#.baz$2$2$0".to_string(), JsonValue::Number(5.0)), + ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; + assert_eq!(results, expected); + } + + let mut batch = Batch::new(); + let _ = index + .add(r#"{"_id":"1", "foo":"array", "baz": []}"#, &mut batch) + .unwrap(); + index.flush(batch).unwrap(); + + let rocks = index.rocks.as_mut().unwrap(); + + let mut results = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); + } + } + let expected = vec![("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz".to_string(), JsonValue::Array(vec![])), + ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; + assert_eq!(results, expected); + } + + #[test] + fn test_empty_doc() { + let dbname = "target/tests/testemptydoc"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let mut batch = Batch::new(); + let id = index.add("{}", &mut batch).unwrap(); + + index.flush(batch).unwrap(); + let query = r#"find {_id:==""#.to_string() + &id + "\"} return ."; + let mut results = Query::get_matches(&query, &index).unwrap(); + let json = results.next().unwrap(); + assert_eq!(json, + JsonValue::Object(vec![("_id".to_string(), JsonValue::String(id))])); + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index 1860076..acc65e0 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -1,258 +1,585 @@ extern crate rocksdb; extern crate rustc_serialize; +extern crate varint; -use std::collections::HashMap; +use std::collections::{HashMap, BTreeMap}; +use std::mem::transmute; +use std::io::Write; +use std::str::Chars; +use std::io::Cursor; +use std::str; +use self::varint::VarintWrite; use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; -// Needed for a trait in order to `put()` into a `rocksdb::WriteBatch` -use self::rocksdb::Writable; -use key_builder::{KeyBuilder, SegmentType}; -use records_capnp::payload; +use error::Error; +use key_builder::KeyBuilder; use stems::Stems; +use index::Index; -// Good example of using 
rustc_serialize: https://github.com/ajroetker/beautician/blob/master/src/lib.rs -// Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs -// Another parser pased on rustc_serializ: https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 +// Good example of using rustc_serialize: +// https://github.com/ajroetker/beautician/blob/master/src/lib.rs +// Callback based JSON streaming parser: +// https://github.com/gyscos/json-streamer.rs +// Another parser pased on rustc_serializ: +// https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 - -#[derive(Debug, PartialEq)] -struct WordInfo { - //offset in the text field where the stemmed text starts - stemmed_offset: u64, - - // the suffix of the stemmed text. When applied over stemmed, the original - // text is returned. - suffix_text: String, - - // the start of the suffixText - suffix_offset: u64, +enum ObjectKeyTypes { + /// _id field + Id, + /// Normal key + Key(String), + /// No key found + NoKey, } -type ArrayOffsets = Vec; -type ArrayOffsetsToWordInfo = HashMap>; -type WordPathInfoMap = HashMap; - #[derive(Debug)] pub struct Shredder { - keybuilder: KeyBuilder, - map: WordPathInfoMap, - path_array_offsets: ArrayOffsets, + kb: KeyBuilder, + doc_id: Option, + object_keys_indexed: Vec, + shredded_key_values: BTreeMap>, + existing_key_value_to_delete: BTreeMap>, } - impl Shredder { pub fn new() -> Shredder { - Shredder{ - keybuilder: KeyBuilder::new(), - map: WordPathInfoMap::new(), - path_array_offsets: Vec::new(), + Shredder { + kb: KeyBuilder::new(), + doc_id: None, + object_keys_indexed: Vec::new(), + shredded_key_values: BTreeMap::new(), + existing_key_value_to_delete: BTreeMap::new(), + } + } + + fn add_number_entries(kb: &mut KeyBuilder, + number: &[u8], + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { + // Add/delete the key that is used for range lookups + let number_key = kb.number_key(docseq); + if delete { + try!(batch.delete(&number_key.as_bytes())); + } else { + // The number contains the `f` prefix + try!(batch.put(&number_key.as_bytes(), &number[1..])); + } + + Ok(()) + } + + fn add_bool_null_entries(kb: &mut KeyBuilder, + prefix: char, + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { + let key = kb.bool_null_key(prefix, docseq); + if delete { + try!(batch.delete(&key.as_bytes())); + } else { + // No need to store any value as the key already contains it + try!(batch.put(&key.as_bytes(), &[])); } + + Ok(()) } - fn add_entries(&mut self, text: String, docseq: u64) { - let stems = Stems::new(text.as_str()); + + fn add_stemmed_entries(kb: &mut KeyBuilder, + text: &str, + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { + let stems = Stems::new(text); + let mut word_to_word_positions = HashMap::new(); + let mut total_words: i32 = 0; + + let mut one_enc_bytes = Cursor::new(Vec::new()); + let num = if delete { -1 } else { 1 }; + assert!(one_enc_bytes.write_signed_varint_32(num).is_ok()); + for stem in stems { - self.keybuilder.push_word(&stem.stemmed); - self.keybuilder.push_doc_seq(docseq); - let map_path_array_offsets = self.map.entry(self.keybuilder.key()) - .or_insert(ArrayOffsetsToWordInfo::new()); - let map_word_infos = map_path_array_offsets.entry(self.path_array_offsets.clone()) - .or_insert(Vec::new()); - map_word_infos.push(WordInfo{ - stemmed_offset: stem.stemmed_offset as u64, - suffix_text: stem.suffix.to_string(), - suffix_offset: 
stem.suffix_offset as u64, - }); - self.keybuilder.pop_doc_seq(); - self.keybuilder.pop_word(); + total_words += 1; + let &mut (ref mut word_positions, ref mut count) = + word_to_word_positions + .entry(stem.stemmed) + .or_insert((Cursor::new(Vec::new()), 0)); + if !delete { + assert!(word_positions + .write_unsigned_varint_32(stem.word_pos) + .is_ok()); + } + *count += 1; + } + + for (stemmed, (word_positions, count)) in word_to_word_positions { + let key = kb.stemmed_word_key(&stemmed, docseq); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); + } + + let key = kb.field_length_key(docseq); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + try!(batch.put(&key.into_bytes(), &Index::convert_i32_to_bytes(total_words))); + } + + let key = kb.keypathword_count_key(&stemmed); + if delete { + try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(-count))); + } else { + try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(count))); + } + + let key = kb.keypath_count_key(); + try!(batch.merge(&key.into_bytes(), one_enc_bytes.get_ref())); } - println!("add_entries: map: {:?}", self.map); + + Ok(()) + } + + fn add_value(&mut self, code: char, value: &[u8]) -> Result<(), Error> { + let key = self.kb.value_key_path_only(); + let mut buffer = Vec::with_capacity(value.len() + 1); + buffer.push(code as u8); + try!((&mut buffer as &mut Write).write_all(value)); + self.shredded_key_values.insert(key, buffer); + Ok(()) } - fn inc_top_array_offset(&mut self) { - // we encounter a new element. if we are a child element of an array - // increment the offset. If we aren't (we are the root value or a map - // value) we don't increment - if let Some(SegmentType::Array) = self.keybuilder.last_pushed_segment_type() { - if let Some(last) = self.path_array_offsets.last_mut() { - *last += 1; + fn maybe_add_value(&mut self, + parser: &Parser, + code: char, + value: &[u8]) + -> Result<(), Error> { + match self.extract_key(parser.stack().top()) { + ObjectKeyTypes::Id => { + if code != 's' && self.kb.keypath_segments_len() == 1 { + //nested fields can be _id, not root fields + return Err(Error::Shred("Expected string for `_id` field, got another type" + .to_string())); + } + self.doc_id = Some(unsafe { str::from_utf8_unchecked(value) }.to_string()); + self.kb.pop_object_key(); + self.kb.push_object_key("_id"); + *self.object_keys_indexed.last_mut().unwrap() = true; + try!(self.add_value(code, &value)); + } + ObjectKeyTypes::Key(key) => { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(&key); + *self.object_keys_indexed.last_mut().unwrap() = true; + try!(self.add_value(code, &value)); + } + ObjectKeyTypes::NoKey => { + try!(self.add_value(code, &value)); + self.kb.inc_top_array_offset(); } } + Ok(()) } + // Extract key if it exists and indicates if it's a special type of key + fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { + match stack_element { + Some(StackElement::Key(key)) => { + if self.kb.keypath_segments_len() == 1 && key == "_id" { + ObjectKeyTypes::Id + } else { + ObjectKeyTypes::Key(key.to_string()) + } + } + _ => ObjectKeyTypes::NoKey, + } + } - pub fn shred(&mut self, json: &str, docseq: u64) -> Result<&str, String> { - println!("{}", json); - let mut parser = Parser::new(json.chars()); - let mut token = parser.next(); + // If we are inside an object we need to push the key to 
the key builder + // Don't push them if they are reserved fields (starting with underscore) + fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { + if let Some(StackElement::Key(key)) = stack_element { + if self.kb.keypath_segments_len() == 1 && key == "_id" { + return Err(Error::Shred("Expected string for `_id` field, got another type" + .to_string())); + } else { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(key); + *self.object_keys_indexed.last_mut().unwrap() = true; + } + } + Ok(()) + } + + pub fn add_all_to_batch(&mut self, + seq: u64, + batch: &mut rocksdb::WriteBatch) + -> Result<(), Error> { + for (key, value) in &self.existing_key_value_to_delete { + self.kb.clear(); + self.kb + .parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + match value[0] as char { + 's' => { + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); + } + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); + } + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + true)); + } + _ => {} + } + try!(batch.delete(&key.as_bytes())); + } + self.existing_key_value_to_delete = BTreeMap::new(); + + for (key, value) in &self.shredded_key_values { + self.kb.clear(); + self.kb.parse_value_key_path_only(&key); + match value[0] as char { + 's' => { + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, false)); + } + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, false)); + } + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + false)); + } + _ => {} + } + let key = self.kb.value_key(seq); + try!(batch.put(&key.as_bytes(), &value.as_ref())); + } + self.shredded_key_values = BTreeMap::new(); + + let key = KeyBuilder::id_to_seq_key(self.doc_id.as_ref().unwrap()); + try!(batch.put(&key.into_bytes(), &seq.to_string().as_bytes())); + + let key = KeyBuilder::seq_key(seq); + try!(batch.put(&key.into_bytes(), b"")); + + Ok(()) + } + + pub fn delete_existing_doc(&mut self, + docid: &str, + seq: u64, + existing: BTreeMap>, + batch: &mut rocksdb::WriteBatch) + -> Result<(), Error> { + self.doc_id = Some(docid.to_string()); + for (key, value) in existing.into_iter() { + self.kb.clear(); + self.kb + .parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + match value[0] as char { + 's' => { + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); + } + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); + } + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + true)); + } + _ => {} + } + try!(batch.delete(&key.as_bytes())); + } + let key = KeyBuilder::id_to_seq_key(self.doc_id.as_ref().unwrap()); + try!(batch.delete(&key.into_bytes())); + + let key = KeyBuilder::seq_key(seq); + try!(batch.delete(&key.into_bytes())); + Ok(()) + } + pub fn merge_existing_doc(&mut self, existing: BTreeMap>) { + // we found doc with the same id already stored on disk. We need to delete + // the doc. 
But any fields that are the same we can just keep around + // and don't even need to reindex. + for (existing_key, existing_value) in existing { + let matches = { + let key = KeyBuilder::value_key_path_only_from_str(&existing_key); + if let Some(new_value) = self.shredded_key_values.get(key) { + *new_value == existing_value + } else { + false + } + }; + if matches { + // we don't need to write or index these values, they already exist! + let key = KeyBuilder::value_key_path_only_from_str(&existing_key); + self.shredded_key_values.remove(key).unwrap(); + } else { + // we need to delete these keys and the index keys assocaited with the valuess + self.existing_key_value_to_delete + .insert(existing_key, existing_value); + } + } + } + + pub fn add_id(&mut self, id: &str) -> Result<(), Error> { + self.doc_id = Some(id.to_string()); + self.kb.clear(); + self.kb.push_object_key("_id"); + try!(self.add_value('s', &id.as_bytes())); + Ok(()) + } + + pub fn shred(&mut self, json: &str) -> Result, Error> { + let mut parser = Parser::new(json.chars()); loop { // Get the next token, so that in case of an `ObjectStart` the key is already // on the stack. - let nexttoken = parser.next(); - - match token.take() { + match parser.next().take() { Some(JsonEvent::ObjectStart) => { - match parser.stack().top() { - Some(StackElement::Key(key)) => { - println!("object start: {:?}", key); - self.keybuilder.push_object_key(key.to_string()); - self.inc_top_array_offset(); - }, - _ => { - panic!("XXX This is probably an object end"); - } - } - }, + try!(self.maybe_push_key(parser.stack().top())); + // Just push something to make `ObjectEnd` happy + self.kb.push_object_key(""); + self.object_keys_indexed.push(false); + } Some(JsonEvent::ObjectEnd) => { - self.keybuilder.pop_object_key(); - }, + self.kb.pop_object_key(); + if self.kb.keypath_segments_len() > 0 && + !self.object_keys_indexed.pop().unwrap() { + // this means we never wrote a key because the object was empty. + // So preserve the empty object by writing a special value. + // but not for the root object. it will always have _id field added. + try!(self.maybe_add_value(&parser, 'o', &[])); + } + self.kb.inc_top_array_offset(); + } Some(JsonEvent::ArrayStart) => { - println!("array start"); - self.keybuilder.push_array(); - //self.inc_top_array_offset(); - self.path_array_offsets.push(0); - }, + try!(self.maybe_push_key(parser.stack().top())); + self.kb.push_array(); + } Some(JsonEvent::ArrayEnd) => { - self.path_array_offsets.pop(); - self.keybuilder.pop_array(); - }, + if self.kb.peek_array_offset() == 0 { + // this means we never wrote a value because the object was empty. + // So preserve the empty array by writing a special value. + self.kb.pop_array(); + try!(self.maybe_add_value(&parser, 'a', &[])); + } else { + self.kb.pop_array(); + } + self.kb.inc_top_array_offset(); + } Some(JsonEvent::StringValue(value)) => { - self.add_entries(value, docseq); - self.inc_top_array_offset(); - //self.keybuilder.pop_object_key(); - }, - not_implemented => { - panic!("Not yet implemented other JSON types! 
{:?}", not_implemented); + try!(self.maybe_add_value(&parser, 's', &value.as_bytes())); + } + Some(JsonEvent::BooleanValue(tf)) => { + let code = if tf { 'T' } else { 'F' }; + try!(self.maybe_add_value(&parser, code, &[])); + } + Some(JsonEvent::I64Value(i)) => { + let f = i as f64; + let bytes = unsafe { transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); + } + Some(JsonEvent::U64Value(u)) => { + let f = u as f64; + let bytes = unsafe { transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); + } + Some(JsonEvent::F64Value(f)) => { + let bytes = unsafe { transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); + } + Some(JsonEvent::NullValue) => { + try!(self.maybe_add_value(&parser, 'N', &[])); + } + Some(JsonEvent::Error(error)) => { + return Err(Error::Shred(error.to_string())); + } + None => { + break; } }; - - token = nexttoken; - if token == None { - break; - } } - println!("keybuilder: {}", self.keybuilder.key()); - println!("shredder: keys:"); - for key in self.map.keys() { - println!(" {}", key); - } - Ok(&"thedocid") + Ok(self.doc_id.clone()) } +} - pub fn add_to_batch(&self, batch: &rocksdb::WriteBatch) -> Result<(), String> { - for (key_path, word_path_infos) in &self.map { - let mut message = ::capnp::message::Builder::new_default(); - { - let capn_payload = message.init_root::(); - let mut capn_arrayoffsets_to_wordinfo = capn_payload.init_arrayoffsets_to_wordinfos( - word_path_infos.len() as u32); - for (infos_pos, (arrayoffsets, wordinfos)) in word_path_infos.iter().enumerate() { - - let mut capn_a2w = capn_arrayoffsets_to_wordinfo.borrow().get(infos_pos as u32); - { - let mut capn_arrayoffsets = - capn_a2w.borrow().init_arrayoffsets(arrayoffsets.len() as u32); - for (pos, arrayoffset) in arrayoffsets.iter().enumerate() { - capn_arrayoffsets.set(pos as u32, arrayoffset.clone()); - } - } - { - let mut capn_wordinfos = capn_a2w.init_wordinfos(wordinfos.len() as u32); - for (pos, wordinfo) in wordinfos.iter().enumerate() { - let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); - capn_wordinfo.set_stemmed_offset(wordinfo.stemmed_offset); - capn_wordinfo.set_suffix_text(&wordinfo.suffix_text); - capn_wordinfo.set_suffix_offset(wordinfo.suffix_offset); - } - } +#[cfg(test)] +mod tests { + extern crate rocksdb; + extern crate varint; + + use self::varint::VarintRead; + + use std::io::Cursor; + use std::str; + + use index::{Index, OpenOptions}; + use json_value::JsonValue; + use snapshot::JsonFetcher; + + fn positions_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { + let mut result = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'W' { + let mut vec = Vec::with_capacity(value.len()); + vec.extend(value.into_iter()); + let mut bytes = Cursor::new(vec); + let mut positions = Vec::new(); + while let Ok(pos) = bytes.read_unsigned_varint_32() { + positions.push(pos); } + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + result.push((key_string, positions)); } - let mut bytes = Vec::new(); - ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); - try!(batch.put(&key_path.clone().into_bytes(), &bytes)); } - Ok(()) + result } -} + fn values_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, JsonValue)> { + let mut result = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + 
result.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); + } + } + result + } -#[cfg(test)] -mod tests { - use super::{ArrayOffsetsToWordInfo, WordInfo, WordPathInfoMap}; #[test] fn test_shred_nested() { let mut shredder = super::Shredder::new(); - //let json = r#"{"hello": {"my": "world!"}, "anumber": 2}"#; - //let json = r#"{"A":[{"B":"B2VMX two three","C":"C2"},{"B": "b1","C":"C2"}]}"#; - //let json = r#"{"A":[[[{"B": "string within deeply nested array should be stemmed"}]]]}"#; - //let json = r#"[{"A": 1, "B": 2, "C": 3}]"#; - //let json = r#"{"foo": {"bar": 1}}"#; let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; let docseq = 123; - shredder.shred(json, docseq).unwrap(); - let expected = vec![ - ("W.some$!array#123", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), - ("W.some$!data#123", vec![ - (vec![1], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!also#123", vec![ - (vec![2, 0], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!nest#123", vec![ - (vec![2, 1], vec![WordInfo { - stemmed_offset: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }])]), - ]; - compare_shredded(&shredder.map, &expected); + let mut batch = rocksdb::WriteBatch::default(); + shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); + + let dbname = "target/tests/test_shred_nested"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + + rocks.write(batch).unwrap(); + let result = positions_from_rocks(&rocks); + + let expected = vec![("W._id!foo#123,".to_string(), vec![0]), + ("W.some$!array#123,0".to_string(), vec![0]), + ("W.some$!data#123,1".to_string(), vec![0]), + ("W.some$$!also#123,2,0".to_string(), vec![0]), + ("W.some$$!nest#123,2,1".to_string(), vec![0])]; + assert_eq!(result, expected); } #[test] + fn test_shred_double_nested() { + let mut shredder = super::Shredder::new(); + let json = r#"{"a":{"a":"b"}}"#; + let docseq = 123; + let mut batch = rocksdb::WriteBatch::default(); + shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); + + let dbname = "target/tests/test_shred_double_nested"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + + rocks.write(batch).unwrap(); + let result = values_from_rocks(&rocks); + + let expected = vec![("V123#._id".to_string(), JsonValue::String("foo".to_string())), + ("V123#.a.a".to_string(), JsonValue::String("b".to_string()))]; + assert_eq!(result, expected); + } + + + #[test] + // NOTE vmx 2016-12-06: This test is intentionally made to fail (hence ignored) as the current + // current tokenizer does the wrong thing when it comes to numbers within words. 
It's left + // here as a reminder to fix that + #[ignore] fn test_shred_objects() { let mut shredder = super::Shredder::new(); let json = r#"{"A":[{"B":"B2VMX two three","C":"..C2"},{"B": "b1","C":"..C2"}]}"#; let docseq = 1234; - shredder.shred(json, docseq).unwrap(); - let expected = vec![ - ("W.A$.B!b1#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }])]), - ("W.A$.B!b2vmx#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), - ("W.A$.B!c2#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }, - WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.A$.B!three#1234", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }])]), - ("W.A$.B!two#1234", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 6, suffix_text: "".to_string(), suffix_offset: 9 }])]), - ]; - compare_shredded(&shredder.map, &expected); + let mut batch = rocksdb::WriteBatch::default(); + shredder.shred(json).unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); + + let dbname = "target/tests/test_shred_objects"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + + rocks.write(batch).unwrap(); + let result = positions_from_rocks(&rocks); + let expected = vec![("W.A$.B!b1#1234,1".to_string(), vec![0]), + ("W.A$.B!b2vmx#1234,0".to_string(), vec![0]), + ("W.A$.B!three#1234,0".to_string(), vec![10]), + ("W.A$.B!two#1234,0".to_string(), vec![6]), + ("W.A$.C!..#1234,0".to_string(), vec![0]), + ("W.A$.C!..#1234,1".to_string(), vec![0]), + ("W.A$.C!c2#1234,0".to_string(), vec![2]), + ("W.A$.C!c2#1234,1".to_string(), vec![2])]; + assert_eq!(result, expected); } - fn compare_shredded(result_map: &WordPathInfoMap, - expected: &Vec<(&str, Vec<(Vec, Vec)>)>) { - // HashMap have an arbitraty order of the elements - let mut result: Vec<(&String, &ArrayOffsetsToWordInfo)> = result_map.into_iter().collect(); - result.sort_by(|a, b| Ord::cmp(&a.0, &b.0)); - for (ii, &(key, values)) in result.iter().enumerate() { - assert_eq!(key, expected[ii].0); - let mut wordinfos: Vec<(&Vec, &Vec)> = values.iter().collect(); - wordinfos.sort_by_key(|item| item.0); - for (jj, wordinfo) in wordinfos.iter().enumerate() { - assert_eq!(wordinfo.0, &expected[ii].1[jj].0); - assert_eq!(wordinfo.1, &expected[ii].1[jj].1); - } - } + #[test] + fn test_shred_empty_object() { + let mut shredder = super::Shredder::new(); + let json = r#"{}"#; + let docseq = 123; + let mut batch = rocksdb::WriteBatch::default(); + shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); + + let dbname = "target/tests/test_shred_empty_object"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let rocks = &index.rocks.unwrap(); + + rocks.write(batch).unwrap(); + let result = positions_from_rocks(&rocks); + let expected = vec![("W._id!foo#123,".to_string(), vec![0])]; + assert_eq!(result, expected); } } diff --git a/src/json_value.rs b/src/json_value.rs new file mode 100644 index 0000000..5e4a225 --- /dev/null +++ b/src/json_value.rs @@ -0,0 +1,271 @@ + +use std::str; +use std::cmp::Ordering; +use std::io::Write; + +use error::Error; + + + 
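The test expectations above spell out the key layout the shredder writes. As a rough sketch of that layout (assuming the same document and seq as `test_shred_nested`; the `V123#.some$0` entry is inferred from `value_key()` in key_builder.rs rather than asserted by a test here):

```rust
// Sketch only: keys one would expect for {"_id": "foo", "some": ["array", "data"]}
// stored at seq 123, based on the test expectations above.
fn _example_keys() -> Vec<&'static str> {
    vec![
        "W._id!foo#123,",      // 'W' word keyspace: keypath, '!' word, '#' seq, ',' arraypath
        "W.some$!array#123,0", // '$' marks an array level; ",0" is the array offset
        "W.some$!data#123,1",
        "V123#._id",           // 'V' value keyspace: seq, '#', keypath with array offsets
        "V123#.some$0",        // inferred from value_key(), not covered by a test here
        "Ifoo",                // 'I' maps the document id to its seq
        "S123",                // 'S' marks the seq as present
    ]
}
```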
+#[derive(PartialEq, PartialOrd, Clone, Debug)] +pub enum JsonValue { + Number(f64), + String(String), + Array(Vec), + Object(Vec<(String, JsonValue)>), + True, + False, + Null, +} + +impl JsonValue { + pub fn str_to_literal(string: &str) -> String { + let mut ret = String::with_capacity(string.len() * 2 + 2); + ret.push('"'); + for c in string.chars() { + if c == '"' || c == '\\' { + ret.push('\\'); + } + ret.push(c); + } + ret.push('"'); + ret + } + + fn cmp_always_equal(_a: &JsonValue, _b: &JsonValue) -> Ordering { + Ordering::Equal + } + + fn cmp_f64(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Number(a_val) = a { + if let &JsonValue::Number(b_val) = b { + if a_val < b_val { + Ordering::Less + } else if a_val > b_val { + Ordering::Greater + } else { + Ordering::Equal + } + } else { + panic!("cast error in cmp_f64"); + } + } else { + panic!("cast error in cmp_f64"); + } + } + + fn cmp_string(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::String(ref a_val) = a { + if let &JsonValue::String(ref b_val) = b { + // Note we eventually want to switch to a collation library like ICU + a_val.cmp(&b_val) + } else { + panic!("cast error in cmp_string"); + } + } else { + panic!("cast error in cmp_string"); + } + } + + fn cmp_array(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Array(ref a_val) = a { + if let &JsonValue::Array(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + let order = a_el.cmp(&b_el); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. But one array might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_array"); + } + } else { + panic!("cast error in cmp_array"); + } + } + + fn cmp_object(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Object(ref a_val) = a { + if let &JsonValue::Object(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + // compare key + let mut order = a_el.0.cmp(&b_el.0); + if order != Ordering::Equal { + return order; + } + // compare value + order = a_el.1.cmp(&b_el.1); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. 
But one object might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_object"); + } + } else { + panic!("cast error in cmp_object"); + } + } + + fn type_sort_order(&self) -> (usize, fn(&JsonValue, &JsonValue) -> Ordering) { + match self { + &JsonValue::Null => (0, JsonValue::cmp_always_equal), + &JsonValue::False => (1, JsonValue::cmp_always_equal), + &JsonValue::True => (2, JsonValue::cmp_always_equal), + &JsonValue::Number(_) => (3, JsonValue::cmp_f64), + &JsonValue::String(_) => (4, JsonValue::cmp_string), + &JsonValue::Array(_) => (5, JsonValue::cmp_array), + &JsonValue::Object(_) => (6, JsonValue::cmp_object), + } + } + + pub fn render(&self, write: &mut Write, pretty: &mut PrettyPrint) -> Result<(), Error> { + match self { + &JsonValue::Number(ref num) => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all(num.to_string().as_bytes())); + } + &JsonValue::String(ref string) => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) + } + &JsonValue::Array(ref array) => { + if array.is_empty() { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("[]".as_bytes())); + return Ok(()); + } + try!(write.write_all(pretty.prefix())); + try!(write.write_all("[".as_bytes())); + try!(write.write_all(pretty.newline())); + pretty.push(); + + let mut iter = array.iter().peekable(); + loop { + match iter.next() { + Some(json) => try!(json.render(write, pretty)), + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + try!(write.write_all(pretty.newline())); + } + pretty.pop(); + try!(write.write_all(pretty.prefix())); + try!(write.write_all("]".as_bytes())); + } + &JsonValue::Object(ref object) => { + if object.is_empty() { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("{}".as_bytes())); + return Ok(()); + } + try!(write.write_all(pretty.prefix())); + try!(write.write_all("{".as_bytes())); + try!(write.write_all(pretty.newline())); + pretty.push(); + + let mut iter = object.iter().peekable(); + loop { + match iter.next() { + Some(&(ref key, ref json)) => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all(JsonValue::str_to_literal(&key).as_bytes())); + try!(write.write_all(":".as_bytes())); + pretty.next_prefix_is_space(); + try!(json.render(write, pretty)); + } + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + try!(write.write_all(pretty.newline())); + } + pretty.pop(); + try!(write.write_all(pretty.prefix())); + try!(write.write_all("}".as_bytes())); + } + &JsonValue::True => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("true".as_bytes())); + } + &JsonValue::False => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("false".as_bytes())); + } + &JsonValue::Null => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("null".as_bytes())) + } + } + Ok(()) + } +} + +impl Eq for JsonValue {} + +impl Ord for JsonValue { + fn cmp(&self, other: &JsonValue) -> Ordering { + let (self_order_num, self_cmp_fun) = self.type_sort_order(); + let (other_order_num, _other_cmp_fun) = other.type_sort_order(); + match self_order_num.cmp(&other_order_num) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self_cmp_fun(self, other), + } + } +} + +pub struct PrettyPrint { + indention: String, + newline: String, + spacing: String, + buffer: String, + 
next_prefix_is_space: bool, +} + +impl PrettyPrint { + pub fn new(indention: &str, newline: &str, spacing: &str) -> PrettyPrint { + PrettyPrint { + indention: indention.to_string(), + newline: newline.to_string(), + spacing: spacing.to_string(), + buffer: String::new(), + next_prefix_is_space: false, + } + } + + pub fn push(&mut self) { + self.buffer += &self.indention; + } + + pub fn pop(&mut self) { + let len = self.buffer.len() - self.indention.len(); + self.buffer.truncate(len); + } + + pub fn next_prefix_is_space(&mut self) { + self.next_prefix_is_space = true; + } + + pub fn prefix(&mut self) -> &[u8] { + if self.next_prefix_is_space { + self.next_prefix_is_space = false; + self.spacing.as_bytes() + } else { + self.buffer.as_bytes() + } + } + + pub fn newline(&mut self) -> &[u8] { + self.newline.as_bytes() + } +} diff --git a/src/key_builder.rs b/src/key_builder.rs index e3bb184..b5fa3ee 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -1,229 +1,525 @@ -//#[derive(PartialEq, Eq)] -#[derive(Debug, Clone, PartialEq)] -pub enum SegmentType { - // BuildState is really simple state tracker to prevent misuse of api - ObjectKey, - Array, - Word, - DocSeq, -} +extern crate unicode_normalization; -#[derive(Debug, Clone)] -struct Segment { - type_: SegmentType, - offset: usize, +use query::DocResult; +use std::str; +use std::cmp::Ordering; + +pub enum Segment { + ObjectKey(String), + Array(u64), } #[derive(Debug, Clone)] pub struct KeyBuilder { - pub array_depth: usize, - segments: Vec, - fullkey: String, + keypath: Vec, + arraypath: Vec, } - impl KeyBuilder { pub fn new() -> KeyBuilder { - let mut kb = KeyBuilder{ - array_depth: 0, - // Magic reserve numbers that are completely arbitrary - segments: Vec::with_capacity(10), - fullkey: String::with_capacity(100), - }; - // First char is keyspace identifier. 
W means Word keyspace - kb.fullkey.push('W'); - return kb; - } - - // NOTE vmx 2016-10-28: This one is just a port of the C++ prototype, but not yet needed here - //fn segments_count(&self) -> usize { - // self.segments.len() - //} - - pub fn key(&self) -> String { - self.fullkey.clone() - } - - pub fn push_object_key(&mut self, key: String) { - debug_assert!(self.segments.len() == 0 || - self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::ObjectKey, offset: self.fullkey.len() }); - self.fullkey.push('.'); + KeyBuilder { + // Magic reserve number is completely arbitrary + keypath: Vec::with_capacity(10), + arraypath: Vec::with_capacity(10), + } + } + + pub fn clear(&mut self) { + self.keypath.clear(); + self.arraypath.clear(); + } + + pub fn get_keypathword_only(&self, word: &str) -> String { + let mut string = String::with_capacity(100); + string.push('W'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('!'); + string.push_str(word); + string.push('#'); + string + } + + pub fn keypathword_count_key(&self, word: &str) -> String { + let mut string = String::with_capacity(100); + string.push('C'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('!'); + string.push_str(word); + string + } + + pub fn keypath_count_key(&self) -> String { + let mut string = String::with_capacity(100); + string.push('K'); + for segment in &self.keypath { + string.push_str(&segment); + } + string + } + + pub fn id_to_seq_key(id: &str) -> String { + let mut str = String::with_capacity(id.len() + 1); + str.push('I'); + str.push_str(&id); + str + } + + pub fn seq_key(seq: u64) -> String { + let seq = seq.to_string(); + let mut str = String::with_capacity(seq.len() + 1); + str.push('S'); + str.push_str(&seq); + str + } + + pub fn parse_seq_key(key: &str) -> Option { + if key.starts_with("S") { + Some(key[1..].parse().unwrap()) + } else { + None + } + } + + /// Build the index key that corresponds to a number primitive + pub fn number_key(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push('f'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('#'); + string.push_str(&seq.to_string()); + + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + + /// Build the index key that corresponds to a true, false or nulla primitive + pub fn bool_null_key(&self, prefix: char, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push(prefix); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('#'); + string.push_str(&seq.to_string()); + + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + + + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath + /// built up internally. + pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { + let mut string = self.get_keypathword_only(&word); + string.push_str(seq.to_string().as_str()); + + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + + /// Builds a field length key for the seq, using the key_path and arraypath + /// built up internally. 
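As a usage sketch of how the pushed segments compose into these keys (mirroring `test_segments_pop` further down; note that `push_object_key` backslash-escapes the delimiter characters `\ $ . ! #`):

```rust
// Sketch only: composing a stemmed-word key from pushed path segments.
let mut kb = KeyBuilder::new();
kb.push_object_key("first");
kb.push_object_key("second");
kb.push_array();
// 'W' + ".first.second$" + "!astemmedword" + "#123" + ",0" (current array offset)
assert_eq!(kb.stemmed_word_key("astemmedword", 123),
           "W.first.second$!astemmedword#123,0");
```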
+ pub fn field_length_key(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push('L'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('#'); + string.push_str(seq.to_string().as_str()); + + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + + /// Builds a field length key for the DocResult, using the key_path + /// built up internally and the arraypath from the DocResult. + pub fn field_length_key_from_doc_result(&self, dr: &DocResult) -> String { + let mut string = String::with_capacity(100); + string.push('L'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('#'); + string.push_str(dr.seq.to_string().as_str()); + + KeyBuilder::add_arraypath(&mut string, &dr.arraypath); + string + } + + /// Adds DocResult seq and array path an already created keypathword. + pub fn add_doc_result_to_keypathword(keypathword: &mut String, dr: &DocResult) { + keypathword.push_str(dr.seq.to_string().as_str()); + KeyBuilder::add_arraypath(keypathword, &dr.arraypath); + } + + // NOTE vmx 2017-04-13: I find `keypathword` not really descriptive. I would call the + // path without the Internal Id simply "keypath" and the one with and Internal Id + // "keypath_iid". + /// Truncates key to keypath only + pub fn truncate_to_keypathword(stemmed_word_key: &mut String) { + let n = stemmed_word_key.rfind("#").unwrap(); + stemmed_word_key.truncate(n + 1); + } + + + /// Builds a value key for seq (value keys are the original json terminal value with + /// keyed on keypath and arraypath built up internally). + pub fn value_key(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push('V'); + string.push_str(&seq.to_string()); + string.push('#'); + let mut i = 0; + for segment in &self.keypath { + string.push_str(&segment); + if segment == "$" { + string.push_str(&self.arraypath[i].to_string()); + i += 1; + } + } + string + } + + /// Returns a value key without the doc seq prepended. + pub fn value_key_path_only(&self) -> String { + let mut string = String::with_capacity(100); + let mut i = 0; + for segment in &self.keypath { + string.push_str(&segment); + if segment == "$" { + string.push_str(&self.arraypath[i].to_string()); + i += 1; + } + } + string + } + + /// Returns a value key without the doc seq prepended. + pub fn value_key_path_only_from_str(str: &str) -> &str { + &str[str.find('#').unwrap() + 1..] 
+ } + + /// parses a value_key_path_only and sets the internally elements appropriately + pub fn parse_value_key_path_only(&mut self, mut str: &str) { + while let Some(tuple) = KeyBuilder::parse_first_key_value_segment(str) { + match tuple { + (Segment::ObjectKey(_key), unescaped) => { + str = &str[unescaped.len()..]; + self.keypath.push(unescaped); + } + (Segment::Array(i), unescaped) => { + str = &str[unescaped.len()..]; + self.keypath.push("$".to_string()); + self.arraypath.push(i); + } + } + } + } + + pub fn value_key_from_doc_result(&self, dr: &DocResult) -> String { + let mut string = String::with_capacity(100); + string.push('V'); + string.push_str(&dr.seq.to_string()); + string.push('#'); + let mut i = 0; + for segment in &self.keypath { + string.push_str(&segment); + if segment == "$" { + string.push_str(&dr.arraypath[i].to_string()); + i += 1; + } + } + string + } + + fn add_arraypath(string: &mut String, arraypath: &Vec) { + if arraypath.is_empty() { + string.push(','); + } else { + for i in arraypath { + string.push(','); + string.push_str(i.to_string().as_str()); + } + } + } + + // Returns true if the prefix str is a prefix of the true keypath + pub fn is_keypath_prefix(prefix: &str, keypath: &str) -> bool { + if keypath.starts_with(prefix) { + match keypath[prefix.len()..].chars().next() { + Some('.') => true, + Some('$') => true, + Some(_) => false, + None => true, + } + } else { + false + } + } + + // returns the unescaped segment as Segment and the escaped segment as a slice + pub fn parse_first_key_value_segment(keypath: &str) -> Option<(Segment, String)> { + + let mut unescaped = String::with_capacity(50); + let mut len_bytes = 1; + let mut chars = keypath.chars(); + + // first char must be a . or a $ or we've exceeded the keypath + match chars.next() { + Some('.') => { + loop { + match chars.next() { + Some('\\') => { + if let Some(c) = chars.next() { + len_bytes += c.len_utf8(); + unescaped.push(c); + } else { + panic!("Escape char found as last char in keypath"); + } + } + Some('.') | Some('$') => { + break; + } + Some(c) => { + len_bytes += c.len_utf8(); + unescaped.push(c); + } + None => { + break; + } + } + } + Some((Segment::ObjectKey(unescaped), keypath[..len_bytes].to_string())) + } + Some('$') => { + let mut i = String::new(); + for c in chars { + if c >= '0' && c <= '9' { + i.push(c); + } else { + break; + } + } + Some((Segment::Array(i.parse().unwrap()), keypath[..1 + i.len()].to_string())) + } + Some(_) => None, // we must be past the keypath portion of string. done. 
+ None => None, + } + } + + pub fn push_object_key(&mut self, key: &str) { + let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion + escaped_key.push('.'); + for cc in key.chars() { // Escape chars that conflict with delimiters if "\\$.!#".contains(cc) { - self.fullkey.push('\\'); + escaped_key.push('\\'); } - self.fullkey.push(cc); + escaped_key.push(cc); } + self.keypath.push(escaped_key); } pub fn push_array(&mut self) { - debug_assert!(self.segments.len() == 0 || - self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::Array, offset: self.fullkey.len() }); - self.fullkey.push('$'); - self.array_depth += 1; + self.keypath.push("$".to_string()); + self.arraypath.push(0); } - pub fn push_word(&mut self, stemmed_word: &str) { - debug_assert!(self.segments.len() > 0); - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::Word, offset: self.fullkey.len() }); - self.fullkey.push('!'); - self.fullkey += stemmed_word; - self.fullkey.push('#'); + pub fn push_array_index(&mut self, index: u64) { + self.keypath.push("$".to_string()); + self.arraypath.push(index); } - pub fn push_doc_seq(&mut self, seq: u64) { - debug_assert!(self.segments.len() > 0); - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Word); - self.segments.push(Segment{ type_: SegmentType::DocSeq, offset: self.fullkey.len() }); - self.fullkey.push_str(seq.to_string().as_str()); + pub fn pop_object_key(&mut self) { + debug_assert!(self.keypath.last().unwrap().starts_with(".")); + self.keypath.pop(); } - pub fn pop_object_key(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ObjectKey); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn peek_array_offset(&self) -> u64 { + debug_assert!(self.keypath.last().unwrap().starts_with("$")); + self.arraypath.last().unwrap().clone() } pub fn pop_array(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Array); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.array_depth -= 1; - self.segments.pop(); + debug_assert!(self.keypath.last().unwrap() == "$"); + self.arraypath.pop(); + self.keypath.pop(); } - pub fn pop_word(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Word); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn inc_top_array_offset(&mut self) { + if self.keypath.len() > 0 && self.keypath.last().unwrap() == "$" { + *self.arraypath.last_mut().unwrap() += 1; + } } - pub fn pop_doc_seq(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::DocSeq); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn arraypath_len(&self) -> usize { + self.arraypath.len() } - pub fn last_pushed_segment_type(&self) -> Option { - self.segments.last().and_then(|segment| Some(segment.type_.clone())) + pub fn keypath_segments_len(&self) -> usize { + self.keypath.len() + } + + /// splits key into key path, seq and array path + /// ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.bar!word", "123", "0,0") + fn split_keypath_seq_arraypath_from_key(str: &str) -> (&str, &str, &str) { + let n = str.rfind("#").unwrap(); + assert!(n != 0); + assert!(n 
!= str.len() - 1); + let seq_arraypath_str = &str[(n + 1)..]; + let m = seq_arraypath_str.find(",").unwrap(); + + (&str[..n], &seq_arraypath_str[..m], &seq_arraypath_str[m + 1..]) + } + + /// parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result + pub fn parse_doc_result_from_key(str: &str) -> DocResult { + let mut dr = DocResult::new(); + let (_path_str, seq_str, arraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&str); + dr.seq = seq_str.parse().unwrap(); + if !arraypath_str.is_empty() { + for numstr in arraypath_str.split(",") { + dr.arraypath.push(numstr.parse().unwrap()); + } + } + dr + } + + pub fn compare_keys(akey: &str, bkey: &str) -> Ordering { + debug_assert!(akey.starts_with('W') || akey.starts_with('f') || akey.starts_with('T') || + akey.starts_with('F') || + akey.starts_with('N')); + debug_assert!(bkey.starts_with('W') || bkey.starts_with('f') || bkey.starts_with('T') || + bkey.starts_with('F') || + bkey.starts_with('N')); + let (apath_str, aseq_str, aarraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); + let (bpath_str, bseq_str, barraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); + + match apath_str[0..].cmp(&bpath_str[0..]) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => { + let aseq: u64 = aseq_str.parse().unwrap(); + let bseq: u64 = bseq_str.parse().unwrap(); + if aseq < bseq { + Ordering::Less + } else if aseq > bseq { + Ordering::Greater + } else { + if aarraypath_str.is_empty() || barraypath_str.is_empty() { + aarraypath_str.len().cmp(&barraypath_str.len()) + } else { + let mut a_nums = aarraypath_str.split(","); + let mut b_nums = barraypath_str.split(","); + loop { + if let Some(ref a_num_str) = a_nums.next() { + if let Some(ref b_num_str) = b_nums.next() { + let a_num: u64 = a_num_str.parse().unwrap(); + let b_num: u64 = b_num_str.parse().unwrap(); + match a_num.cmp(&b_num) { + Ordering::Less => return Ordering::Less, + Ordering::Greater => return Ordering::Greater, + Ordering::Equal => (), + } + } else { + //b is shorter than a, so greater + return Ordering::Greater; + } + } else { + if b_nums.next().is_some() { + //a is shorter than b so less + return Ordering::Less; + } else { + // same length and must have hit all equal before this, + // so equal + return Ordering::Equal; + } + } + } + } + } + } + } } } #[cfg(test)] mod tests { - use super::{KeyBuilder, SegmentType}; + use super::KeyBuilder; + use query::DocResult; - #[test] - fn test_new_key_builder() { - let kb = KeyBuilder::new(); - assert_eq!(kb.key(), "W", "Initial value is set"); - } #[test] fn test_segments_push() { let mut kb = KeyBuilder::new(); - assert_eq!(kb.segments.len(), 0, "No segments so far"); - assert_eq!(kb.key(), "W", "Key for segments is correct"); + assert_eq!(kb.keypath_segments_len(), 0, "No segments so far"); - kb.push_object_key("first".to_string()); - assert_eq!(kb.segments.len(), 1, "One segment"); - assert_eq!(kb.key(), "W.first", "Key for one segments is correct"); + kb.push_object_key("first"); + assert_eq!(kb.keypath_segments_len(), 1, "One segment"); - kb.push_object_key("second".to_string()); - assert_eq!(kb.segments.len(), 2, "Two segments"); - assert_eq!(kb.key(), "W.first.second", "Key for two segments is correct"); + kb.push_object_key("second"); + assert_eq!(kb.keypath_segments_len(), 2, "Two segments"); kb.push_array(); - assert_eq!(kb.segments.len(), 3, "Three segments "); - assert_eq!(kb.key(), "W.first.second$", "Key for three 
segments is correct"); - - kb.push_word("astemmedword".to_string()); - assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#", "Key for four segments is correct"); - - kb.push_doc_seq(123); - assert_eq!(kb.segments.len(), 5, "Five segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#123", - "Key for five segments is correct"); - } - - #[test] - #[should_panic(expected = "assertion failed: self.segments.len() > 0")] - fn test_segments_push_doc_seq_panic() { - let mut kb = KeyBuilder::new(); - kb.push_doc_seq(456); - } - - #[test] - #[should_panic(expected = "assertion failed: self.segments.len() > 0")] - fn test_segments_push_word_panic() { - let mut kb = KeyBuilder::new(); - kb.push_word("astemmedword".to_string()); + assert_eq!(kb.keypath_segments_len(), 3, "Three segments "); } #[test] fn test_segments_pop() { let mut kb = KeyBuilder::new(); - kb.push_object_key("first".to_string()); - kb.push_object_key("second".to_string()); + kb.push_object_key("first"); + kb.push_object_key("second"); kb.push_array(); - kb.push_word("astemmedword".to_string()); - kb.push_doc_seq(123); - assert_eq!(kb.segments.len(), 5, "Five segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#123", - "Key for five segments is correct"); - kb.pop_doc_seq(); - assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#", "Key for four segments is correct"); + assert_eq!(kb.keypath_segments_len(), 3, "three segments"); + assert_eq!(kb.stemmed_word_key("astemmedword", 123), + "W.first.second$!astemmedword#123,0", + "Key for six segments is correct"); - kb.pop_word(); - assert_eq!(kb.segments.len(), 3, "Three segments "); - assert_eq!(kb.key(), "W.first.second$", "Key for three segments is correct"); kb.pop_array(); - assert_eq!(kb.segments.len(), 2, "Two segments"); - assert_eq!(kb.key(), "W.first.second", "Key for two segments is correct"); + assert_eq!(kb.keypath_segments_len(), 2, "Two segments"); kb.pop_object_key(); - assert_eq!(kb.segments.len(), 1, "One segment"); - assert_eq!(kb.key(), "W.first", "Key for one segments is correct"); + assert_eq!(kb.keypath_segments_len(), 1, "One segment"); kb.pop_object_key(); - assert_eq!(kb.segments.len(), 0, "No segments so far"); - assert_eq!(kb.key(), "W", "Key for segments is correct"); + assert_eq!(kb.keypath_segments_len(), 0, "No segments so far"); } #[test] - fn test_last_pushed_segment_type() { - let mut kb = KeyBuilder::new(); - assert_eq!(kb.last_pushed_segment_type(), None, "No segments"); - - kb.push_object_key("first".to_string()); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::ObjectKey), - "Last segment is an object key"); - - kb.push_object_key("second".to_string()); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::ObjectKey), - "Last segment is an object key"); - - kb.push_array(); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Array), - "Last segment is an array"); - - kb.push_word("astemmedword".to_string()); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Word), - "Last segment is a word"); - - kb.push_doc_seq(123); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::DocSeq), - "Last segment is a doc sequence"); + fn test_doc_result_parse() { + let key = "W.foo$.bar$!word#123,1,0".to_string(); + let (keypathstr, seqstr, arraypathstr) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&key); + assert_eq!(keypathstr, "W.foo$.bar$!word"); + assert_eq!(seqstr, "123"); + 
assert_eq!(arraypathstr, "1,0"); + + // make sure escaped commas and # in key path don't cause problems + let key1 = "W.foo\\#$.bar\\,$!word#123,2,0".to_string(); + let (keypathstr1, seqstr1, arraypathstr1) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&key1); + assert_eq!(keypathstr1, "W.foo\\#$.bar\\,$!word"); + assert_eq!(seqstr1, "123"); + assert_eq!(arraypathstr1, "2,0"); + + let mut dr = DocResult::new(); + dr.seq = 123; + dr.arraypath = vec![1, 0]; + + assert!(dr == KeyBuilder::parse_doc_result_from_key(&key)); } } diff --git a/src/lib.rs b/src/lib.rs index 4adf6aa..f477d26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,15 @@ -extern crate capnp; extern crate rocksdb; +mod aggregates; +mod error; +mod filters; mod json_shred; mod key_builder; +mod parser; +mod snapshot; mod stems; +mod returnable; +pub mod repl; +pub mod json_value; pub mod index; -mod query; - -// include capnp code generated by `build.rs` -mod records_capnp { - #![allow(dead_code)] - include!(concat!(env!("OUT_DIR"), "/records_capnp.rs")); -} +pub mod query; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..56bb617 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,18 @@ +extern crate noise_search; + +use noise_search::repl::repl; + +use std::env; +use std::io::{self, BufReader}; + +fn main() { + let mut test_mode = false; + for argument in env::args() { + if argument == "-t" { + test_mode = true; + } + } + repl(&mut BufReader::new(io::stdin()), + &mut io::stdout(), + test_mode); +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..3f894e6 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,1316 @@ + +use std; +use std::str; +use std::collections::HashMap; +use std::iter::Iterator; +use std::usize; + +use error::Error; +use key_builder::KeyBuilder; +use stems::Stems; +use json_value::JsonValue; +use query::{Order, OrderInfo, OrderField}; +use aggregates::AggregateFun; +use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, + ReturnPath}; +use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, + NotFilter, RangeFilter, RangeOperator, AllDocsFilter}; +use snapshot::Snapshot; + + +pub struct Parser<'a, 'c> { + query: &'c str, + offset: usize, + kb: KeyBuilder, + pub snapshot: Snapshot<'a>, + pub needs_scoring: bool, +} + +impl<'a, 'c> Parser<'a, 'c> { + pub fn new(query: &'c str, snapshot: Snapshot<'a>) -> Parser<'a, 'c> { + Parser { + query: query, + offset: 0, + kb: KeyBuilder::new(), + snapshot: snapshot, + needs_scoring: false, + } + } + + fn ws(&mut self) { + for char in self.query[self.offset..].chars() { + if !char.is_whitespace() { + break; + } + self.offset += char.len_utf8(); + } + } + + fn consume(&mut self, token: &str) -> bool { + if self.could_consume(token) { + self.offset += token.len(); + self.ws(); + true + } else { + false + } + } + + fn consume_no_ws(&mut self, token: &str) -> bool { + if self.could_consume(token) { + self.offset += token.len(); + true + } else { + false + } + } + + + fn must_consume(&mut self, token: &str) -> Result<(), Error> { + if self.could_consume(token) { + self.offset += token.len(); + self.ws(); + Ok(()) + } else { + if self.offset == self.query.len() { + Err(Error::Parse(format!("Expected '{}' at character {} but query string ended.", + token, + self.offset))) + } else { + Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", + token, + self.offset, 
+ &self.query[self.offset..self.offset + 1]))) + } + } + } + + fn could_consume(&self, token: &str) -> bool { + self.query[self.offset..].starts_with(token) + } + + fn consume_key(&mut self) -> Result, Error> { + if let Some(key) = self.consume_field() { + Ok(Some(key)) + } else if let Some(key) = try!(self.consume_string_literal()) { + Ok(Some(key)) + } else { + Ok(None) + } + } + + fn consume_field(&mut self) -> Option { + let mut result = String::new(); + { + let mut chars = self.query[self.offset..].chars(); + if let Some(c) = chars.next() { + // first char cannot be numeric + if c.is_alphabetic() || '_' == c || '$' == c { + result.push(c); + for c in chars { + if c.is_alphanumeric() || '_' == c || '$' == c { + result.push(c); + } else { + break; + } + } + } + } + } + if result.len() > 0 { + self.offset += result.len(); + self.ws(); + Some(result) + } else { + None + } + } + + fn consume_integer(&mut self) -> Result, Error> { + let mut result = String::new(); + for char in self.query[self.offset..].chars() { + if char >= '0' && char <= '9' { + result.push(char); + } else { + break; + } + } + if !result.is_empty() { + self.offset += result.len(); + self.ws(); + Ok(Some(try!(result.parse()))) + } else { + Ok(None) + } + } + + fn consume_default(&mut self) -> Result, Error> { + if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + Ok(Some(json)) + } else { + Err(Error::Parse("Expected json value for default".to_string())) + } + } else { + Ok(None) + } + } + + fn consume_aggregate(&mut self) + -> Result, /*optional bind var name*/ + ReturnPath, + JsonValue)>, + Error> { + let offset = self.offset; + let mut aggregate_fun = if self.consume("group") { + AggregateFun::GroupAsc + } else if self.consume("sum") { + AggregateFun::Sum + } else if self.consume("max_array") { + AggregateFun::MaxArray + } else if self.consume("max") { + AggregateFun::Max + } else if self.consume("min_array") { + AggregateFun::MinArray + } else if self.consume("min") { + AggregateFun::Min + } else if self.consume("array_flat") { + AggregateFun::ArrayFlat + } else if self.consume("array") { + AggregateFun::Array + } else if self.consume("concat") { + AggregateFun::Concat + } else if self.consume("avg") { + AggregateFun::Avg + } else if self.consume("count") { + AggregateFun::Count + } else { + return Ok(None); + }; + + if self.consume("(") { + if aggregate_fun == AggregateFun::Count { + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, None, ReturnPath::new(), JsonValue::Null))) + } else if aggregate_fun == AggregateFun::Concat { + let bind_name_option = self.consume_field(); + + if let Some(rp) = try!(self.consume_keypath()) { + let json = if self.consume("sep") { + try!(self.must_consume("=")); + JsonValue::String(try!(self.must_consume_string_literal())) + } else { + JsonValue::String(",".to_string()) + }; + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, bind_name_option, rp, json))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } + } else { + let bind_name_option = self.consume_field(); + + if let Some(rp) = try!(self.consume_keypath()) { + if self.consume("order") { + try!(self.must_consume("=")); + if self.consume("asc") { + aggregate_fun = AggregateFun::GroupAsc; + } else if self.consume("desc") { + aggregate_fun = AggregateFun::GroupDesc; + } else { + return Err(Error::Parse("Expected asc or desc".to_string())); + } + } + try!(self.must_consume(")")); + + Ok(Some((aggregate_fun, bind_name_option, rp, 
JsonValue::Null))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } + } + } else { + // this consumed word above might be a Bind var. Unconsume and return nothing. + self.offset = offset; + Ok(None) + } + } + + fn consume_keypath(&mut self) -> Result, Error> { + let key: String = if self.consume_no_ws(".") { + if self.consume("[") { + let key = try!(self.must_consume_string_literal()); + try!(self.must_consume("]")); + key + } else { + if let Some(key) = self.consume_field() { + key + } else { + self.ws(); + // this means return the whole document + return Ok(Some(ReturnPath::new())); + } + } + } else { + return Ok(None); + }; + + let mut ret_path = ReturnPath::new(); + ret_path.push_object_key(key); + loop { + if self.consume("[") { + if let Some(index) = try!(self.consume_integer()) { + ret_path.push_array(index as u64); + try!(self.must_consume("]")); + } else { + if self.consume("]") { + ret_path.push_array_all(); + } else { + return Err(Error::Parse("Expected array index integer or *.".to_string())); + } + } + } else if self.consume(".") { + if let Some(key) = self.consume_field() { + ret_path.push_object_key(key); + } else { + return Err(Error::Parse("Expected object key.".to_string())); + } + } else { + break; + } + } + self.ws(); + Ok(Some(ret_path)) + } + + // if no boost is specified returns 1.0 + fn consume_boost(&mut self) -> Result { + if self.consume("^") { + if let Some(num) = try!(self.consume_number()) { + Ok(num as f32) + } else { + return Err(Error::Parse("Expected number after ^ symbol.".to_string())); + } + } else { + Ok(1.0) + } + } + + fn consume_boost_and_wrap_filter(&mut self, + filter: Box) + -> Result, Error> { + let boost = try!(self.consume_boost()); + if boost != 1.0 { + Ok(Box::new(BoostFilter::new(filter, boost))) + } else { + Ok(filter) + } + } + + fn consume_number(&mut self) -> Result, Error> { + // Yes this parsing code is hideously verbose. But it conforms exactly to the json spec + // and uses the rust f64 parser, which can't tell us how many characters it used or needs. + + // At the end it then uses the std rust String::parse() method to parse and return + // the f64 value and advance the self.offset. The rust method is a super set of the + // allowable json syntax, so it will parse any valid json floating point number. It might + // return an error if the number is out of bounds. + let mut result = String::new(); + 'outer: loop { + // this loop isn't a loop, it's just there to scope the self borrow + // and then jump to the end to do another borrow (self.ws()) + let mut chars = self.query[self.offset..].chars(); + let mut c = if let Some(c) = chars.next() { + c + } else { + return Ok(None); + }; + + // parse the sign + c = if c == '-' { + result.push('-'); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected digits after sign (-).".to_string())); + } + } else { + c + }; + + // parse the first digit + let mut leading_zero = false; + c = if c == '0' { + result.push('0'); + leading_zero = true; + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else if c >= '1' && c <= '9' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else if result.is_empty() { + // no sign or digits found. 
not a number + return Ok(None); + } else { + return Err(Error::Parse("Expected digits after sign (-).".to_string())); + }; + + // parse remaning significant digits + if !leading_zero { + // no more digits allowed if first digit is zero + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + break; + }; + } + } + + // parse decimal + c = if c == '.' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected digits after decimal point.".to_string())); + } + } else { + break 'outer; + }; + + // parse mantissa + let mut found_mantissa = false; + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + found_mantissa = true; + + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + if found_mantissa { + break; + } + return Err(Error::Parse("Expected digits after decimal point.".to_string())); + }; + } + + // parse exponent symbol + c = if c == 'e' || c == 'E' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } else { + break 'outer; + }; + + // parse exponent sign + c = if c == '+' || c == '-' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } else { + c + }; + + // parse exponent digits + let mut found_exponent = false; + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + found_exponent = true; + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + if found_exponent { + break 'outer; + } + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } + } + + self.offset += result.len(); + self.ws(); + Ok(Some(try!(result.parse()))) + } + + + fn must_consume_string_literal(&mut self) -> Result { + if let Some(string) = try!(self.consume_string_literal()) { + Ok(string) + } else { + Err(Error::Parse("Expected string literal.".to_string())) + } + } + + fn consume_string_literal(&mut self) -> Result, Error> { + let mut lit = String::new(); + if !self.could_consume("\"") { + return Ok(None); + } + // can't consume("\"") the leading quote because it will also skip leading whitespace + // inside the string literal + self.offset += 1; + { + let mut chars = self.query[self.offset..].chars(); + 'outer: loop { + let char = if let Some(char) = chars.next() { + char + } else { + break; + }; + if char == '\\' { + self.offset += 1; + + let char = if let Some(char) = chars.next() { + char + } else { + break; + }; + match char { + '\\' | '"' | '/' => lit.push(char), + 'n' => lit.push('\n'), + 'b' => lit.push('\x08'), + 'r' => lit.push('\r'), + 'f' => lit.push('\x0C'), + 't' => lit.push('\t'), + 'v' => lit.push('\x0B'), + 'u' => { + let mut n = 0; + for _i in 0..4 { + let char = if let Some(char) = chars.next() { + char + } else { + break 'outer; + }; + n = match char { + c @ '0'...'9' => n * 16 + ((c as u16) - ('0' as u16)), + c @ 'a'...'f' => n * 16 + (10 + (c as u16) - ('a' as u16)), + c @ 'A'...'F' => n * 16 + (10 + (c as u16) - ('A' as u16)), + _ => { + let msg = format!("Invalid hexidecimal escape: {}", char); + return Err(Error::Parse(msg)); + } + }; + + } + self.offset += 3; // 3 because 1 is always added after the match below + } + _ => { + return Err(Error::Parse(format!("Unknown character escape: {}", char))) + } + }; + self.offset += 1; + } else { + if char == '"' { + break; + } else { + lit.push(char); + 
self.offset += char.len_utf8(); + } + } + } + } + try!(self.must_consume("\"")); + Ok(Some(lit)) + } + + fn consume_range_operator(&mut self) -> Result { + let inclusive = self.consume("="); + let json = try!(self.must_consume_json_primitive()); + match json { + JsonValue::Number(num) => { + if inclusive { + Ok(RangeOperator::Inclusive(num)) + } else { + Ok(RangeOperator::Exclusive(num)) + } + } + _ => panic!("Range operator on other JSON types is not yet implemented!"), + } + } + + fn find<'b>(&'b mut self) -> Result, Error> { + if !self.consume("find") { + return Err(Error::Parse("Missing 'find' keyword".to_string())); + } + self.not_object() + } + + fn not_object<'b>(&'b mut self) -> Result, Error> { + if self.consume("!") { + let filter = try!(self.object()); + Ok(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone()))) + } else { + self.object() + } + } + + fn object<'b>(&'b mut self) -> Result, Error> { + if self.consume("{") { + if self.consume("}") { + return Ok(Box::new(AllDocsFilter::new(&self.snapshot))); + } + let mut left = try!(self.obool()); + try!(self.must_consume("}")); + + left = try!(self.consume_boost_and_wrap_filter(left)); + + if self.consume("&&") { + let right = try!(self.not_object()); + Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) + + } else if self.consume("||") { + let right = try!(self.not_object()); + Ok(Box::new(OrFilter::new(left, right, self.kb.arraypath_len()))) + } else { + Ok(left) + } + } else { + self.parens() + } + } + + fn parens<'b>(&'b mut self) -> Result, Error> { + if self.consume("!") { + let filter = try!(self.parens()); + return Ok(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone()))); + } + try!(self.must_consume("(")); + let filter = try!(self.object()); + try!(self.must_consume(")")); + + self.consume_boost_and_wrap_filter(filter) + } + + fn obool<'b>(&'b mut self) -> Result, Error> { + let mut filter = try!(self.ocompare()); + loop { + filter = if self.consume("&&") || self.consume(",") { + let right = try!(self.obool()); + Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) + } else if self.consume("||") { + let right = try!(self.obool()); + Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) + } else { + break; + } + } + Ok(filter) + } + + fn ocompare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.oparens()) { + Ok(filter) + } else if let Some(field) = try!(self.consume_key()) { + self.kb.push_object_key(&field); + try!(self.must_consume(":")); + if let Some(filter) = try!(self.oparens()) { + self.kb.pop_object_key(); + Ok(filter) + } else { + let filter = try!(self.compare()); + self.kb.pop_object_key(); + Ok(filter) + } + } else { + Err(Error::Parse("Expected object key or '('".to_string())) + } + } + + fn oparens<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if self.consume("!") { + if let Some(f) = try!(self.oparens()) { + return Ok(Some(Box::new(NotFilter::new(&self.snapshot, f, self.kb.clone())))); + } else { + self.offset = offset; + return Ok(None); + } + } + let opt_filter = if self.consume("(") { + let f = try!(self.obool()); + try!(self.must_consume(")")); + Some(f) + } else if self.could_consume("[") { + Some(try!(self.array())) + } else if self.could_consume("{") { + Some(try!(self.object())) + } else { + if let Some(filter) = try!(self.bind_var()) { + Some(filter) + } else { + None + } + }; + + if let Some(filter) = opt_filter { + Ok(Some(try!(self.consume_boost_and_wrap_filter(filter)))) + } 
else { + Ok(None) + } + } + + fn compare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.equal()) { + Ok(filter) + } else if let Some(filter) = try!(self.stemmed()) { + Ok(filter) + } else { + if self.consume(">") { + let min = try!(self.consume_range_operator()); + let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), Some(min), None); + Ok(Box::new(filter)) + } else if self.consume("<") { + let max = try!(self.consume_range_operator()); + let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), None, Some(max)); + Ok(Box::new(filter)) + } else { + Err(Error::Parse("Expected comparison operator".to_string())) + } + } + } + + fn equal<'b>(&'b mut self) -> Result>, Error> { + let not_equal = self.consume("!="); + if not_equal || self.consume("==") { + let json = try!(self.must_consume_json_primitive()); + let boost = try!(self.consume_boost()); + let filter: Box = match json { + JsonValue::String(literal) => { + let mut filters: Vec = Vec::new(); + { + let stems = Stems::new(&literal); + for stem in stems { + filters.push(StemmedWordPosFilter::new(&self.snapshot, + &stem.stemmed, + &self.kb, + boost)); + } + } + let filter = StemmedPhraseFilter::new(filters); + Box::new(ExactMatchFilter::new(&self.snapshot, + filter, + self.kb.clone(), + literal, + true)) + } + JsonValue::Number(num) => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::Inclusive(num)), + Some(RangeOperator::Inclusive(num)))) + } + JsonValue::True => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::True), + Some(RangeOperator::True))) + } + JsonValue::False => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::False), + Some(RangeOperator::False))) + } + JsonValue::Null => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::Null), + Some(RangeOperator::Null))) + } + _ => panic!("Exact match on other JSON types is not yet implemented!"), + }; + if not_equal { + Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone())))) + } else { + Ok(Some(filter)) + } + } else { + Ok(None) + } + } + + fn stemmed<'b>(&'b mut self) -> Result>, Error> { + let not_stemmed = self.consume("!~="); + if not_stemmed || self.consume("~=") { + // regular search + let literal = try!(self.must_consume_string_literal()); + let boost = try!(self.consume_boost()); + let stems = Stems::new(&literal); + let stemmed_words: Vec = stems.map(|stem| stem.stemmed).collect(); + + let filter: Box = match stemmed_words.len() { + 0 => panic!("Cannot create a StemmedWordFilter"), + 1 => { + Box::new(StemmedWordFilter::new(&self.snapshot, + &stemmed_words[0], + &self.kb, + boost)) + } + _ => { + let mut filters: Vec = Vec::new(); + for stemmed_word in stemmed_words { + let filter = StemmedWordPosFilter::new(&self.snapshot, + &stemmed_word, + &self.kb, + boost); + filters.push(filter); + } + Box::new(StemmedPhraseFilter::new(filters)) + } + }; + if not_stemmed { + Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone())))) + } else { + Ok(Some(filter)) + } + } else if not_stemmed || self.consume("~") { + let word_distance = match try!(self.consume_integer()) { + Some(int) => int, + None => { + return Err(Error::Parse("Expected integer for proximity search".to_string())); + } + }; + try!(self.must_consume("=")); + + let literal = try!(self.must_consume_string_literal()); + let boost = try!(self.consume_boost()); + let stems = Stems::new(&literal); + let mut 
filters: Vec = Vec::new(); + for stem in stems { + let filter = + StemmedWordPosFilter::new(&self.snapshot, &stem.stemmed, &self.kb, boost); + filters.push(filter); + } + if word_distance > std::u32::MAX as i64 { + return Err(Error::Parse("Proximity search number too large.".to_string())); + } + match filters.len() { + 0 => panic!("Cannot create a DistanceFilter"), + _ => { + let filter = Box::new(DistanceFilter::new(filters, word_distance as u32)); + if not_stemmed { + Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone())))) + } else { + Ok(Some(filter)) + } + } + } + } else { + Ok(None) + } + } + + fn abool<'b>(&'b mut self) -> Result, Error> { + let mut filter = try!(self.acompare()); + loop { + filter = if self.consume("&&") || self.consume(",") { + let right = try!(self.abool()); + Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) + } else if self.consume("||") { + let right = try!(self.abool()); + Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) + } else { + break; + } + } + Ok(filter) + } + + fn acompare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.aparens()) { + Ok(filter) + } else { + self.compare() + } + } + + fn aparens<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if self.consume("!") { + if let Some(f) = try!(self.aparens()) { + return Ok(Some(Box::new(NotFilter::new(&self.snapshot, f, self.kb.clone())))); + } else { + self.offset = offset; + return Ok(None); + } + } + let opt_filter = if self.consume("(") { + let f = try!(self.abool()); + try!(self.must_consume(")")); + Some(f) + } else if self.could_consume("[") { + Some(try!(self.array())) + } else if self.could_consume("{") { + Some(try!(self.object())) + } else { + if let Some(filter) = try!(self.bind_var()) { + Some(filter) + } else { + None + } + }; + + if let Some(filter) = opt_filter { + Ok(Some(try!(self.consume_boost_and_wrap_filter(filter)))) + } else { + Ok(None) + } + } + + fn bind_var<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if let Some(bind_name) = self.consume_field() { + if self.consume("::") { + let filter = try!(self.array()); + self.kb.push_array(); + let kb_clone = self.kb.clone(); + self.kb.pop_array(); + return Ok(Some(Box::new(BindFilter::new(bind_name, filter, kb_clone)))); + } + //we got here so unconsume the chars + self.offset = offset; + } + Ok(None) + } + + fn array<'b>(&'b mut self) -> Result, Error> { + if !self.consume("[") { + return Err(Error::Parse("Expected '['".to_string())); + } + self.kb.push_array(); + let filter = try!(self.abool()); + self.kb.pop_array(); + try!(self.must_consume("]")); + + self.consume_boost_and_wrap_filter(filter) + } + + pub fn order_clause(&mut self) -> Result, Error> { + let mut order_infos = HashMap::new(); + if self.consume("order") { + let mut n = 0; + loop { + if let Some(rp) = try!(self.consume_keypath()) { + // doing the search for source 2x so user can order + // anyway they like. Yes it's a hack, but it simple. 
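
As an illustrative aside (not part of this patch): the `order`, `return`, and `limit` clauses parsed here and in the code that follows compose with `find` roughly as in the sketch below. It mirrors the style of the tests added in this diff; the database path, the sample documents, and the `.price`/`.name` keypaths are invented for the example, and `asc`/`desc` is accepted either before or after `default=` because the direction is checked twice around the default clause.

```
// Sketch only -- follows the test style used elsewhere in this patch.
use index::{Index, OpenOptions, Batch};
use query::Query;

fn order_and_return_example() {
    let dbname = "target/tests/example_order_return";
    let _ = Index::drop(dbname);

    let mut index = Index::new();
    index.open(dbname, Some(OpenOptions::Create)).unwrap();

    let mut batch = Batch::new();
    let _ = index.add(r#"{"_id": "1", "name": "apple", "price": 3}"#, &mut batch);
    let _ = index.add(r#"{"_id": "2", "name": "banana", "price": 1}"#, &mut batch);
    index.flush(batch).unwrap();

    // `order .price default=0 asc` would parse the same way.
    let query = r#"find {name: ~= "apple" || name: ~= "banana"}
                   order .price asc default=0
                   return {name: .name, price: .price}
                   limit 10"#;

    let mut results = Query::get_matches(query, &index).unwrap();
    let mut count = 0;
    while let Some(_row) = results.next_result() {
        count += 1;
    }
    println!("{} rows returned", count);
}
```
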
+ let mut order = if self.consume("asc") { + Order::Asc + } else if self.consume("desc") { + Order::Desc + } else { + Order::Asc + }; + + let default = if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + json + } else { + return Err(Error::Parse("Expected Json after default.".to_string())); + } + } else { + JsonValue::Null + }; + + order = if self.consume("asc") { + Order::Asc + } else if self.consume("desc") { + Order::Desc + } else { + order + }; + + order_infos.insert(rp.to_key(), + OrderInfo { + field: OrderField::FetchValue(rp), + order: order, + order_to_apply: n, + default: default, + }); + } else { + try!(self.must_consume("score")); + try!(self.must_consume("(")); + try!(self.must_consume(")")); + + self.needs_scoring = true; + + let order = if self.consume("asc") { + Order::Asc + } else if self.consume("desc") { + Order::Desc + } else { + Order::Asc + }; + + order_infos.insert("score()".to_string(), + OrderInfo { + field: OrderField::Score, + order_to_apply: n, + order: order, + default: JsonValue::Null, + }); + } + + if !self.consume(",") { + break; + } + n += 1; + } + if order_infos.is_empty() { + return Err(Error::Parse("Expected field path in order expression.".to_string())); + } + } + Ok(order_infos) + } + + pub fn return_clause(&mut self) -> Result, Error> { + if self.consume("return") { + if let Some(ret_value) = try!(self.ret_value()) { + Ok(ret_value) + } else { + Err(Error::Parse("Expected key, object or array to return.".to_string())) + } + } else { + let mut rp = ReturnPath::new(); + rp.push_object_key("_id".to_string()); + Ok(Box::new(RetValue { + rp: rp, + ag: None, + default: JsonValue::Null, + order_info: None, + })) + } + } + + fn ret_object(&mut self) -> Result, Error> { + try!(self.must_consume("{")); + let mut fields: Vec<(String, Box)> = Vec::new(); + loop { + if let Some(field) = try!(self.consume_key()) { + try!(self.must_consume(":")); + if let Some(ret_value) = try!(self.ret_value()) { + fields.push((field, ret_value)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Expected key to return.".to_string())); + } + } else { + break; + } + } + + try!(self.must_consume("}")); + Ok(Box::new(RetObject { fields: fields })) + } + + fn ret_array(&mut self) -> Result, Error> { + try!(self.must_consume("[")); + let mut slots = Vec::new(); + loop { + if let Some(ret_value) = try!(self.ret_value()) { + slots.push(ret_value); + if !self.consume(",") { + break; + } + } else { + break; + } + } + try!(self.must_consume("]")); + Ok(Box::new(RetArray { slots: slots })) + + } + + fn ret_value(&mut self) -> Result>, Error> { + if self.consume("true") { + return Ok(Some(Box::new(RetLiteral { json: JsonValue::True }))); + } else if self.consume("false") { + return Ok(Some(Box::new(RetLiteral { json: JsonValue::False }))); + } else if self.consume("null") { + return Ok(Some(Box::new(RetLiteral { json: JsonValue::Null }))); + } else if self.could_consume("score") { + let offset = self.offset; + let _ = self.consume("score"); + if self.consume("(") { + try!(self.must_consume(")")); + self.needs_scoring = true; + return Ok(Some(Box::new(RetScore { order_info: None }))); + } else { + //wasn't the score, maybe it's a bind variable + self.offset = offset; + } + } + + if let Some((ag, bind_name_option, rp, json)) = try!(self.consume_aggregate()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + if let Some(bind_name) = bind_name_option { + 
Ok(Some(Box::new(RetBind { + bind_name: bind_name, + extra_rp: rp, + ag: Some((ag, json)), + default: default, + order_info: None, + }))) + } else { + Ok(Some(Box::new(RetValue { + rp: rp, + ag: Some((ag, json)), + default: default, + order_info: None, + }))) + } + } else if let Some(bind_name) = self.consume_field() { + let rp = if let Some(rp) = try!(self.consume_keypath()) { + rp + } else { + ReturnPath::new() + }; + + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + + Ok(Some(Box::new(RetBind { + bind_name: bind_name, + extra_rp: rp, + ag: None, + default: default, + order_info: None, + }))) + } else if let Some(rp) = try!(self.consume_keypath()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + + Ok(Some(Box::new(RetValue { + rp: rp, + ag: None, + default: default, + order_info: None, + }))) + } else if self.could_consume("{") { + Ok(Some(try!(self.ret_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.ret_array()))) + } else if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(Box::new(RetLiteral { json: JsonValue::String(string) }))) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(Box::new(RetLiteral { json: JsonValue::Number(num) }))) + } else { + Ok(None) + } + } + + pub fn limit_clause(&mut self) -> Result { + if self.consume("limit") { + if let Some(i) = try!(self.consume_integer()) { + if i <= 0 { + return Err(Error::Parse("limit must be an integer greater than 0".to_string())); + } + Ok(i as usize) + } else { + return Err(Error::Parse("limit expects an integer greater than 0".to_string())); + } + } else { + Ok(usize::MAX) + } + } + + fn json(&mut self) -> Result, Error> { + if self.could_consume("{") { + Ok(Some(try!(self.json_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.json_array()))) + } else { + Ok(try!(self.json_primitive())) + } + } + + fn must_consume_json_primitive(&mut self) -> Result { + if let Some(json) = try!(self.json_primitive()) { + Ok(json) + } else { + Err(Error::Parse("Expected JSON primitive.".to_string())) + } + } + + /// JSON primites are strings, numbers, booleans and null + fn json_primitive(&mut self) -> Result, Error> { + if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(JsonValue::String(string))) + } + // The else is needed becaue of https://github.com/rust-lang/rust/issues/37510 + else { + if self.consume("true") { + Ok(Some(JsonValue::True)) + } else if self.consume("false") { + Ok(Some(JsonValue::False)) + } else if self.consume("null") { + Ok(Some(JsonValue::Null)) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(JsonValue::Number(num))) + } else { + Ok(None) + } + } + } + + fn json_object(&mut self) -> Result { + try!(self.must_consume("{")); + let mut object = Vec::new(); + if self.consume("}") { + return Ok(JsonValue::Object(object)); + } + loop { + if let Some(field) = try!(self.consume_key()) { + try!(self.must_consume(":")); + if let Some(json) = try!(self.json()) { + object.push((field, json)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("}")); + Ok(JsonValue::Object(object)) + } + + fn json_array(&mut self) -> Result { + try!(self.must_consume("[")); + let mut array = Vec::new(); + if self.consume("]") { + return 
Ok(JsonValue::Array(array)); + } + loop { + if let Some(json) = try!(self.json()) { + array.push(json); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("]")); + Ok(JsonValue::Array(array)) + } + + pub fn build_filter(&mut self) -> Result, Error> { + self.ws(); + Ok(try!(self.find())) + } + + pub fn non_ws_left(&mut self) -> Result<(), Error> { + self.ws(); + if self.offset != self.query.len() { + Err(Error::Parse(format!("At character {} unexpected {}.", + self.offset, + &self.query[self.offset..]))) + } else { + Ok(()) + } + } +} + +#[cfg(test)] +mod tests { + + use super::Parser; + + use index::{Index, OpenOptions}; + + #[test] + fn test_whitespace() { + let dbname = "target/tests/test_whitespace"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let mut snapshot = index.new_snapshot(); + + let query = " \n \t test"; + let mut parser = Parser::new(query, snapshot); + parser.ws(); + assert_eq!(parser.offset, 5); + + snapshot = index.new_snapshot(); + let query = "test".to_string(); + let mut parser = Parser::new(&query, snapshot); + parser.ws(); + assert_eq!(parser.offset, 0); + } + + #[test] + fn test_must_consume_string_literal() { + let dbname = "target/tests/test_must_consume_string_literal"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let snapshot = index.new_snapshot(); + + let query = r#"" \n \t test""#.to_string(); + let mut parser = Parser::new(&query, snapshot); + assert_eq!(parser.must_consume_string_literal().unwrap(), + " \n \t test".to_string()); + } + + #[test] + fn test_bad_query_syntax() { + let dbname = "target/tests/test_bad_query_syntax"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let snapshot = index.new_snapshot(); + + let query = r#"find {foo: =="bar""#.to_string(); + let mut parser = Parser::new(&query, snapshot); + assert!(parser.find().is_err()); + } +} diff --git a/src/query.rs b/src/query.rs index 7a8d9da..95fcc10 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,398 +1,755 @@ -#![allow(dead_code)] -#![allow(unused_variables)] -extern crate capnp; -use std::{error, fmt, str}; +use std::str; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::mem::swap; +use std::collections::VecDeque; +use std::iter::Iterator; +use std::usize; +use error::Error; use index::Index; -use key_builder::KeyBuilder; -use stems::{StemmedWord, Stems}; - -// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{DBIterator, SeekKey}; -use rocksdb::rocksdb::Snapshot; -use records_capnp::payload; - -#[derive(Debug)] -enum Error<'a> { - Parse(&'a str), - Capnp(capnp::Error), +use parser::Parser; +use json_value::JsonValue; +use filters::QueryRuntimeFilter; +use aggregates::AggregateFun; +use returnable::{Returnable, RetValue, RetScore, RetHidden, ReturnPath}; +use snapshot::{Snapshot, JsonFetcher}; + + + +#[derive(Clone)] +pub struct DocResult { + pub seq: u64, + pub arraypath: Vec, + pub bind_name_result: HashMap>, + pub scores: Vec<(f32, usize)>, // (sum of score, num matches of term) } -impl<'a> error::Error for Error<'a> { - fn description(&self) -> &str { - match *self { - Error::Parse(description) => description, - Error::Capnp(ref err) => err.description(), +impl DocResult 
{ + pub fn new() -> DocResult { + DocResult { + seq: 0, + arraypath: Vec::new(), + bind_name_result: HashMap::new(), + scores: Vec::new(), } } - fn cause(&self) -> Option<&error::Error> { - match *self { - Error::Parse(_) => None, - Error::Capnp(ref err) => Some(err as &error::Error), + pub fn add_bind_name_result(&mut self, bind_name: &str, result_key: String) { + if let Some(ref mut result_keys) = self.bind_name_result.get_mut(bind_name) { + result_keys.push(result_key); + return; } + self.bind_name_result + .insert(bind_name.to_string(), vec![result_key]); } -} -impl<'a> From for Error<'a> { - fn from(err: capnp::Error) -> Error<'a> { - Error::Capnp(err) + pub fn combine(&mut self, other: &mut DocResult) { + let mut replace = HashMap::new(); + swap(&mut replace, &mut other.bind_name_result); + for (bind_name, mut result_keys_other) in replace.into_iter() { + if let Some(ref mut result_keys) = self.bind_name_result.get_mut(&bind_name) { + result_keys.append(&mut result_keys_other); + continue; + } + self.bind_name_result + .insert(bind_name, result_keys_other); + } + self.scores.append(&mut other.scores); } -} -impl<'a> fmt::Display for Error<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Error::Parse(ref err) => write!(f, "Parse error: {}", err), - Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), + pub fn add_score(&mut self, term_ordinal: usize, score: f32) { + if term_ordinal >= self.scores.len() { + self.scores.resize(term_ordinal + 1, (0.0, 0)); } + self.scores[term_ordinal].0 += score; + self.scores[term_ordinal].1 += 1; } -} + pub fn clone_only_seq_and_arraypath(&self) -> DocResult { + let mut dr = DocResult::new(); + dr.seq = self.seq; + dr.arraypath = self.arraypath.clone(); + dr + } -struct DocResult { - seq: u64, - array_paths: Vec>, -} - -impl DocResult { - fn new() -> DocResult { - DocResult { - seq: 0, - array_paths: Vec::new(), + pub fn boost_scores(&mut self, boost: f32) { + for &mut (ref mut score, ref mut _num_match) in self.scores.iter_mut() { + *score *= boost; } } -} -//trait QueryRuntimeFilter { -//struct QueryRuntimeFilter {} + pub fn less(&self, other: &DocResult, mut array_depth: usize) -> bool { + if self.seq < other.seq { + return true; + } + let mut s = self.arraypath.iter(); + let mut o = other.arraypath.iter(); + loop { + if array_depth == 0 { + return false; + } + array_depth -= 1; + if let Some(i_s) = s.next() { + if let Some(i_o) = o.next() { + if i_s < i_o { + return true; + } + } else { + // self cannot be less than other + return false; + } + } else { + loop { + if array_depth == 0 { + return false; + } + array_depth -= 1; + if let Some(i_o) = o.next() { + if *i_o > 0 { + return true; + } + } else { + return true; + } + } + } + } + } -trait QueryRuntimeFilter { - fn first_result<'a>(&'a mut self, start_id: u64) -> Result, Error<'a>>; - fn next_result<'a>(&'a mut self) -> Result, Error<'a>>; -} + // arraypaths must be the same length + pub fn cmp(&self, other: &DocResult) -> Ordering { + debug_assert_eq!(self.arraypath.len(), other.arraypath.len()); + match self.seq.cmp(&other.seq) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self.arraypath.cmp(&other.arraypath), + } + } -pub struct Query {} + pub fn increment_last(&mut self, array_depth: usize) { + if array_depth == 0 { + self.seq += 1; + } else { + self.arraypath.resize(array_depth, 0); + if let Some(mut i) = self.arraypath.last_mut() { + *i += 1; + } + } + } -pub struct QueryResults { -} + pub fn 
last_segment_array_index(&self) -> Option<&u64> { + self.arraypath.last() + } -//struct SnapshotIteratorCreator { -// snapshot: rocksdb::Snapshot, -//} -// -//impl SnapshotIteratorCreator { -// fn new(db: &rocksdb::DB) { -// let snapshot = rocksdb::Snapshot::new(db); -// SnapshotIteratorCreator{ -// snapshot: snapshot, -// } -// } -// -// fn new_iterator(&self) { -// self.snapshot.iter() -// } -//} - - - -struct ExactMatchFilter<'a> { - iter: DBIterator<'a>, - kb: KeyBuilder, - stemmed_offset: u64, - suffix: String, - suffix_offset: u64, + pub fn increment_first(&mut self, array_depth: usize) { + self.seq += 1; + self.arraypath.clear(); + self.arraypath.resize(array_depth, 0); + } } -impl<'a> ExactMatchFilter<'a> { - fn new(iter: DBIterator<'a>, stemmed_word: &StemmedWord, mut kb: KeyBuilder) -> ExactMatchFilter<'a> { - kb.push_word(&stemmed_word.stemmed); - ExactMatchFilter{ - iter: iter, - kb: kb, - stemmed_offset: stemmed_word.stemmed_offset as u64, - suffix: stemmed_word.suffix.clone(), - suffix_offset: stemmed_word.suffix_offset as u64, +impl PartialEq for DocResult { + fn eq(&self, other: &DocResult) -> bool { + if self.seq != other.seq { + false + } else { + self.arraypath == other.arraypath } } } -impl<'a> QueryRuntimeFilter for ExactMatchFilter<'a> { - fn first_result(&mut self, start_id: u64) -> Result, Error> { - // Build the full key - self.kb.push_doc_seq(start_id); +impl Eq for DocResult {} - // Seek in index to >= entry - self.iter.seek(SeekKey::from(self.kb.key().as_bytes())); +pub struct QueryScoringInfo { + pub num_terms: usize, + pub sum_of_idt_sqs: f32, +} - // Revert - self.kb.pop_doc_seq(); +pub struct Query {} - self.next_result() - } +impl Query { + pub fn get_matches<'a>(query: &str, index: &'a Index) -> Result, Error> { + if index.rocks.is_none() { + return Err(Error::Parse("You must open the index first".to_string())); + } - fn next_result(&mut self) -> Result, Error> { - let mut doc_result = DocResult::new(); + let snapshot = index.new_snapshot(); + let mut parser = Parser::new(query, snapshot); + let mut filter = try!(parser.build_filter()); + let mut orders = try!(parser.order_clause()); + let mut returnable = try!(parser.return_clause()); + let limit = try!(parser.limit_clause()); + try!(parser.non_ws_left()); + try!(filter.check_double_not(false)); + + if filter.is_all_not() { + return Err(Error::Parse("query cannot be made up of only logical not. Must have at \ + least one match clause not negated." 
+ .to_string())); + } - loop { - if !self.iter.valid() { - return Ok(None) - } + let mut ags = Vec::new(); + returnable.get_aggregate_funs(&mut ags); - // New scope needed as the iter.next() below invalidates the - // current key and value - { - let key = self.iter.key(); - if !key.starts_with(self.kb.key().as_bytes()) { - return Ok(None) - } - let seq = &key[self.kb.key().len()..]; - - let value = self.iter.value(); - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - for aos_wis in try!(payload.get_arrayoffsets_to_wordinfos()).iter() { - for wi in try!(aos_wis.get_wordinfos()).iter() { - if self.stemmed_offset == wi.get_stemmed_offset() && - self.suffix_offset == wi.get_suffix_offset() && - self.suffix == try!(wi.get_suffix_text()) { - // We have a candidate document to return - let arrayoffsets = try!(aos_wis.get_arrayoffsets()); - doc_result.array_paths.push(arrayoffsets.iter().collect::<>()); - doc_result.seq = str::from_utf8(&seq).unwrap().parse().unwrap(); - break; - } + let mut has_ags = false; + for option_ag in ags.iter() { + if option_ag.is_some() { + has_ags = true; + break; + } + } + let has_ordering = !orders.is_empty(); + + returnable = if has_ordering && has_ags { + return Err(Error::Parse("Cannot have aggregates and ordering in the same query" + .to_string())); + } else if has_ordering { + returnable.take_order_for_matching_fields(&mut orders); + if !orders.is_empty() { + let mut vec: Vec> = Vec::new(); + for (_key, order_info) in orders.into_iter() { + let order = order_info.clone(); + match order_info.field { + OrderField::FetchValue(rp) => { + vec.push(Box::new(RetValue { + rp: rp, + ag: None, + default: order_info.default, + order_info: Some(order), + })); + } + OrderField::Score => { + vec.push(Box::new(RetScore { order_info: Some(order) })); + } } } + Box::new(RetHidden { + unrendered: vec, + visible: returnable, + }) + } else { + returnable } - self.iter.next(); - - if doc_result.seq > 0 { - return Ok(Some(doc_result)); + } else { + returnable + }; + + if has_ags { + // we have at least one AggregationFun. Make sure they are all set. + for option_ag in ags.iter() { + if option_ag.is_none() { + return Err(Error::Parse("Return keypaths must either all have \ + aggregate functions, or none can them." + .to_string())); + } } } - } -} + let needs_ordering_and_ags = has_ags || has_ordering; + + // the input args for orders and ags are vecs where the slot is the same slot as + // a result that the action needs to be applied to. We instead convert them + // into several new fields with tuples of action and the slot to act on. 
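
For readers skimming this change, the conversion described in the comment above amounts to the following standalone sketch (simplified, not the project's actual types): a per-result-slot `Vec` of optional actions is flattened into `(action, slot)` pairs, so later passes only visit the slots that actually carry an order or aggregate.

```
// Simplified, self-contained illustration of the slot-pairing idea.
fn to_slot_pairs<T>(per_slot: Vec<Option<T>>) -> Vec<(T, usize)> {
    per_slot
        .into_iter()
        .enumerate()
        .filter_map(|(slot, opt)| opt.map(|action| (action, slot)))
        .collect()
}

fn main() {
    // Slots 1 and 2 carry no action and simply drop out.
    let pairs = to_slot_pairs(vec![Some("sum"), None, None, Some("avg")]);
    assert_eq!(pairs, vec![("sum", 0), ("avg", 3)]);
}
```
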
+ // this way we don't needlesss loop over the actions where most are noops -struct AndFilter { - filters: Vec>, - array_depth: u64, -} -impl AndFilter { - fn new(filters: Vec>, array_depth: u64) -> AndFilter { - AndFilter { - filters: filters, - array_depth: array_depth, + let mut orders = if has_ordering { + let mut orders = Vec::new(); + let mut ordering = Vec::new(); + returnable.get_ordering(&mut ordering); + let mut n = ordering.len(); + while let Some(option) = ordering.pop() { + n -= 1; + if let Some(order_info) = option { + orders.push((order_info, n)); + } + } + // order we process orders is important + orders.sort_by_key(|&(ref order_info, ref _n)| order_info.order_to_apply); + orders + .into_iter() + .map(|(order_info, n)| (order_info.order, n)) + .collect() + } else { + Vec::new() + }; + + + let mut does_group_or_aggr = false; + let mut aggr_inits = Vec::new(); + let mut aggr_actions = Vec::new(); + let mut aggr_finals = Vec::new(); + if has_ags { + does_group_or_aggr = true; + let mut n = ags.len(); + while let Some(Some((ag, user_arg))) = ags.pop() { + n -= 1; + if ag == AggregateFun::GroupAsc { + orders.push((Order::Asc, n)); + } else if ag == AggregateFun::GroupDesc { + orders.push((Order::Desc, n)); + } else { + let ag_impls = ag.get_fun_impls(); + if let Some(init) = ag_impls.init { + aggr_inits.push((init, n)); + } + if let Some(extract) = ag_impls.extract { + aggr_finals.push((extract, n)); + } + aggr_actions.push((ag_impls.action, user_arg, n)); + } + } + // the order we process groups in important + orders.reverse(); } - } -} -impl QueryRuntimeFilter for AndFilter { - fn first_result(&mut self, start_id: u64) -> Result, Error> { - Ok(None) - } - fn next_result(&mut self) -> Result, Error> { - Ok(None) + let mut qsi = QueryScoringInfo { + num_terms: 0, + sum_of_idt_sqs: 0.0, + }; + + if parser.needs_scoring { + filter.prepare_relevancy_scoring(&mut qsi); + } + + let query_norm = if qsi.num_terms > 0 { + 1.0 / (qsi.sum_of_idt_sqs as f32) + } else { + 0.0 + }; + + Ok(QueryResults { + filter: filter, + doc_result_next: DocResult::new(), + fetcher: parser.snapshot.new_json_fetcher(), + snapshot: parser.snapshot, + returnable: returnable, + needs_ordering_and_ags: needs_ordering_and_ags, + done_with_ordering_and_ags: false, + does_group_or_aggr: does_group_or_aggr, + orders: Some(orders), + aggr_inits: aggr_inits, + aggr_actions: aggr_actions, + aggr_finals: aggr_finals, + in_buffer: Vec::new(), + ordered_buffer: Vec::new(), + limit: limit, + scoring_num_terms: qsi.num_terms, + scoring_query_norm: query_norm, + }) } } - -struct Parser<'a, 'b> { - query: &'a str, - offset: usize, - kb: KeyBuilder, - snapshot: &'b Snapshot<'b>, +pub struct QueryResults<'a> { + filter: Box, + doc_result_next: DocResult, + snapshot: Snapshot<'a>, + fetcher: JsonFetcher, + returnable: Box, + needs_ordering_and_ags: bool, + done_with_ordering_and_ags: bool, + does_group_or_aggr: bool, + orders: Option>, + aggr_inits: Vec<(fn(JsonValue) -> JsonValue, usize)>, + aggr_actions: Vec<(fn(&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, + aggr_finals: Vec<(fn(&mut JsonValue), usize)>, + in_buffer: Vec>, + ordered_buffer: Vec>, + limit: usize, + scoring_num_terms: usize, + scoring_query_norm: f32, } -impl<'a, 'b> Parser<'a, 'b> { - fn new(query: &'a str, snapshot: &'b Snapshot) -> Parser<'a, 'b> { - Parser{ - query: query, - offset: 0, - kb: KeyBuilder::new(), - snapshot: snapshot, +impl<'a> QueryResults<'a> { + fn compute_relevancy_score(&self, dr: &DocResult) -> f32 { + if 
self.scoring_num_terms == 0 { + return 0.0; } + let mut num_terms_matched = 0; + let mut score: f32 = 0.0; + for &(ref total_term_score, ref num_times_term_matched) in dr.scores.iter() { + if *num_times_term_matched > 0 { + score += *total_term_score / (*num_times_term_matched as f32); + num_terms_matched += 1; + } + } + self.scoring_query_norm * score * (num_terms_matched as f32) / + (self.scoring_num_terms as f32) } - fn whitespace(&mut self) { - loop { - if let Some(char) = self.query[self.offset..].chars().next() { - // Stop when the character isn't a whitespace - if !char.is_whitespace() { - break; - } - self.offset += char.len_utf8(); + fn get_next_result(&mut self) -> Option { + if self.done_with_ordering_and_ags { + return None; + } + let result = self.filter.first_result(&self.doc_result_next); + match result { + Some(doc_result) => { + self.doc_result_next.seq = doc_result.seq + 1; + Some(doc_result) } + None => None, } } - fn consume(&mut self, token: &str) -> bool { - if self.could_consume(token) { - self.offset += token.len(); - true + fn get_next(&mut self) -> Option { + if let Some(doc_result) = self.get_next_result() { + Some(doc_result.seq) } else { - false + None } } - fn could_consume(&mut self, token: &str) -> bool { - self.query[self.offset..].starts_with(token) + pub fn get_next_id(&mut self) -> Option { + let seq = self.get_next(); + match seq { + Some(seq) => { + let key = format!("V{}#._id", seq); + match self.snapshot.get(&key.as_bytes()) { + // If there is an id, it's UTF-8. Strip off type leading byte + Some(id) => Some(id.to_utf8().unwrap()[1..].to_string()), + None => None, + } + } + None => None, + } } - fn consume_field(&mut self) -> Option { - let mut result = String::new(); - for char in self.query[self.offset..].chars() { - if char.is_alphanumeric() { - result.push(char); - } else { - break; + pub fn next_result(&mut self) -> Option { + if self.needs_ordering_and_ags { + loop { + let next = if self.done_with_ordering_and_ags { + None + } else { + self.get_next_result() + }; + match next { + Some(dr) => { + let score = self.compute_relevancy_score(&dr); + let mut results = VecDeque::new(); + self.returnable + .fetch_result(&mut self.fetcher, + dr.seq, + score, + &dr.bind_name_result, + &mut results); + self.in_buffer.push(results); + if self.in_buffer.len() == self.limit { + self.do_ordering_and_ags(); + } + } + None => { + if !self.done_with_ordering_and_ags { + self.do_ordering_and_ags(); + self.done_with_ordering_and_ags = true; + if !self.aggr_finals.is_empty() { + // need to finalize the values + for end in self.ordered_buffer.iter_mut() { + for &(ref finalize, n) in self.aggr_finals.iter() { + (finalize)(&mut end[n]); + } + } + } + } + if let Some(mut results) = self.ordered_buffer.pop() { + return Some(self.returnable.json_result(&mut results)); + } else { + return None; + } + } + } } - } - if result.len() > 0 { - self.offset += result.len(); - self.whitespace(); - Some(result) } else { - None + if self.limit == 0 { + return None; + } + self.limit -= 1; + let dr = match self.get_next_result() { + Some(dr) => dr, + None => return None, + }; + let score = self.compute_relevancy_score(&dr); + let mut results = VecDeque::new(); + self.returnable + .fetch_result(&mut self.fetcher, + dr.seq, + score, + &dr.bind_name_result, + &mut results); + Some(self.returnable.json_result(&mut results)) } } - fn consume_string_literal(&mut self) -> Result, String> { - // Does not unescape yet - let mut lit = String::new(); - if self.consume("\"") { - for char in 
self.query[self.offset..].chars() { - if char != '"' { - lit.push(char); - } - self.offset += char.len_utf8(); - } - if self.consume("\"") { - self.whitespace(); - Ok(Some(lit)) + fn cmp_results(orders: &Vec<(Order, usize)>, + a: &VecDeque, + b: &VecDeque) + -> Ordering { + for &(ref order_dir, n) in orders.iter() { + let cmp = if *order_dir != Order::Desc { + b[n].cmp(&a[n]) } else { - Err("Expected \"".to_string()) + a[n].cmp(&b[n]) + }; + + if cmp != Ordering::Equal { + return cmp; } - } else { - Ok(None) } + Ordering::Equal } - fn compare(&mut self) -> Result, String> { - match self.consume_field() { - Some(field) => { - if self.consume(".") { - self.kb.push_object_key(field); - let ret = self.compare(); - self.kb.pop_object_key(); - ret - } else if self.consume("=") { - match self.consume_string_literal() { - Ok(Some(literal)) => { - self.kb.push_object_key(field); - - let stems = Stems::new(&literal); - let mut filters: Vec> = Vec::new(); - for stem in stems { - let iter = self.snapshot.iter(); - let filter = Box::new(ExactMatchFilter::new( - iter, &stem, self.kb.clone())); - filters.push(filter); + fn do_ordering_and_ags(&mut self) { + // ugh borrow check madness means this is how this must happen. + // we need to put it back before returning. + let orders = self.orders.take().unwrap(); + if !orders.is_empty() { + self.in_buffer + .sort_by(|a, b| QueryResults::cmp_results(&orders, &a, &b)); + } + // put back + self.orders = Some(orders); + + if !self.does_group_or_aggr { + if self.ordered_buffer.is_empty() { + swap(&mut self.ordered_buffer, &mut self.in_buffer); + } else { + //merge the ordered buffers + let mut new_buffer = Vec::with_capacity(self.ordered_buffer.len() + + self.in_buffer.len()); + let mut option_a = self.ordered_buffer.pop(); + let mut option_b = self.in_buffer.pop(); + // take out for borrow check + let orders = self.orders.take().unwrap(); + loop { + match (option_a, option_b) { + (Some(a), Some(b)) => { + match QueryResults::cmp_results(&orders, &a, &b) { + Ordering::Less => { + new_buffer.push(b); + option_a = Some(a); + option_b = self.in_buffer.pop(); + } + Ordering::Greater => { + new_buffer.push(a); + option_a = self.ordered_buffer.pop(); + option_b = Some(b); + + } + Ordering::Equal => { + new_buffer.push(a); + new_buffer.push(b); + option_a = self.ordered_buffer.pop(); + option_b = self.in_buffer.pop(); + } + } + if new_buffer.len() >= self.limit { + self.ordered_buffer.clear(); + self.in_buffer.clear(); + new_buffer.truncate(self.limit); + break; } + } + (Some(a), None) => { + new_buffer.push(a); + if new_buffer.len() == self.limit { + break; + } + while let Some(a) = self.ordered_buffer.pop() { + new_buffer.push(a); + if new_buffer.len() == self.limit { + break; + } + } + break; + } + (None, Some(b)) => { + new_buffer.push(b); + if new_buffer.len() == self.limit { + break; + } + while let Some(b) = self.in_buffer.pop() { + new_buffer.push(b); + if new_buffer.len() == self.limit { + break; + } + } + break; + } + (None, None) => break, + } + } + // put back + self.orders = Some(orders); + + new_buffer.reverse(); + swap(&mut self.ordered_buffer, &mut new_buffer); + } + return; + } - self.kb.pop_object_key(); - match filters.len() { - 0 => panic!("Cannot create a ExactMatchFilter"), - 1 => Ok(filters[0]), - _ => Ok(Box::new(AndFilter::new( - filters, self.kb.array_depth as u64))), - //_ => Ok(filters[0]), - //_ => Err("just get it compiled".to_string()), + //merge the ordered buffers + let mut new_buffer = Vec::with_capacity(self.ordered_buffer.len() + 
self.in_buffer.len()); + let mut option_old = self.ordered_buffer.pop(); + let mut option_new = self.in_buffer.pop(); + // take out for borrow check + let orders = self.orders.take().unwrap(); + loop { + match (option_old, option_new) { + (Some(mut old), Some(mut new)) => { + match QueryResults::cmp_results(&orders, &old, &new) { + Ordering::Less => { + for &(ref init, n) in self.aggr_inits.iter() { + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + new[n] = (init)(new_n); } - }, - // Empty literal - Ok(None) => {Err("not implemetned yet".to_string())}, - Err(error) => { - Err(error) + //push back old value into ordered_buffer, + //then use new value as old value. + self.ordered_buffer.push(old); + option_old = Some(new); + option_new = self.in_buffer.pop(); + } + Ordering::Greater => { + new_buffer.push(old); + option_old = self.ordered_buffer.pop(); + option_new = Some(new); + } + Ordering::Equal => { + for &(ref action, ref user_arg, n) in self.aggr_actions.iter() { + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + (action)(&mut old[n], new_n, &user_arg); + } + option_old = Some(old); + option_new = self.in_buffer.pop(); } } - } else { - Err("not yet implemented".to_string()) + if new_buffer.len() == self.limit { + self.ordered_buffer.clear(); + self.in_buffer.clear(); + break; + } } - }, - None => { - Err("Expected comparison or array operator".to_string()) + (Some(old), None) => { + new_buffer.push(old); + if new_buffer.len() == self.limit { + break; + } + while let Some(old) = self.ordered_buffer.pop() { + new_buffer.push(old); + if new_buffer.len() == self.limit { + break; + } + } + break; + } + (None, Some(mut new)) => { + for &(ref init, n) in self.aggr_inits.iter() { + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + new[n] = (init)(new_n); + } + option_old = Some(new); + option_new = self.in_buffer.pop(); + } + (None, None) => break, } } + // put back + self.orders = Some(orders); + + new_buffer.reverse(); + swap(&mut self.ordered_buffer, &mut new_buffer); } - - fn boolean() -> Option { - - None - } +} +impl<'a> Iterator for QueryResults<'a> { + type Item = JsonValue; - fn build_filter(&mut self) -> Result { - self.whitespace(); - Ok(QueryResults{}) + fn next(&mut self) -> Option { + self.next_result() } } -impl Query { - //pub fn get_matches<'a>(query: &str, index: &'a Index) -> Result { - pub fn get_matches<'a, 'b>(query: &str, snapshot: &'b Snapshot) -> Result { - // match &index.rocks { -// &Some(ref rocks) => { -// let snapshot = Snapshot::new(rocks); -// let parser = Parser::new(query, &snapshot); -// Ok(QueryResults{}) -// }, -// &None => { -// Err("You must open the index first".to_string()) -// }, -// } - //let rocks = &index.rocks.unwrap(); - // This one would work as well - //let rocks = index.rocks.as_ref().unwrap(); - //let snapshot = Snapshot::new(rocks); - let parser = Parser::new(query, &snapshot); - Ok(QueryResults{}) - } +#[derive(PartialEq, Eq, Clone)] +pub enum Order { + Asc, + Desc, +} + +#[derive(Clone)] +pub enum OrderField { + FetchValue(ReturnPath), + Score, +} + +#[derive(Clone)] 
+pub struct OrderInfo { + pub field: OrderField, + pub order_to_apply: usize, + pub order: Order, + pub default: JsonValue, } + + #[cfg(test)] mod tests { - use super::Parser; + extern crate rustc_serialize; - use index::{Index, OpenOptions}; + use super::Query; - use rocksdb::Snapshot; + use index::{Index, OpenOptions, Batch}; #[test] - fn test_whitespace() { + fn test_query_hello_world() { + let dbname = "target/tests/querytestdbhelloworld"; + let _ = Index::drop(dbname); + let mut index = Index::new(); - index.open("test_whitespace", Some(OpenOptions::Create)).unwrap(); - let rocks = &index.rocks.unwrap(); - let snapshot = Snapshot::new(rocks); - - let mut query = " \n \t test"; - let mut parser = Parser::new(query, &snapshot); - parser.whitespace(); - assert_eq!(parser.offset, 5); - - query = "test"; - parser = Parser::new(query, &snapshot); - parser.whitespace(); - assert_eq!(parser.offset, 0); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let mut batch = Batch::new(); + let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#, &mut batch); + index.flush(batch).unwrap(); + + let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#, &index).unwrap(); + println!("query results: {:?}", query_results.get_next_id()); + } + + #[test] + fn test_query_more_docs() { + let dbname = "target/tests/querytestdbmoredocs"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let mut batch = Batch::new(); + for ii in 1..100 { + let data = ((ii % 25) + 97) as u8 as char; + let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data), + &mut batch); + } + index.flush(batch).unwrap(); + + let mut query_results = Query::get_matches(r#"find {data: == "u"}"#, &index).unwrap(); + loop { + match query_results.get_next_id() { + Some(result) => println!("result: {}", result), + None => break, + } + } } } diff --git a/src/repl.rs b/src/repl.rs new file mode 100644 index 0000000..07fb1a4 --- /dev/null +++ b/src/repl.rs @@ -0,0 +1,157 @@ +use index::{Index, OpenOptions, Batch}; +use query::Query; +use json_value::{JsonValue, PrettyPrint}; + +use std::io::{Write, BufRead}; +use std::mem; + + +fn is_command(str: &str) -> bool { + let commands = ["find", "add", "create", "drop", "open", "pretty", "commit", "del", "load", + "dumpkeys"]; + for command in commands.iter() { + if str.starts_with(command) { + return true; + } + } + false +} + +pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { + let mut index = Index::new(); + let mut batch = Batch::new(); + let mut lines = String::new(); + let mut pretty = PrettyPrint::new("", "", ""); + loop { + // read in command until we get to a end semi-colon + if r.read_line(&mut lines).unwrap() > 0 { + if test_mode && lines == "\n" || lines.starts_with("#") { + // we preserve blank lines and comments in test mode + w.write_all(lines.as_bytes()).unwrap(); + lines.clear(); + continue; + } + if test_mode && !is_command(&lines) { + // we drop non-command lines + lines.clear(); + continue; + } else if !is_command(&lines) { + w.write_all(b"Unrecognized command!\n").unwrap(); + lines.clear(); + continue; + } + // check for end semi-colon + if !lines.trim_right().ends_with(";") { + while r.read_line(&mut lines).unwrap() > 0 { + // loop until we get the end semi-colon + if lines.trim_right().ends_with(";") { + break; + } + } + } + } else { + // commit anything written + if index.is_open() { + let mut batch2 = Batch::new(); + mem::swap(&mut batch, &mut batch2); + if let 
Err(reason) = index.flush(batch2) { + write!(w, "{}\n", reason).unwrap(); + } + } + return; + } + if test_mode { + // echo the command + w.write_all(lines.as_bytes()).unwrap(); + } + lines = lines.trim_right().to_string(); + if lines.ends_with(";") { + // strip the semi-colon off + lines.pop(); + } else { + write!(w, "Unterminated command, no semi-colon (;) {}\n", lines).unwrap(); + } + + if lines.starts_with("pretty") { + if lines[6..].trim_left().starts_with("on") { + pretty = PrettyPrint::new(" ", "\n", " "); + } else { + pretty = PrettyPrint::new("", "", ""); + } + } else if lines.starts_with("create") { + let dbname = lines[6..].trim_left(); + match index.open(dbname, Some(OpenOptions::Create)) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("drop") { + let dbname = lines[4..].trim_left(); + match Index::drop(dbname) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("open") { + let dbname = lines[4..].trim_left(); + match index.open(dbname, None) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("dumpkeys") { + match index.all_keys() { + Ok(keys) => { + for key in keys { + write!(w, "{}\n", key).unwrap(); + } + } + Err(reason) => { + write!(w, "{}\n", reason).unwrap(); + } + } + } else if lines.starts_with("add") { + match index.add(&lines[3..], &mut batch) { + Ok(id) => write!(w, "{}\n", JsonValue::str_to_literal(&id)).unwrap(), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("del") { + match index.delete(&lines[3..].trim_left(), &mut batch) { + Ok(true) => write!(w, "ok\n").unwrap(), + Ok(false) => write!(w, "not found\n").unwrap(), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("commit") { + let mut batch2 = Batch::new(); + mem::swap(&mut batch, &mut batch2); + if let Err(reason) = index.flush(batch2) { + write!(w, "{}\n", reason).unwrap(); + } + } else if lines.starts_with("find") { + let mut batch2 = Batch::new(); + mem::swap(&mut batch, &mut batch2); + if let Err(reason) = index.flush(batch2) { + write!(w, "{}\n", reason).unwrap(); + } else { + match Query::get_matches(&lines, &index) { + Ok(results) => { + let mut results = results.peekable(); + + w.write_all(b"[").unwrap(); + if results.peek().is_some() { + w.write_all(b"\n").unwrap(); + } + pretty.push(); + while let Some(json) = results.next() { + json.render(w, &mut pretty).unwrap(); + if results.peek().is_some() { + w.write_all(b",").unwrap(); + } + w.write_all(b"\n").unwrap(); + } + w.write_all(b"]\n").unwrap(); + } + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } + } + lines.clear(); + } +} diff --git a/src/returnable.rs b/src/returnable.rs new file mode 100644 index 0000000..91bca15 --- /dev/null +++ b/src/returnable.rs @@ -0,0 +1,425 @@ + +use std::str; +use std::collections::HashMap; +use std::collections::VecDeque; + +use key_builder::KeyBuilder; +use json_value::JsonValue; +use query::OrderInfo; +use snapshot::JsonFetcher; +use aggregates::AggregateFun; + +#[derive(Clone)] +pub enum PathSegment { + ObjectKey(String), + Array(u64), + ArrayAll, +} + +#[derive(Clone)] +pub struct ReturnPath { + path: Vec, +} + +impl ReturnPath { + pub fn new() -> ReturnPath { + ReturnPath { path: Vec::new() } + } + + pub fn push_object_key(&mut self, key: String) { + self.path.push(PathSegment::ObjectKey(key)); + } + + pub fn push_array(&mut self, index: u64) { + 
self.path.push(PathSegment::Array(index)); + } + + pub fn push_array_all(&mut self) { + self.path.push(PathSegment::ArrayAll); + } + + pub fn to_key(&self) -> String { + let mut key = String::new(); + for seg in self.path.iter() { + match seg { + &PathSegment::ObjectKey(ref str) => { + key.push('.'); + for cc in str.chars() { + // Escape chars that conflict with delimiters + if "\\$.".contains(cc) { + key.push('\\'); + } + key.push(cc); + } + } + &PathSegment::Array(ref i) => { + key.push('$'); + key.push_str(&i.to_string()); + } + &PathSegment::ArrayAll => { + key.push_str("$*"); + } + } + } + key + } + + pub fn nth(&self, i: usize) -> Option<&PathSegment> { + if self.path.len() <= i { + None + } else { + Some(&self.path[i]) + } + } +} + + + +/// Returnables are created from parsing the return statement in queries. +/// They nest inside of each other, with the outermost typically being a RetObject or RetArray. +pub trait Returnable { + /// When a match is found, information about the match is passed to outer most Returnable + /// and then each nested Returnable will fetch information about the document (fields or + /// scores or bind variables etc) and convert them to JsonValues and add them to the result + /// VecDeque. + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque); + + /// If aggregates are used each Returnable needs to return information about the + /// aggregate function it's using and the default value. + fn get_aggregate_funs(&self, funs: &mut Vec>); + + /// If a query has a order clause then we want to match the fields being ordered with + /// fields being returned. We pass the ordering info by the path of the ordered fields + /// or scores and Returnables that have the same path will take the order + /// information. Any fields not matching a returnable are then added to special hidden + /// Returnable (RetHidden) which fetches those fields for ordering but not rendered or + /// returned. + fn take_order_for_matching_fields(&mut self, map: &mut HashMap); + + /// Each Returnable will return the ordering direction in the same slot as the returnable + /// so that later after fetching they will be ordered by QueryResults after fetching but + /// converting to the final json result. + fn get_ordering(&mut self, orders: &mut Vec>); + + /// This is the final step of a Returnable. The previous fetched JsonValues are now + /// rendered with other ornamental json elements. + fn json_result(&self, results: &mut VecDeque) -> JsonValue; +} + +/// A static Json Object the can contain another number of fields and nested returnables. 
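
To make the nesting concrete: a return clause along the lines of `return {name: .name, relevance: score()}` is parsed (by `ret_object`/`ret_value` earlier in this patch) into a `RetObject` whose fields hold a `RetValue` and a `RetScore`. A hand-built sketch of that tree follows, using only the public fields introduced in this file; the field names and the keypath are invented for the example.

```
// Sketch of the returnable tree for: return {name: .name, relevance: score()}
use returnable::{RetObject, RetScore, RetValue, Returnable, ReturnPath};
use json_value::JsonValue;

fn example_tree() -> Box<Returnable> {
    let mut rp = ReturnPath::new();
    rp.push_object_key("name".to_string());

    // Fetches the document field at the ".name" keypath, Null if missing.
    let name_field: Box<Returnable> = Box::new(RetValue {
        rp: rp,
        ag: None,
        default: JsonValue::Null,
        order_info: None,
    });
    // Renders the relevancy score computed for the match.
    let score_field: Box<Returnable> = Box::new(RetScore { order_info: None });

    Box::new(RetObject {
        fields: vec![("name".to_string(), name_field),
                     ("relevance".to_string(), score_field)],
    })
}
```
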
+pub struct RetObject { + pub fields: Vec<(String, Box)>, +} + +impl Returnable for RetObject { + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) { + for &(ref _key, ref field) in self.fields.iter() { + field.fetch_result(fetcher, seq, score, bind_var_keys, result); + } + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + for &(ref _key, ref field) in self.fields.iter() { + field.get_aggregate_funs(funs); + } + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + for &mut (ref _key, ref mut field) in self.fields.iter_mut() { + field.take_order_for_matching_fields(map); + } + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + for &mut (ref mut _key, ref mut field) in self.fields.iter_mut() { + field.get_ordering(orders); + } + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + let mut vec = Vec::with_capacity(self.fields.len()); + for &(ref key, ref returnable) in self.fields.iter() { + vec.push((key.clone(), returnable.json_result(results))); + } + JsonValue::Object(vec) + } +} + +/// A static Json array the can contain another number of nested Returnables. +pub struct RetArray { + pub slots: Vec>, +} + +impl Returnable for RetArray { + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) { + for ref slot in self.slots.iter() { + slot.fetch_result(fetcher, seq, score, bind_var_keys, result); + } + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + for ref slot in self.slots.iter() { + slot.get_aggregate_funs(funs); + } + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + for slot in self.slots.iter_mut() { + slot.take_order_for_matching_fields(map); + } + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + for ref mut slot in self.slots.iter_mut() { + slot.get_ordering(orders); + } + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + let mut vec = Vec::with_capacity(self.slots.len()); + for slot in self.slots.iter() { + vec.push(slot.json_result(results)); + } + JsonValue::Array(vec) + } +} + +/// A special returnable that only fetches values for later ordering but never renders +/// them back to the caller. +pub struct RetHidden { + pub unrendered: Vec>, + pub visible: Box, +} + +impl Returnable for RetHidden { + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) { + for ref unrendered in self.unrendered.iter() { + unrendered.fetch_result(fetcher, seq, score, bind_var_keys, result); + } + + self.visible + .fetch_result(fetcher, seq, score, bind_var_keys, result); + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + self.visible.get_aggregate_funs(funs); + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.visible.take_order_for_matching_fields(map); + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + for ref mut unrendered in self.unrendered.iter_mut() { + unrendered.get_ordering(orders); + } + + self.visible.get_ordering(orders); + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + for _n in 0..self.unrendered.len() { + // we already ordered at this point, now discard the values + results.pop_front(); + } + self.visible.json_result(results) + } +} + +/// A literal JsonValue. Number, String, Null, True or False. Just in case the query +/// wants to return something that doesn't come from a document. 
+pub struct RetLiteral { + pub json: JsonValue, +} + +impl Returnable for RetLiteral { + fn fetch_result(&self, + _fetcher: &mut JsonFetcher, + _seq: u64, + _score: f32, + _bind_var_keys: &HashMap>, + _result: &mut VecDeque) { + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + //noop + } + + fn take_order_for_matching_fields(&mut self, _map: &mut HashMap) { + //noop + } + + fn get_ordering(&mut self, _orders: &mut Vec>) { + //noop + } + + fn json_result(&self, _results: &mut VecDeque) -> JsonValue { + self.json.clone() + } +} + +/// A value from a document. It knows the path it wants to fetch and loads the value from the +/// stored original document. +pub struct RetValue { + pub rp: ReturnPath, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub order_info: Option, +} + + + +impl Returnable for RetValue { + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + _score: f32, + _bind_var_keys: &HashMap>, + result: &mut VecDeque) { + if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { + //don't fetch anything for count(). just stick in a null + result.push_back(JsonValue::Null); + return; + } + let mut kb = KeyBuilder::new(); + if let Some(json) = fetcher.fetch(seq, &mut kb, &self.rp) { + result.push_back(json); + } else { + result.push_back(self.default.clone()); + } + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove(&self.rp.to_key()); + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + if let Some(json) = results.pop_front() { + json + } else { + panic!("missing result!"); + } + } +} + +/// A bind variable. If a bind variable was matched it will be fetched then it's path is +/// added to the bind_var_keys passed into fetch_result(). This will load the values from the +/// original document and return it. +pub struct RetBind { + pub bind_name: String, + pub extra_rp: ReturnPath, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub order_info: Option, +} + +impl Returnable for RetBind { + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + _score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) { + + if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { + let mut array = Vec::with_capacity(value_keys.len()); + for base_key in value_keys { + let mut kb = KeyBuilder::new(); + kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&base_key)); + + if let Some(json) = fetcher.fetch(seq, &mut kb, &self.extra_rp) { + array.push(json); + } else { + array.push(self.default.clone()); + } + } + result.push_back(JsonValue::Array(array)); + } else { + result.push_back(JsonValue::Array(vec![self.default.clone()])); + } + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove(&(self.bind_name.to_string() + &self.extra_rp.to_key())); + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + if let Some(json) = results.pop_front() { + json + } else { + panic!("missing bind result!"); + } + } +} + +/// Returns a relevency score for a match. 
+pub struct RetScore { + pub order_info: Option, +} + +impl Returnable for RetScore { + fn fetch_result(&self, + _fetcher: &mut JsonFetcher, + _seq: u64, + score: f32, + _bind_var_keys: &HashMap>, + result: &mut VecDeque) { + result.push_back(JsonValue::Number(score as f64)); + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + // noop + } + + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove("score()"); + } + + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); + } + + fn json_result(&self, results: &mut VecDeque) -> JsonValue { + if let Some(json) = results.pop_front() { + json + } else { + panic!("missing score result!"); + } + } +} diff --git a/src/snapshot.rs b/src/snapshot.rs new file mode 100644 index 0000000..9408691 --- /dev/null +++ b/src/snapshot.rs @@ -0,0 +1,430 @@ +use rocksdb::{self, DBIterator, Snapshot as RocksSnapshot, IteratorMode}; + +extern crate varint; + +use std::io::Cursor; +use std::str; +use std::mem::transmute; +use std::iter::Peekable; +use std::f32; + +use key_builder::{KeyBuilder, Segment}; +use query::{DocResult, QueryScoringInfo}; +use index::Index; +use returnable::{PathSegment, ReturnPath}; +use json_value::JsonValue; +use self::varint::VarintRead; + + +pub struct Snapshot<'a> { + rocks: RocksSnapshot<'a>, +} + +impl<'a> Snapshot<'a> { + pub fn new(rocks: RocksSnapshot) -> Snapshot { + Snapshot { rocks: rocks } + } + + pub fn new_term_doc_result_iterator(&self, term: &str, kb: &KeyBuilder) -> DocResultIterator { + DocResultIterator { + iter: self.rocks.iterator(IteratorMode::Start), + keypathword: kb.get_keypathword_only(&term), + } + + } + + pub fn get(&self, key: &[u8]) -> Option { + self.rocks.get(key).unwrap() + } + + pub fn new_scorer(&self, term: &str, kb: &KeyBuilder, boost: f32) -> Scorer { + Scorer { + iter: self.rocks.iterator(IteratorMode::Start), + idf: f32::NAN, + boost: boost, + kb: kb.clone(), + term: term.to_string(), + term_ordinal: 0, + } + } + + pub fn new_json_fetcher(&self) -> JsonFetcher { + JsonFetcher { iter: self.rocks.iterator(IteratorMode::Start) } + } + + pub fn new_iterator(&self) -> DBIterator { + self.rocks.iterator(IteratorMode::Start) + } + + pub fn new_all_docs_iterator(&self) -> AllDocsIterator { + let mut iter = self.rocks.iterator(IteratorMode::Start); + iter.set_mode(IteratorMode::From(b"S", rocksdb::Direction::Forward)); + AllDocsIterator { iter: iter } + } +} + +pub struct DocResultIterator { + iter: DBIterator, + keypathword: String, +} + +impl DocResultIterator { + pub fn advance_gte(&mut self, start: &DocResult) { + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + self.iter + .set_mode(IteratorMode::From(self.keypathword.as_bytes(), rocksdb::Direction::Forward)); + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + } + + pub fn next(&mut self) -> Option<(DocResult, TermPositions)> { + if let Some((key, value)) = self.iter.next() { + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. 
nothing left to do */ + return None; + } + + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + + Some((dr, TermPositions { pos: value.into_vec() })) + } else { + None + } + } +} + + +pub struct TermPositions { + pos: Vec, +} + +impl TermPositions { + pub fn positions(self) -> Vec { + let mut bytes = Cursor::new(self.pos); + let mut positions = Vec::new(); + while let Ok(pos) = bytes.read_unsigned_varint_32() { + positions.push(pos); + } + positions + } +} + +pub struct Scorer { + iter: DBIterator, + idf: f32, + boost: f32, + kb: KeyBuilder, + term: String, + term_ordinal: usize, +} + +impl Scorer { + pub fn init(&mut self, qsi: &mut QueryScoringInfo) { + let key = self.kb.keypathword_count_key(&self.term); + let doc_freq = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 + } else { + 0.0 + }; + + let key = self.kb.keypath_count_key(); + let num_docs = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 + } else { + 0.0 + }; + + self.idf = 1.0 + (num_docs / (doc_freq + 1.0)).ln(); + self.term_ordinal = qsi.num_terms; + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += self.idf * self.idf; + } + + pub fn get_value(&mut self, key: &str) -> Option> { + self.iter + .set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); + if let Some((ret_key, ret_value)) = self.iter.next() { + if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { + Some(ret_value) + } else { + None + } + } else { + None + } + } + + pub fn add_match_score(&mut self, num_matches: u32, dr: &mut DocResult) { + if self.should_score() { + let key = self.kb.field_length_key_from_doc_result(dr); + let total_field_words = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 + } else { + panic!("Couldn't find field length for a match!! 
WHAT!"); + }; + + let tf: f32 = (num_matches as f32).sqrt(); + let norm = 1.0 / (total_field_words as f32).sqrt(); + let score = self.idf * self.idf * tf * norm * self.boost; + dr.add_score(self.term_ordinal, score); + } + } + + pub fn should_score(&self) -> bool { + !self.idf.is_nan() + } +} + + +pub struct JsonFetcher { + iter: DBIterator, +} + +impl JsonFetcher { + pub fn fetch(&mut self, + seq: u64, + mut kb_base: &mut KeyBuilder, + rp: &ReturnPath) + -> Option { + JsonFetcher::descend_return_path(&mut self.iter, seq, &mut kb_base, &rp, 0) + } + + pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { + match bytes[0] as char { + 's' => { + let string = unsafe { str::from_utf8_unchecked(&bytes[1..]) }.to_string(); + JsonValue::String(string) + } + 'f' => { + assert!(bytes.len() == 9); + let mut bytes2: [u8; 8] = [0; 8]; + for (n, b) in bytes[1..9].iter().enumerate() { + bytes2[n] = *b; + } + let double: f64 = unsafe { transmute(bytes2) }; + JsonValue::Number(double) + } + 'T' => JsonValue::True, + 'F' => JsonValue::False, + 'N' => JsonValue::Null, + 'o' => JsonValue::Object(vec![]), + 'a' => JsonValue::Array(vec![]), + what => panic!("unexpected type tag in value: {}", what), + } + } + + fn return_array(mut array: Vec<(u64, JsonValue)>) -> JsonValue { + array.sort_by_key(|tuple| tuple.0); + JsonValue::Array(array.into_iter().map(|(_i, json)| json).collect()) + } + + fn descend_return_path(iter: &mut DBIterator, + seq: u64, + kb: &mut KeyBuilder, + rp: &ReturnPath, + mut rp_index: usize) + -> Option { + + while let Some(segment) = rp.nth(rp_index) { + rp_index += 1; + match segment { + &PathSegment::ObjectKey(ref string) => { + kb.push_object_key(string); + } + &PathSegment::ArrayAll => { + let mut i = 0; + let mut vec = Vec::new(); + loop { + kb.push_array_index(i); + i += 1; + if let Some(json) = JsonFetcher::descend_return_path(iter, + seq, + &mut kb.clone(), + rp, + rp_index) { + vec.push(json); + kb.pop_array(); + } else { + // we didn't get a value, is it because the array ends or the + // full path isn't there? check as there might be more array elements + // with a full path that does match. + let value_key = kb.value_key(seq); + kb.pop_array(); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + if let Some((key, _value)) = iter.next() { + if key.starts_with(value_key.as_bytes()) { + // yes it exists. loop again. + continue; + } + } + + if vec.is_empty() { + return None; + } else { + return Some(JsonValue::Array(vec)); + } + } + } + } + &PathSegment::Array(ref index) => { + kb.push_array_index(*index); + } + } + } + + let value_key = kb.value_key(seq); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => return None, + }; + + if !KeyBuilder::is_keypath_prefix(&value_key, unsafe { str::from_utf8_unchecked(&key) }) { + return None; + } + Some(JsonFetcher::do_fetch(&mut iter.peekable(), &value_key, key, value)) + } + + /// When do_fetch is called it means we know we are going to find a value because + /// we prefix matched the keypath. What we are doing here is parsing the remaining + /// keypath to figure out the nested structure of the remaining keypath. So we + /// depth first recursively parse the keypath and return the value and inserting into + /// containers (arrays or objects) then iterate keys until the keypath no longer matches. 
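The `Scorer` above implements a Lucene-style TF-IDF: `init` derives the inverse document frequency from the per-keypath document count and the term's document frequency, and `add_match_score` multiplies `idf²` by the square root of the match count, a field-length norm, and the query boost. The same arithmetic as a stand-alone sketch with made-up counts (none of these numbers come from the patch):

```
// idf as computed in Scorer::init: 1 + ln(num_docs / (doc_freq + 1)).
fn idf(num_docs: f32, doc_freq: f32) -> f32 {
    1.0 + (num_docs / (doc_freq + 1.0)).ln()
}

// Per-match score as computed in Scorer::add_match_score:
// idf^2 * sqrt(num_matches) * (1 / sqrt(field_length)) * boost.
fn score(idf: f32, num_matches: u32, field_length: u32, boost: f32) -> f32 {
    let tf = (num_matches as f32).sqrt();
    let norm = 1.0 / (field_length as f32).sqrt();
    idf * idf * tf * norm * boost
}

fn main() {
    // Hypothetical counts: 1000 docs under the keypath, the term occurs in 10
    // of them, and it matched twice in a 100-word field with no boost.
    let idf = idf(1000.0, 10.0);
    println!("score = {}", score(idf, 2, 100, 1.0));
}
```

A term that appears in few documents gets a larger idf, while matches in long fields are damped by the `1 / sqrt(field_length)` norm.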
+ fn do_fetch(iter: &mut Peekable<&mut DBIterator>, + value_key: &str, + mut key: Box<[u8]>, + mut value: Box<[u8]>) + -> JsonValue { + + if key.len() == value_key.len() { + // we have a key match! + return JsonFetcher::bytes_to_json_value(value.as_ref()); + } + let segment = { + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + let remaining = &key_str[value_key.len()..]; + KeyBuilder::parse_first_key_value_segment(&remaining) + }; + + match segment { + Some((Segment::ObjectKey(mut unescaped), escaped)) => { + let mut object: Vec<(String, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value); + object.push((unescaped, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + let key = unsafe { str::from_utf8_unchecked(k) }; + if !KeyBuilder::is_keypath_prefix(value_key, key) { + return JsonValue::Object(object); + } + + let key_str = unsafe { str::from_utf8_unchecked(&k) }; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + } + None => return JsonValue::Object(object), + }; + + if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { + unescaped = unescaped2; + // advance the peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + } + None => panic!("couldn't advanced already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return JsonValue::Object(object); + } + } + } + Some((Segment::Array(mut i), escaped)) => { + // we use a tuple with ordinal because we encounter + // elements in lexical sorting order instead of ordinal order + let mut array: Vec<(u64, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value); + array.push((i, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + let key = unsafe { str::from_utf8_unchecked(k) }; + if !KeyBuilder::is_keypath_prefix(value_key, key) { + return JsonFetcher::return_array(array); + } + + let key_str = unsafe { str::from_utf8_unchecked(&k) }; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + } + None => return JsonFetcher::return_array(array), + }; + + if let Some((Segment::Array(i2), escaped2)) = segment { + i = i2; + // advance the already peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + } + None => panic!("couldn't advanced already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return JsonFetcher::return_array(array); + } + } + } + None => { + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + panic!("somehow couldn't parse key segment {} {}", + value_key, + key_str); + } + } + } +} + +pub struct AllDocsIterator { + iter: DBIterator, +} + +impl AllDocsIterator { + pub fn next(&mut self) -> Option { + match self.iter.next() { + Some((k, _v)) => { + let key = unsafe { str::from_utf8_unchecked(&k) }; + if let Some(seq) = KeyBuilder::parse_seq_key(key) { + let mut dr = DocResult::new(); + dr.seq = seq; + Some(dr) + } else { + None + } + } + None => None, + } + } +} diff --git a/src/stems.rs b/src/stems.rs index 5946509..ee55a20 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -2,74 +2,85 @@ extern crate stemmer; extern crate 
unicode_normalization; extern crate unicode_segmentation; - use self::stemmer::Stemmer; use self::unicode_normalization::UnicodeNormalization; use self::unicode_segmentation::UnicodeSegmentation; pub struct Stems<'a> { - words: unicode_segmentation::UnicodeWordIndices<'a>, + words: unicode_segmentation::UWordBoundIndices<'a>, stemmer: Stemmer, + word_position: usize, } #[derive(Debug, PartialEq)] pub struct StemmedWord { // Where the stemmed word starts - pub stemmed_offset: usize, - // Where the suffix starts - pub suffix_offset: usize, + pub word_pos: u32, // The stemmed word pub stemmed: String, - // The difference between the stemmed word and the original lowercased one. It can be - // used to recontruct the original word (for exact match searches) - pub suffix: String, } impl<'a> Stems<'a> { pub fn new(text: &str) -> Stems { - Stems{ - words: text.unicode_word_indices(), + Stems { + words: text.split_word_bound_indices(), stemmer: Stemmer::new("english").unwrap(), + word_position: 0, } } - - /// Return the *byte* length of the common prefix between two strings - fn common_prefix_len(aa: &str, bb: &str) -> usize { - let mut count = 0; - for (charsa, charsb) in aa.chars().zip(bb.chars()) { - if charsa != charsb { - break; - } - count += charsa.len_utf8(); - } - count - } } impl<'a> Iterator for Stems<'a> { type Item = StemmedWord; fn next(&mut self) -> Option { - match self.words.next() { - Some((pos, word)) => { - let lowercased = word.to_lowercase(); - let normalized = lowercased.nfkc().collect::(); - let stemmed = self.stemmer.stem(&normalized); - let prefix_len = Stems::common_prefix_len(&stemmed, &normalized); - let ret = StemmedWord { - stemmed_offset: pos, - suffix_offset: pos + prefix_len, - stemmed: stemmed, - suffix: (&normalized[prefix_len..normalized.len()]).to_string(), - }; - Some(ret) - }, - None => None + // we loop though until we find alphabetic chars. That becomes our stem word. + let mut non_alpha = String::new(); // will contain any non-alphabetic chars + // returned iff no other alphabetic chars + while let Some((_pos, word)) = self.words.next() { + let normalized = word.nfkc().collect::(); + if normalized.chars().next().unwrap().is_alphabetic() { + let pos = self.word_position; + self.word_position += 1; + return Some(StemmedWord { + word_pos: pos as u32, + stemmed: self.stemmer.stem(&normalized.to_lowercase()), + }); + } else { + if self.word_position == 0 { + non_alpha.push_str(&normalized); + } + } + } + if non_alpha.is_empty() { + if self.word_position == 0 { + self.word_position = 1; + // in this case we were passed an empty string + // so we don't just return None, but we return + // an empty string Stemmed word. + // otherwise searching fields for empty strings + // wouldn't be possible. 
+ return Some(StemmedWord { + word_pos: 0, + stemmed: String::new(), + }); + } else { + return None; + } + } else { + if self.word_position == 0 { + self.word_position = 1; + return Some(StemmedWord { + word_pos: 0, + stemmed: non_alpha, + }); + } else { + return None; + } } } - } @@ -77,54 +88,39 @@ impl<'a> Iterator for Stems<'a> { mod tests { use super::{StemmedWord, Stems}; - #[test] - fn test_stems_lowercase() { - let input = "These words deeply test smoothly that stemming"; - let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 5, - stemmed: String::from("these"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 10, - stemmed: String::from("word"), suffix: String::from("s") }, - // "deeply" stems to "deepli" - StemmedWord { stemmed_offset: 12, suffix_offset: 17, - stemmed: String::from("deepli"), suffix: String::from("y") }, - StemmedWord { stemmed_offset: 19, suffix_offset: 23, - stemmed: String::from("test"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 24, suffix_offset: 30, - stemmed: String::from("smooth"), suffix: String::from("ly") }, - StemmedWord { stemmed_offset: 33, suffix_offset: 37, - stemmed: String::from("that"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 38, suffix_offset: 42, - stemmed: String::from("stem"), suffix: String::from("ming") }, - ]; - assert_eq!(result.len(), expected.len()); - for (stem, expected_stem) in result.iter().zip(expected.iter()) { - assert_eq!(stem, expected_stem); - } - } - #[test] fn test_stems_mixedcase() { let input = "THEse Words deeplY test smOOthly that stemmING"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 5, - stemmed: String::from("these"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 10, - stemmed: String::from("word"), suffix: String::from("s") }, - // "deeply" stems to "deepli" - StemmedWord { stemmed_offset: 12, suffix_offset: 17, - stemmed: String::from("deepli"), suffix: String::from("y") }, - StemmedWord { stemmed_offset: 19, suffix_offset: 23, - stemmed: String::from("test"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 24, suffix_offset: 30, - stemmed: String::from("smooth"), suffix: String::from("ly") }, - StemmedWord { stemmed_offset: 33, suffix_offset: 37, - stemmed: String::from("that"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 38, suffix_offset: 42, - stemmed: String::from("stem"), suffix: String::from("ming") }, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("these"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("word"), + }, + // "deeply" stems to "deepli" + StemmedWord { + word_pos: 2, + stemmed: String::from("deepli"), + }, + StemmedWord { + word_pos: 3, + stemmed: String::from("test"), + }, + StemmedWord { + word_pos: 4, + stemmed: String::from("smooth"), + }, + StemmedWord { + word_pos: 5, + stemmed: String::from("that"), + }, + StemmedWord { + word_pos: 6, + stemmed: String::from("stem"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -135,19 +131,25 @@ mod tests { fn test_stems_nonchars() { let input = " @#$!== \t+-"; let result = Stems::new(input).collect::>(); - assert_eq!(result.len(), 0); + assert_eq!(result, + vec![StemmedWord { + word_pos: 0, + stemmed: String::from(" @#$!== \t+-"), + }]); } 
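The rewritten `Stems` iterator replaces byte offsets and suffixes with ordinal word positions: each word boundary is NFKC-normalised, chunks that do not start with an alphabetic character are skipped (apart from the empty-string and all-punctuation special cases handled above), and the lowercased word is run through the English Snowball stemmer. A condensed sketch of that main path, reusing the crates the module already imports (`stem_words` is a hypothetical helper and ignores the special cases):

```
extern crate stemmer;
extern crate unicode_normalization;
extern crate unicode_segmentation;

use stemmer::Stemmer;
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

// Hypothetical helper mirroring the main path of Stems::next: keep only word
// boundaries that start with an alphabetic char, NFKC-normalise, lowercase,
// stem, and number the surviving words from zero.
fn stem_words(text: &str) -> Vec<(u32, String)> {
    let mut stemmer = Stemmer::new("english").unwrap();
    text.split_word_bound_indices()
        .map(|(_offset, word)| word.nfkc().collect::<String>())
        .filter(|word| word.chars().next().map_or(false, |c| c.is_alphabetic()))
        .enumerate()
        .map(|(pos, word)| (pos as u32, stemmer.stem(&word.to_lowercase())))
        .collect()
}
```

For example, `stem_words("THEse Words deeplY")` should yield `[(0, "these"), (1, "word"), (2, "deepli")]`, matching the expectations in `test_stems_mixedcase` above.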
#[test] fn test_stems_some_nonchars() { let input = "@!? Let's seeing..."; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 6, suffix_offset: 9, - stemmed: String::from("let"), suffix: String::from("'s") }, - StemmedWord { stemmed_offset: 12, suffix_offset: 15, - stemmed: String::from("see"), suffix: String::from("ing") }, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("let"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("see"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -158,12 +160,32 @@ mod tests { fn test_stems_unicode() { let input = "Ünicöde stemming"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 8, - stemmed: String::from("ünicöd"), suffix: String::from("e") }, - StemmedWord { stemmed_offset: 10, suffix_offset: 14, - stemmed: String::from("stem"), suffix: String::from("ming") }, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("ünicöd"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("stem"), + }]; + assert_eq!(result.len(), expected.len()); + for (stem, expected_stem) in result.iter().zip(expected.iter()) { + assert_eq!(stem, expected_stem); + } + } + + #[test] + fn test_stems_trailing_needs_normalized() { + let input = r#"Didgeridoos™"#; + let result = Stems::new(input).collect::>(); + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("didgeridoo"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("tm"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -174,10 +196,10 @@ mod tests { fn test_stems_unicode_lowercase_has_more_bytes() { let input = "İ"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 3, - stemmed: String::from("i̇"), suffix: String::from("") }, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("i̇"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -201,7 +223,8 @@ mod tests { // let upper = "Ρ̓"; // let lower = "ῤ"; // println!("lower({}) == {}: {}", upper, lower, upper.to_lowercase() == lower); - // println!("lower({}) == lower(upper({})): {}", upper, lower, upper.to_lowercase() == lower.to_uppercase().to_lowercase()); + // println!("lower({}) == lower(upper({})): {}", upper, lower, upper.to_lowercase() == + // lower.to_uppercase().to_lowercase()); // lower(Ρ̓) == ῤ: false // lower(Ρ̓) == lower(upper(ῤ)): true #[test] @@ -209,38 +232,13 @@ mod tests { // The input is: Ρ̓ῤῤ (11 bytes), lowercases is ῤῤῤ (9 bytes) let input = "\u{03A1}\u{0313}\u{03C1}\u{0313}\u{1FE4}"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 9, - stemmed: String::from("\u{1FE4}\u{1FE4}\u{1FE4}"), suffix: String::from("") }, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); } } - - #[test] - fn test_common_prefix_len() { - let tests = vec![ - ("a", "a", 1), - ("ab", 
"a", 1), - ("a", "ab", 1), - ("ab", "ab", 2), - ("a", "b", 0), - ("b", "a", 0), - ("ab", "cd", 0), - ("ab", "bc", 0), - ("abc", "abd", 2), - ("ac", "abcd", 1), - (" a", "a", 0), - ("a", "a ", 1), - ("xyzabc", "xyz", 3), - ("xyz", "xyzabc", 3), - ("öxyz", "öx", 3), - ]; - for (aa, bb, expected) in tests { - let prefix_len = Stems::common_prefix_len(aa, bb); - assert_eq!(prefix_len, expected); - } - } } diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs new file mode 100644 index 0000000..50f1b8a --- /dev/null +++ b/tests/repl_tests.rs @@ -0,0 +1,80 @@ +extern crate noise_search; + +use std::io::{Read, Write, BufReader}; +use std::fs::{self, File}; +use std::env; + +use noise_search::repl::repl; + +#[test] +fn test_repl() { + // We load up tests scripts from repl-tests and evaluate them. The output should be idenitical + // to the test script files. If not, then the test is failed and a new file is written with + // .reject extension in the same directory where it can be investigated. + + // To update the test files with new command and output, simply edit/add commands and run + // update-test-repl.sh script from the project root directory. Then examin or do a git diff to + // see if the output is as expected. + + let mut test_dir = env::current_dir().unwrap(); + test_dir.push("repl-tests"); + let mut failures = 0; + let mut total = 0; + // Sort files by last modified date to make debugging easier + let mut entries: Vec<_> = fs::read_dir(test_dir) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + entries.sort_by_key(|entry| entry.metadata().unwrap().modified().unwrap()); + for entry in entries { + let mut path = entry.path(); + if path.extension().unwrap().to_str().unwrap() != "noise" { + continue; + } + total += 1; + let test_name = path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); + println!("About to run test {} ", test_name); + let mut file = File::open(path.clone()).unwrap(); + let mut file_buffer = Vec::new(); + file.read_to_end(&mut file_buffer).unwrap(); + + let mut test_result_buffer = Vec::new(); + let file = File::open(path.clone()).unwrap(); + repl(&mut BufReader::new(file), &mut test_result_buffer, true); + + if file_buffer != test_result_buffer { + failures += 1; + path.set_extension("reject"); + let reject = path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); + + let mut file = File::create(path.clone()).unwrap(); + file.write_all(&test_result_buffer).unwrap(); + file.sync_all().unwrap(); + + println!("Repl test {} failure. Failing output written to {} in repl-tests dir.", + test_name, + reject); + } else { + println!("{} successful", + path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string()); + } + } + if total == 0 { + panic!("No tests were run!"); + } + if failures > 0 { + panic!("Failed {} tests in repl-test out of {}", failures, total); + } +} diff --git a/tests/rocksdb.rs b/tests/rocksdb.rs index 8c9fe3c..29a5e45 100644 --- a/tests/rocksdb.rs +++ b/tests/rocksdb.rs @@ -1,5 +1,5 @@ extern crate rocksdb; -use rocksdb::{DB, Writable}; +use rocksdb::DB; #[test] fn rocksdb_works() { diff --git a/update-test-repl.sh b/update-test-repl.sh new file mode 100755 index 0000000..31c2489 --- /dev/null +++ b/update-test-repl.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# This file runs the tests + +SCRIPTPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +DIRNAME="$(dirname ${SCRIPTPATH})" + +NOISE="${DIRNAME}/target/debug/noise_search" +REPL_TEST_DIR="${DIRNAME}/repl-tests" +cargo build +exit_status=$? 
+if [ $exit_status -ne 0 ]; then
+ exit $exit_status
+fi
+if [[ ! -f "${NOISE}" ]]; then
+ echo "Can't find noise binary, looked at ${NOISE}"
+ exit 1
+fi
+
+REPL_TESTS="${REPL_TEST_DIR}/*.noise"
+for f in $REPL_TESTS
+do
+ echo -n "Testing: ${f}..."
+ RUST_BACKTRACE=1 "${NOISE}" -t < "${f}" > "${f}.out"
+ echo "updating."
+ cp "${f}.out" "${f}"
+ rm "${f}.out"
+done
+
+echo "Updated tests. Use \`\`git diff ./repl-tests\`\` to review the changes."
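Stepping back to the repl test harness: each `.noise` script under `repl-tests` doubles as its own expected output, a mismatch leaves a `.reject` file beside it for inspection, and `update-test-repl.sh` above refreshes the golden `.noise` files in bulk. The compare-and-reject step in isolation, as a minimal sketch (`check_golden` is a hypothetical helper, not part of the patch):

```
use std::fs::File;
use std::io::{Read, Write};
use std::path::PathBuf;

// Hypothetical helper: compare freshly produced bytes against the golden file
// at `path`; on mismatch, write the new output beside it with a `.reject`
// extension and report failure, mirroring what test_repl does per script.
fn check_golden(path: &PathBuf, actual: &[u8]) -> bool {
    let mut expected = Vec::new();
    File::open(path).unwrap().read_to_end(&mut expected).unwrap();
    if expected.as_slice() == actual {
        return true;
    }
    let mut reject_path = path.clone();
    reject_path.set_extension("reject");
    File::create(&reject_path).unwrap().write_all(actual).unwrap();
    false
}
```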