diff --git a/spec/cadmium/lemmatizer.cr b/spec/cadmium/lemmatizer.cr
new file mode 100644
index 0000000..9ddf56f
--- /dev/null
+++ b/spec/cadmium/lemmatizer.cr
@@ -0,0 +1 @@
+require "./lemmatizer/*"
diff --git a/spec/cadmium/lemmatizer/wordnet_spec.cr b/spec/cadmium/lemmatizer/wordnet_spec.cr
new file mode 100644
index 0000000..c006b69
--- /dev/null
+++ b/spec/cadmium/lemmatizer/wordnet_spec.cr
@@ -0,0 +1,26 @@
+require "../../spec_helper"
+
+describe Cadmium::Lemmatizer::WordNetLemmatizer do
+  # The lemmatizer is used through module-level methods, so the module itself is the subject.
+  subject = Cadmium::Lemmatizer::WordNetLemmatizer
+
+  it "should perform lemmatization" do
+    subject.lemmatize("dogs").should eq("dog")
+    subject.lemmatize("churches").should eq("church")
+    subject.lemmatize("aardwolves").should eq("aardwolf")
+    subject.lemmatize("abaci").should eq("abacus")
+    subject.lemmatize("hardrock").should eq("hardrock")
+  end
+
+  it "should lemmatize with String#lemmatize" do
+    "dogs".lemmatize.should eq("dog")
+  end
+
+  it "should tokenize and lemmatize with String#tokenize_and_lemmatize" do
+    "My dogs are very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize.should eq(["dog", "fun", "play", "poodle"])
+  end
+
+  it "should tokenize and lemmatize including stopwords" do
+    "My dog is very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize(keep_stops: true).should eq(["my", "dog", "be", "very", "fun", "to", "play", "with", "and", "another", "thing", "he", "be", "a", "poodle"])
+  end
+end
diff --git a/src/cadmium/lemmatizer.cr b/src/cadmium/lemmatizer.cr
new file mode 100644
index 0000000..ff3fe2f
--- /dev/null
+++ b/src/cadmium/lemmatizer.cr
@@ -0,0 +1,21 @@
+require "./lemmatizer/*"
+
+module Cadmium
+  module StringExtension
+    # Returns the lemma of this string by delegating to `lemmatizer.lemmatize`.
+    #
+    # `lemmatizer` defaults to `Cadmium::Lemmatizer::WordNetLemmatizer`; any
+    # object with a compatible `lemmatize` method can be passed instead.
+    def lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer)
+      lemmatizer.lemmatize(self)
+    end
+
+    # Tokenizes this string and lemmatizes the tokens by delegating to
+    # `lemmatizer.tokenize_and_lemmatize`.
+    #
+    # `keep_stops` (default `false`) is forwarded to the lemmatizer unchanged.
+    def tokenize_and_lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer, keep_stops = false)
+      lemmatizer.tokenize_and_lemmatize(self, keep_stops)
+    end
+  end
+end
diff --git a/src/cadmium/lemmatizer/wordnet.cr b/src/cadmium/lemmatizer/wordnet.cr
new file mode 100644
index 0000000..910a44d
--- /dev/null
+++ b/src/cadmium/lemmatizer/wordnet.cr
@@ -0,0 +1,38 @@
+require "../wordnet"
+require "../i18n/stop_words"
+require "../tokenizer/aggressive_tokenizer"
+
+module Cadmium
+  module Lemmatizer
+    # Lemmatizer that looks candidate lemmas up through `WordNet.morphy`.
+    module WordNetLemmatizer
+      include Cadmium::I18n::StopWords
+
+      # Returns a lemma for `token`.
+      #
+      # Candidates come from `WordNet.morphy`, which also receives `pos` when
+      # one is given. The shortest candidate is returned; if there are none,
+      # `token` is returned unchanged.
+      def self.lemmatize(token, pos : Symbol | String? = nil)
+        lemmas = if pos
+                   WordNet.morphy(token, pos)
+                 else
+                   WordNet.morphy(token)
+                 end
+        return token if lemmas.empty?
+        lemmas.min_by(&.size)
+      end
+
+      # Downcases `text`, tokenizes it with `Cadmium::AggressiveTokenizer` and
+      # returns the lemma of each token, in order.
+      #
+      # Unless `keep_stops` is true, tokens found in `@@stop_words` are dropped.
+      # That check is made on the token itself, before it is lemmatized.
+      def self.tokenize_and_lemmatize(text, keep_stops = false)
+        tokens = Cadmium::AggressiveTokenizer.new.tokenize(text.downcase)
+        tokens = tokens.reject { |token| @@stop_words.includes?(token) } unless keep_stops
+        tokens.map { |token| lemmatize(token) }
+      end
+    end
+  end
+end