The idea is to take a phrase and analyze it for use in Information Retrieval. We need to tokenize it into words, possibly transform some of the tokens, and possibly expand some tokens into variants or subphrases. This class lets you register blocks to perform tokenization, transformations, and expansions, and literal input/output pairs to perform substitutions. Expansions can take a numerical value representing the cost of the operation; this is intended for raising or lowering the scores of matches in a hypothetical IR application.
Given the phrase “joe’s sushi & bait-shop shack”, assume I want to tokenize on whitespace, replace the ampersand with the word “and”, and create word variants for the hyphenated words and the words containing apostrophes. See the last spec for an example of the Ruby data structure this class generates.
# Turns a phrase into a list of processed tokens.
#
# Processing is configured by registering:
# * a tokenizer block (defaults to String#split with no arguments),
# * transformation blocks, applied to each token in registration order,
# * substitutions, literal token => replacement pairs looked up after the
#   transformations have run,
# * expansion blocks, each with a cost, which may yield a variant of the
#   final token.
class Analyzer
  def initialize
    @expansions = []
    @transformations = []
    @substitutions = {}
    @tokenizer = lambda { |string| string.split }
  end

  # Replaces the tokenizer. The block receives the input string and its
  # return value is what #tokenize returns.
  #
  # Raises ArgumentError when no block is given, so a missing block is
  # reported here instead of as a NoMethodError on nil inside #tokenize.
  def tokenizer(&block)
    raise ArgumentError, "tokenizer requires a block" unless block
    @tokenizer = block
  end

  # Registers an expansion. The block receives the processed token and
  # returns a variant, or nil/false when it does not apply. +cost+ is stored
  # as the value for that variant in the hash built by #process_token.
  #
  # Raises ArgumentError when no block is given.
  def expansion(cost=0.0, &block)
    raise ArgumentError, "expansion requires a block" unless block
    @expansions << [cost, block]
  end

  # Registers a literal replacement: a token equal to +input+ (after
  # transformations) becomes +output+.
  def substitution(input, output)
    @substitutions[input] = output
  end
  alias_method :sub, :substitution

  # Registers a transformation. The block receives a token and returns the
  # token to use in its place.
  #
  # Raises ArgumentError when no block is given.
  def transformation(&block)
    raise ArgumentError, "transformation requires a block" unless block
    @transformations << block
  end

  # Runs the current tokenizer on +string+ and returns its result.
  def tokenize(string)
    @tokenizer.call(string)
  end

  # Processes a single token: transformations first, then substitution,
  # then expansions.
  #
  # Returns the final token when no expansion produced a variant, otherwise
  # a two-element array [token, {variant => cost}].
  def process_token(token)
    # Each transformation is fed the output of the previous one.
    token = @transformations.inject(token) do |current, transform|
      transform.call(current)
    end

    # A nil/false replacement leaves the token untouched.
    token = @substitutions[token] || token

    variants = {}
    @expansions.each do |cost, expand|
      variant = expand.call(token)
      # When two expansions yield the same variant, the later cost wins.
      variants[variant] = cost if variant
    end

    variants.empty? ? token : [token, variants]
  end

  # Tokenizes +string+ and returns the array of processed tokens, one entry
  # per token in the shape produced by #process_token.
  def analyze(string)
    tokenize(string).map { |token| process_token(token) }
  end
end
# Specs for Analyzer. Each example builds on a fresh instance so that
# registrations made in one example do not leak into another.
describe "An Analyzer" do
  before do
    @analyzer = Analyzer.new
  end

  it "can take a custom tokenizer" do
    @analyzer.tokenizer { |string| string.split(/\s+/) }
    @analyzer.tokenize("three blind mice").should == %w{three blind mice}

    # Registering a second tokenizer replaces the first one.
    @analyzer.tokenizer { |string| string.scan(/[\w']+/) }
    @analyzer.tokenize("joe's bait-shop").should == %w{joe's bait shop}
  end

  it "can perform weighted term expansions" do
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }
    @analyzer.process_token("joe's").should == ["joe's", {"joe" => 0.5, "joes" => 0.5}]

    # A token that no expansion applies to comes back as a bare string.
    @analyzer.process_token("boring").should == "boring"
  end

  it "can transform terms" do
    @analyzer.transformation { |word| word.reverse }
    @analyzer.process_token("123").should == "321"
  end

  it "can substitute terms" do
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == "and"
  end

  it "expands terms after substitutions" do
    # The expansion only matches the substituted word, not the raw "&".
    @analyzer.expansion { |word| "ampersand" if word == "and" }
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == ["and", {"ampersand" => 0.0}]
  end

  it "substitutes after transformations" do
    # "moe" only matches the "joe" substitution once it has been transformed.
    @analyzer.substitution("joe", "joseph")
    @analyzer.transformation { |word| word.tr('m', 'j') }
    @analyzer.process_token("moe").should == "joseph"
  end

  it "does phrases, if you know how to Enumerable#map" do
    @analyzer.sub("&", "and")
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }
    @analyzer.expansion(3.0) { |word| word.split('-') if word =~ /-/ }
    @analyzer.expansion(0.1) { |word| word.tr('-', '') if word =~ /-/ }

    orig = "joe's sushi & bait-shop shack"
    # Expanded tokens are [token, {variant => cost}] pairs; a variant may
    # itself be an array of words (a subphrase).
    analyzed = [
      ["joe's", {"joe" => 0.5, "joes" => 0.5}],
      "sushi",
      "and",
      ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}],
      "shack"
    ]
    @analyzer.analyze(orig).should == analyzed
  end
end