# Configurable text analyzer.
#
# A string is split into tokens by the tokenizer; each token is then run
# through the registered transformations, the substitution table and the
# weighted expansions, in that order.
class Analyzer
  # Starts with no transformations, substitutions or expansions, and a
  # tokenizer that calls String#split with no arguments.
  def initialize
    @expansions = []
    @transformations = []
    @substitutions = {}
    @tokenizer = lambda { |string| string.split }
  end

  # Replaces the tokenizer.
  #
  # @yieldparam string [String] the text to split
  # @yieldreturn [Array] the tokens
  # @raise [ArgumentError] if no block is given
  # @return [Proc] the stored tokenizer
  def tokenizer(&block)
    # Fail at registration time rather than with a NoMethodError on nil
    # when #tokenize is eventually called.
    raise ArgumentError, "tokenizer requires a block" unless block

    @tokenizer = block
  end

  # Registers an expansion. The block receives the processed token and
  # returns a variant, or nil/false when it does not apply to that token.
  #
  # @param cost [Numeric] weight recorded against every variant the block produces
  # @raise [ArgumentError] if no block is given
  # @return [Array] the list of registered [cost, block] pairs
  def expansion(cost=0.0, &block)
    raise ArgumentError, "expansion requires a block" unless block

    @expansions << [cost, block]
  end

  # Registers a one-for-one token replacement, looked up after all
  # transformations have been applied.
  #
  # @param input [Object] token to look for
  # @param output [Object] token to use instead
  # @return [Object] +output+
  def substitution(input, output)
    @substitutions[input] = output
  end

  alias_method :sub, :substitution

  # Registers a transformation. Transformations run in registration order,
  # each receiving the previous one's result.
  #
  # @raise [ArgumentError] if no block is given
  # @return [Array<Proc>] the list of registered transformations
  def transformation(&block)
    raise ArgumentError, "transformation requires a block" unless block

    @transformations << block
  end

  # Splits +string+ with the current tokenizer.
  #
  # @param string [String]
  # @return [Array] whatever the tokenizer returns
  def tokenize(string)
    @tokenizer.call(string)
  end

  # Runs a single token through the pipeline.
  #
  # @param token [Object] one token, as produced by the tokenizer
  # @return [Object, Array] the processed token when no expansion applies,
  #   otherwise a pair of the processed token and a Hash mapping each
  #   variant to the cost of the expansion that produced it
  def process_token(token)
    token = @transformations.reduce(token) { |current, transform| transform.call(current) }

    # A missing (or nil/false) substitution leaves the token untouched.
    token = @substitutions[token] || token

    # When two expansions yield the same variant, the later cost wins.
    variants = @expansions.each_with_object({}) do |(cost, expand), found|
      variant = expand.call(token)
      found[variant] = cost if variant
    end

    variants.empty? ? token : [token, variants]
  end

  # Tokenizes +string+ and processes every token.
  #
  # @param string [String]
  # @return [Array] one #process_token result per token, in order
  def analyze(string)
    tokenize(string).map { |token| process_token(token) }
  end
end
# Specs for Analyzer: each example configures a fresh instance and checks one
# stage of the pipeline (tokenize, transform, substitute, expand) or the
# order in which the stages run.
describe "An Analyzer" do
  before do
    @analyzer = Analyzer.new
  end

  it "can take a custom tokenizer" do
    @analyzer.tokenizer { |string| string.split(/\s+/) }
    @analyzer.tokenize("three blind mice").should == %w{three blind mice}

    # Scanning for word characters and apostrophes drops the hyphen, so
    # "bait-shop" comes out as two tokens.
    @analyzer.tokenizer { |string| string.scan(/[\w']+/) }
    @analyzer.tokenize("joe's bait-shop").should == %w{joe's bait shop}
  end

  it "can perform weighted term expansions" do
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }

    # A token with variants comes back as [token, {variant => cost}];
    # a token no expansion applies to comes back as-is.
    @analyzer.process_token("joe's").should == ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    @analyzer.process_token("boring").should == "boring"
  end

  it "can transform terms" do
    @analyzer.transformation { |word| word.reverse }
    @analyzer.process_token("123").should == "321"
  end

  it "can substitute terms" do
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == "and"
  end

  it "expands terms after substitutions" do
    # The expansion only matches the substituted form, and is registered
    # without a cost, so the default of 0.0 is reported.
    @analyzer.expansion { |word| "ampersand" if word == "and" }
    @analyzer.substitution("&", "and")

    @analyzer.process_token("&").should == ["and", {"ampersand" => 0.0}]
  end

  it "substitutes after transformations" do
    # "moe" only matches the "joe" substitution once the transformation
    # has turned the "m" into a "j".
    @analyzer.substitution("joe", "joseph")
    @analyzer.transformation { |word| word.tr('m', 'j') }

    @analyzer.process_token("moe").should == "joseph"
  end

  it "does phrases, if you know how to Enumerable#map" do
    @analyzer.sub("&", "and")
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }
    # This expansion returns an Array, so the variant key is itself an Array.
    @analyzer.expansion(3.0) { |word| word.split('-') if word =~ /-/ }
    @analyzer.expansion(0.1) { |word| word.tr('-', '') if word =~ /-/ }

    orig = "joe's sushi & bait-shop shack"
    analyzed = [
      ["joe's", {"joe" => 0.5, "joes" => 0.5}],
      "sushi",
      "and",
      ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}],
      "shack"
    ]
    @analyzer.analyze(orig).should == analyzed
  end
end