001 (ns cc.journeyman.speechio.canonicalise
002 "Replace words and phrases in provided input with preferred equivalents
003 drawn from a thesaurus.
004
005 Broadly, the problem with allowing unconstrained
006 input is that the words that users choose to use for things we actually
007 know about may not be our preferred words. This namespace is intended to
008 provide a mechanism to canonicalise phrases and words in the input.
009
010 The input is expected to be a lispified Stanford NLP parse tree; the
011 output is expected to be a new parse tree, with the difference being that
012 the returned tree may have nodes marked :AMBIG, whose children are different
013 possible canonical interpretations."
014 (:import [clojure.lang Atom PersistentArrayMap]))
015
016 (defprotocol Thesaurus
017 (lookup [this speech-part] "Return the canonical expression equivalent to
018 this `speech-part`, or just this `speech-part`
019 if none are found. `speech-part` is expected to
020 be a lispified (see `as-lisp`) Stanford NLP parse
021 tree.
022
023 If multiple canonical equivalents are found, return
024 a new node whose `:label` value is `:AMBIG` and whose
025 `:children` value is a list of the possible
026 equivalents."))
027
028 (extend Atom
029 Thesaurus
030 ;; It's highly likely that in practice we're going to want to add to
031 ;; thesauri, and, at least during development, to do so without restarting.
032 ;; So it's probably best to be able to look 'through' an atom, without
033 ;; worrying too much about what that atom is bound to.
034 {:lookup (fn [this speechpart]
035 (lookup (deref this) speechpart))})
036
037 (extend PersistentArrayMap
038 Thesaurus
039 {:lookup (fn [this speechpart]
040 (let [k (if (keyword? speechpart) speechpart (str speechpart))
041 l (:label speechpart)
042 v (this k)]
043 (cond v v
044 (map? speechpart)
045 (assoc speechpart
046 :label (or (this l) l)
047 :children (mapv #(lookup this %)
048 (:children speechpart)))
049 :else speechpart)))})
050
051 (def default-thesaurus-content
052 "For testing and development only!"
053 {;; we'll assume that when a user talks about a smith without other
054 ;; qualification they mean a sword smith.
055 :smith :swordsmith
056 ;; we'll assume that any blacksmith can make a sword (although perhaps not
057 ;; a good one). This is obviously not a good assumption if the user wants
058 ;; their horse shod, but this thesaurus is just to play with ideas.
059 :blacksmith :swordsmith
060 ;; the canonical form of `swordsmith` is not hyphenated.
061 :sword-smith :swordsmith
062 ;; if the user says "the sword smith", we'll get a noun phrase
063 "{:label :NP, :children [{:label :DT, :children [{:label :the, :children []}]} {:label :NN, :children [{:label :sword, :children []}]} {:label :NN, :children [{:label :smith, :children []}]}]}" :swordsmith})
064
065 (def test-thesaurus (atom default-thesaurus-content))
066