xapian: Add phrase-aware tokenizer.
author Ricardo Wurmus <rekado@elephly.net>
Sun, 10 May 2020 06:28:38 +0000 (08:28 +0200)
committer Ricardo Wurmus <rekado@elephly.net>
Sun, 10 May 2020 06:28:38 +0000 (08:28 +0200)
* mumi/xapian.scm (tokenize): New procedure.
(search): Use it instead of string-tokenize.
* tests/xapian.scm: Test it.

mumi/xapian.scm
tests/xapian.scm

index 5cb5ceeb9ba0d3e16ba23b32e967263766a4a107..cc5997d350fc07f47ea9a3b30ace9700b12c6e79 100644 (file)
@@ -205,6 +205,35 @@ messages and index their contents in the Xapian database at DBPATH."
       (delete-QueryParser queryparser)
       query)))
 
+(define (tokenize querystring)
+  "Split QUERYSTRING at spaces, but keep double-quoted phrases intact.
+Return the tokens in order; runs of spaces never yield empty tokens."
+  (define (flush chars tokens)
+    ;; CHARS holds the current word reversed; add it unless it is empty.
+    (if (null? chars) tokens (cons (reverse-list->string chars) tokens)))
+  (let ((intermediate
+         (string-fold (lambda (char result)
+                        (match result
+                          ;; Phrase!  Spaces do not split in here.
+                          ((#t previous rest)
+                           (list (not (char=? char #\")) ; end of phrase?
+                                 (cons char previous)
+                                 rest))
+                          ;; Everything else
+                          ((#f previous rest)
+                           (if (char=? char #\space)
+                               ;; end of word
+                               (list #f '() (flush previous rest))
+                               ;; continue word; a quote opens a phrase
+                               (list (char=? char #\")
+                                     (cons char previous) rest)))))
+                      '(#f () ())
+                      querystring)))
+    ;; The last word is still just a bunch of characters.
+    (match intermediate
+      ((_ last query)
+       (reverse (flush last query))))))
+
 (define* (search querystring #:key
                  (pagesize 1000)
                  (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
@@ -218,7 +247,7 @@ messages and index their contents in the Xapian database at DBPATH."
                                           (string-prefix? "mdate:" token))
                                       (sanitize-date-range token)
                                       token))
-                                (string-tokenize querystring))))
+                                (tokenize querystring))))
              ;; Parse querystring passing a stemmer and suitable
             ;; prefixes for field search.
             (query (parse-query* querystring*
index 607fdb88b3e064f1b3775deffb6fa36783ced3b8..04d1a4d0dc2035c11ae3ee90e5a836c296334d40 100644 (file)
@@ -68,4 +68,15 @@ given by REPLACEMENT."
             (time->datestamp 1m)
             (time->datestamp today))))
 
+(define tokenize
+  (@@ (mumi xapian) tokenize)) ; @@ reaches the binding even if unexported
+
+(test-equal "tokenize: keeps phrases intact 1"
+  '("subject:\"hello world\"" "how" "are" "you")
+  (tokenize "subject:\"hello world\" how are you"))
+
+(test-equal "tokenize: keeps phrases intact 2"
+  '("subject:\"hello world\"" "how" "\"are\"" "you")
+  (tokenize "subject:\"hello world\" how \"are\" you"))
+
 (test-end "xapian")