summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Ricardo Wurmus <rekado@elephly.net> 2020-05-10 08:28:38 +0200
committer Ricardo Wurmus <rekado@elephly.net> 2020-05-10 08:28:38 +0200
commit b6f26c77e1e6756909e47d143ff2a1766454c475 (patch)
tree a7205c654b0a3e56362c809b9b768a62d1fd4a8b
parent 43a931ace99e154124b855b5468ea7b10582bfdb (diff)
xapian: Add phrase-aware tokenizer.
* mumi/xapian.scm (tokenize): New procedure.
(search): Use it instead of string-tokenize.
* tests/xapian.scm: Test it.
-rw-r--r-- mumi/xapian.scm 31
-rw-r--r-- tests/xapian.scm 11
2 files changed, 41 insertions, 1 deletion
diff --git a/mumi/xapian.scm b/mumi/xapian.scm
index 5cb5cee..cc5997d 100644
--- a/mumi/xapian.scm
+++ b/mumi/xapian.scm
@@ -205,6 +205,35 @@ messages and index their contents in the Xapian database at DBPATH."
(delete-QueryParser queryparser)
query)))
+(define (tokenize querystring)
+  "Split QUERYSTRING into a list of tokens at whitespace, but keep
+double-quoted phrases intact, including the quotes and any whitespace
+between them.  Runs of whitespace never produce empty tokens, so an
+empty or all-blank QUERYSTRING yields the empty list.  An unterminated
+phrase extends to the end of QUERYSTRING."
+  (define (flush chars tokens)
+    ;; CHARS holds the current token's characters in reverse order.
+    ;; Push the token onto TOKENS unless it is empty.
+    (if (null? chars)
+        tokens
+        (cons (list->string (reverse chars)) tokens)))
+  ;; The fold state is (IN-PHRASE? CHARS TOKENS): whether we are between
+  ;; double quotes, the reversed characters of the token being built,
+  ;; and the reversed list of finished tokens.
+  (match (string-fold
+          (lambda (char state)
+            (match state
+              ;; Phrase!  Keep every character, whitespace included;
+              ;; the closing quote ends the phrase.
+              ((#t chars tokens)
+               (list (not (char=? char #\"))
+                     (cons char chars)
+                     tokens))
+              ;; Everything else
+              ((#f chars tokens)
+               (if (char-whitespace? char)
+                   ;; end of word
+                   (list #f '() (flush chars tokens))
+                   ;; continue word; a quote opens a phrase
+                   (list (char=? char #\")
+                         (cons char chars)
+                         tokens)))))
+          '(#f () ())
+          querystring)
+    ;; The last word is still just a bunch of characters.
+    ((_ chars tokens)
+     (reverse (flush chars tokens)))))
+
(define* (search querystring #:key
(pagesize 1000)
(dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
@@ -218,7 +247,7 @@ messages and index their contents in the Xapian database at DBPATH."
(string-prefix? "mdate:" token))
(sanitize-date-range token)
token))
- (string-tokenize querystring))))
+ (tokenize querystring))))
;; Parse querystring passing a stemmer and suitable
;; prefixes for field search.
(query (parse-query* querystring*
diff --git a/tests/xapian.scm b/tests/xapian.scm
index 607fdb8..04d1a4d 100644
--- a/tests/xapian.scm
+++ b/tests/xapian.scm
@@ -68,4 +68,15 @@ given by REPLACEMENT."
(time->datestamp 1m)
(time->datestamp today))))
+;; `@@' fetches the binding from (mumi xapian) whether or not the module
+;; exports it, so the procedure can be tested directly.
+(define tokenize
+  (@@ (mumi xapian) tokenize))
+
+;; SRFI-64's `test-equal' takes the expected value first and the
+;; expression under test second; keeping that order makes failure
+;; reports label the two values correctly.
+(test-equal "tokenize: keeps phrases intact 1"
+  '("subject:\"hello world\"" "how" "are" "you")
+  (tokenize "subject:\"hello world\" how are you"))
+
+(test-equal "tokenize: keeps phrases intact 2"
+  '("subject:\"hello world\"" "how" "\"are\"" "you")
+  (tokenize "subject:\"hello world\" how \"are\" you"))
+
(test-end "xapian")