diff options
-rw-r--r-- | mumi/xapian.scm | 31 | ||||
-rw-r--r-- | tests/xapian.scm | 11 |
2 files changed, 41 insertions, 1 deletion
diff --git a/mumi/xapian.scm b/mumi/xapian.scm
index 5cb5cee..cc5997d 100644
--- a/mumi/xapian.scm
+++ b/mumi/xapian.scm
@@ -205,6 +205,35 @@ messages and index their contents in the Xapian database at DBPATH."
       (delete-QueryParser queryparser)
       query)))
 
+(define (tokenize querystring)
+  "Split QUERYSTRING at whitespace, keeping double-quoted phrases intact;
+return the list of non-empty tokens in their original order."
+  (let ((intermediate
+         (string-fold (lambda (char result)
+                        (match result
+                          ;; Inside a phrase: only a closing quote ends it.
+                          ((#t previous rest)
+                           (list (not (eq? char #\"))    ; end of phrase?
+                                 (cons char previous)
+                                 rest))
+                          ;; Outside a phrase: whitespace ends the word.
+                          ((#f previous rest)
+                           (if (char-whitespace? char)
+                               ;; End of word; runs of whitespace yield "".
+                               (list #f '()
+                                     (cons (apply string (reverse previous))
+                                           rest))
+                               ;; Continue word; a quote opens a phrase.
+                               (list (eq? char #\")
+                                     (cons char previous) rest)))))
+                      '(#f () ())
+                      querystring)))
+    ;; Flush the last word, then drop the empty tokens.
+    (match intermediate
+      ((_ last query)
+       (delete "" (reverse (cons (apply string (reverse last))
+                                 query)))))))
+
 (define* (search querystring
                  #:key (pagesize 1000)
                  (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
@@ -218,7 +247,7 @@ messages and index their contents in the Xapian database at DBPATH."
                                   (string-prefix? "mdate:" token))
                               (sanitize-date-range token)
                               token))
-                        (string-tokenize querystring))))
+                        (tokenize querystring))))
            ;; Parse querystring passing a stemmer and suitable
            ;; prefixes for field search.
            (query (parse-query* querystring*
diff --git a/tests/xapian.scm b/tests/xapian.scm
index 607fdb8..04d1a4d 100644
--- a/tests/xapian.scm
+++ b/tests/xapian.scm
@@ -68,4 +68,15 @@ given by REPLACEMENT."
(time->datestamp 1m) (time->datestamp today)))) +(define tokenize + (@@ (mumi xapian) tokenize)) + +(test-equal "tokenize: keeps phrases intact 1" + (tokenize "subject:\"hello world\" how are you") + '("subject:\"hello world\"" "how" "are" "you")) + +(test-equal "tokenize: keeps phrases intact 2" + (tokenize "subject:\"hello world\" how \"are\" you") + '("subject:\"hello world\"" "how" "\"are\"" "you")) + (test-end "xapian") |