mumi: Add (mumi xapian).
authorRicardo Wurmus <rekado@elephly.net>
Wed, 22 Apr 2020 08:16:22 +0000 (10:16 +0200)
committerRicardo Wurmus <rekado@elephly.net>
Wed, 22 Apr 2020 08:16:22 +0000 (10:16 +0200)
* mumi/xapian.scm: New file.
* Makefile.am (SOURCES): Add it.

Makefile.am
mumi/xapian.scm [new file with mode: 0644]

index aeeffee..1f54042 100644 (file)
@@ -49,5 +49,6 @@ SOURCES =                                                     \
   mumi/jobs.scm                                                \
   mumi/send-email.scm                          \
   mumi/config.scm                                      \
-  mumi/debbugs.scm
+  mumi/debbugs.scm                                     \
+  mumi/xapian.scm
 
diff --git a/mumi/xapian.scm b/mumi/xapian.scm
new file mode 100644 (file)
index 0000000..f7573c7
--- /dev/null
@@ -0,0 +1,178 @@
+;;; mumi -- Mediocre, uh, mail interface
+;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
+;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
+;;;
+;;; This program is free software: you can redistribute it and/or
+;;; modify it under the terms of the GNU Affero General Public License
+;;; as published by the Free Software Foundation, either version 3 of
+;;; the License, or (at your option) any later version.
+;;;
+;;; This program is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;; Affero General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU Affero General Public
+;;; License along with this program.  If not, see
+;;; <http://www.gnu.org/licenses/>.
+
+(define-module (mumi xapian)
+  #:use-module (mumi config)
+  #:use-module (mumi debbugs)
+  #:use-module (ice-9 format)
+  #:use-module (ice-9 match)
+  #:use-module (ice-9 threads)
+  #:use-module (srfi srfi-1)
+  #:use-module (srfi srfi-11)
+  #:use-module (srfi srfi-19)
+  #:use-module (srfi srfi-26)
+  #:use-module (email email)
+  #:use-module (xapian wrap)
+  #:use-module (xapian xapian)
+  #:use-module (rnrs bytevectors)
+  #:export (index search))
+
+(define (parse-emails files)
+  (n-par-map 4
+             (lambda (file)
+               (list file
+                     (call-with-input-file file
+                       read-emails-from-bug-log)))
+             files))
+
+(define (author email)
+  (or (and=> (assoc-ref (email-headers email) 'from)
+             (match-lambda
+               ((from . rest)
+                (format #f "~a ~a"
+                        (assoc-ref from 'name)
+                        (assoc-ref from 'address)))
+               (from
+                (format #f "~a ~a"
+                        (assoc-ref from 'name)
+                        (assoc-ref from 'address)))))
+      "<unknown>"))
+
+(define* (index files #:key
+                (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
+  (define (index-chunk files+mails)
+    (call-with-writable-database
+     dbpath
+     (lambda (db)
+       (for-each
+        (match-lambda
+          ((file (? pair? emails))
+           (let* ((bugid (basename file ".log"))
+                  (report (first emails))
+                  (submitter (author report))
+                  (subjects
+                   (string-join
+                    (delete-duplicates
+                     (filter-map (lambda (email)
+                                   (assoc-ref (email-headers email) 'subject))
+                                 emails))))
+                  (authors
+                   (string-join (delete-duplicates (map author emails)) " "))
+                  (date
+                   (match (assoc-ref (email-headers report) 'date)
+                     ((? string? s) #f)
+                     (date date)))
+                  (mdate
+                   (match (assoc-ref (email-headers (last emails)) 'date)
+                     ((? string? s) #f)
+                     (date date)))
+                  (text
+                   (string-join (map (match-lambda
+                                       ((($ <mime-entity> headers body) ...)
+                                        (string-join (filter string? body) "\n"))
+                                       ((? bytevector? bv) (utf8->string bv))
+                                       ((? string? s) s))
+                                     (map email-body emails))
+                                "\n"))
+
+                  (idterm (string-append "Q" bugid))
+                  (doc (make-document #:data bugid
+                                      #:terms `((,idterm . 0))
+                                      #:values
+                                      `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
+                                        (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")))))
+                  (term-generator (make-term-generator #:stem (make-stem "en")
+                                                       #:document doc)))
+             ;; Index fields with a suitable prefix. This allows for
+             ;; searching separate fields as in subject:foo,
+             ;; from:bar, etc.
+             (index-text! term-generator bugid #:prefix "B")
+             (index-text! term-generator submitter #:prefix "A")
+             (index-text! term-generator authors #:prefix "XA")
+             (index-text! term-generator subjects #:prefix "S")
+             (index-text! term-generator file #:prefix "F")
+
+             ;; Index subject and body without prefixes for general
+             ;; search.
+             (index-text! term-generator subjects)
+             (increase-termpos! term-generator)
+             (index-text! term-generator text)
+
+             ;; Add the document to the database. The unique idterm
+             ;; ensures each object ends up in the database only once
+             ;; no matter how many times we run the indexer.
+             (replace-document! db idterm doc)))
+          (_ #f))                       ; ignore
+        files+mails))))
+
+  (let ((total (length files)))
+    (let loop ((files files))
+      (let-values (((chunk rest)
+                    (if (>= (length files) 100)
+                        (split-at files 100)
+                        (values files '()))))
+        (format (current-error-port)
+                "indexing ~1,2f%~%"
+                (exact->inexact (* 100 (/ (- total (length rest)) total))))
+        (index-chunk (parse-emails chunk))
+        (unless (null? rest)
+          (loop rest))))))
+
+(define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
+  (let ((queryparser (new-QueryParser))
+        (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
+        (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
+    (QueryParser-set-stemmer queryparser stemmer)
+    (when stemming-strategy
+      (QueryParser-set-stemming-strategy queryparser stemming-strategy))
+    (for-each (match-lambda
+                ((field . prefix)
+                 (QueryParser-add-prefix queryparser field prefix)))
+              prefixes)
+    (QueryParser-add-rangeprocessor queryparser date-range-processor)
+    (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
+    (let ((query (QueryParser-parse-query queryparser querystring)))
+      (delete-QueryParser queryparser)
+      query)))
+
+(define* (search querystring #:key
+                 (pagesize 10)
+                 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
+  ;; Open database for reading. call-with-database automatically
+  ;; closes the database once we're done.
+  (call-with-database dbpath
+    (lambda (db)
+      (let (;; Parse querystring passing a stemmer and suitable
+            ;; prefixes for field search.
+            (query (parse-query* querystring
+                                 #:stemmer (make-stem "en")
+                                 #:prefixes '(("submitter" . "A")
+                                              ("author" . "XA")
+                                              ("subject" . "S")))))
+        ;; Fold over the results, return bug id.
+        (mset-fold (lambda (item acc)
+                     (cons
+                      (document-data (mset-item-document item))
+                      acc))
+                   '()
+                   ;; Get an Enquire object from the database with the
+                   ;; search results. Then, extract the MSet from the
+                   ;; Enquire object.
+                   (enquire-mset (enquire db query)
+                                 #:maximum-items pagesize))))))
+