summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRicardo Wurmus <rekado@elephly.net>2020-04-22 10:16:22 +0200
committerRicardo Wurmus <rekado@elephly.net>2020-04-22 10:16:22 +0200
commit33e5a5786dec56b691b1c6db59d607694f77ea68 (patch)
treec8ca736cd990c44d159bbb9ac827e47ea80cfdf7
parent3b7871b740046c14448ac87eea958423fb01959a (diff)
mumi: Add (mumi xapian).
* mumi/xapian.scm: New file. * Makefile.am (SOURCES): Add it.
-rw-r--r--Makefile.am3
-rw-r--r--mumi/xapian.scm178
2 files changed, 180 insertions, 1 deletions
diff --git a/Makefile.am b/Makefile.am
index aeeffee..1f54042 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -49,5 +49,6 @@ SOURCES = \
mumi/jobs.scm \
mumi/send-email.scm \
mumi/config.scm \
- mumi/debbugs.scm
+ mumi/debbugs.scm \
+ mumi/xapian.scm
diff --git a/mumi/xapian.scm b/mumi/xapian.scm
new file mode 100644
index 0000000..f7573c7
--- /dev/null
+++ b/mumi/xapian.scm
@@ -0,0 +1,178 @@
+;;; mumi -- Mediocre, uh, mail interface
+;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
+;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
+;;;
+;;; This program is free software: you can redistribute it and/or
+;;; modify it under the terms of the GNU Affero General Public License
+;;; as published by the Free Software Foundation, either version 3 of
+;;; the License, or (at your option) any later version.
+;;;
+;;; This program is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;; Affero General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU Affero General Public
+;;; License along with this program. If not, see
+;;; <http://www.gnu.org/licenses/>.
+
+(define-module (mumi xapian)
+ #:use-module (mumi config)
+ #:use-module (mumi debbugs)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 match)
+ #:use-module (ice-9 threads)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-11)
+ #:use-module (srfi srfi-19)
+ #:use-module (srfi srfi-26)
+ #:use-module (email email)
+ #:use-module (xapian wrap)
+ #:use-module (xapian xapian)
+ #:use-module (rnrs bytevectors)
+ #:export (index search))
+
+(define (parse-emails files)
+ (n-par-map 4
+ (lambda (file)
+ (list file
+ (call-with-input-file file
+ read-emails-from-bug-log)))
+ files))
+
+(define (author email)
+ (or (and=> (assoc-ref (email-headers email) 'from)
+ (match-lambda
+ ((from . rest)
+ (format #f "~a ~a"
+ (assoc-ref from 'name)
+ (assoc-ref from 'address)))
+ (from
+ (format #f "~a ~a"
+ (assoc-ref from 'name)
+ (assoc-ref from 'address)))))
+ "<unknown>"))
+
+(define* (index files #:key
+ (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
+ (define (index-chunk files+mails)
+ (call-with-writable-database
+ dbpath
+ (lambda (db)
+ (for-each
+ (match-lambda
+ ((file (? pair? emails))
+ (let* ((bugid (basename file ".log"))
+ (report (first emails))
+ (submitter (author report))
+ (subjects
+ (string-join
+ (delete-duplicates
+ (filter-map (lambda (email)
+ (assoc-ref (email-headers email) 'subject))
+ emails))))
+ (authors
+ (string-join (delete-duplicates (map author emails)) " "))
+ (date
+ (match (assoc-ref (email-headers report) 'date)
+ ((? string? s) #f)
+ (date date)))
+ (mdate
+ (match (assoc-ref (email-headers (last emails)) 'date)
+ ((? string? s) #f)
+ (date date)))
+ (text
+ (string-join (map (match-lambda
+ ((($ <mime-entity> headers body) ...)
+ (string-join (filter string? body) "\n"))
+ ((? bytevector? bv) (utf8->string bv))
+ ((? string? s) s))
+ (map email-body emails))
+ "\n"))
+
+ (idterm (string-append "Q" bugid))
+ (doc (make-document #:data bugid
+ #:terms `((,idterm . 0))
+ #:values
+ `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
+ (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")))))
+ (term-generator (make-term-generator #:stem (make-stem "en")
+ #:document doc)))
+ ;; Index fields with a suitable prefix. This allows for
+ ;; searching separate fields as in subject:foo,
+ ;; from:bar, etc.
+ (index-text! term-generator bugid #:prefix "B")
+ (index-text! term-generator submitter #:prefix "A")
+ (index-text! term-generator authors #:prefix "XA")
+ (index-text! term-generator subjects #:prefix "S")
+ (index-text! term-generator file #:prefix "F")
+
+ ;; Index subject and body without prefixes for general
+ ;; search.
+ (index-text! term-generator subjects)
+ (increase-termpos! term-generator)
+ (index-text! term-generator text)
+
+ ;; Add the document to the database. The unique idterm
+ ;; ensures each object ends up in the database only once
+ ;; no matter how many times we run the indexer.
+ (replace-document! db idterm doc)))
+ (_ #f)) ; ignore
+ files+mails))))
+
+ (let ((total (length files)))
+ (let loop ((files files))
+ (let-values (((chunk rest)
+ (if (>= (length files) 100)
+ (split-at files 100)
+ (values files '()))))
+ (format (current-error-port)
+ "indexing ~1,2f%~%"
+ (exact->inexact (* 100 (/ (- total (length rest)) total))))
+ (index-chunk (parse-emails chunk))
+ (unless (null? rest)
+ (loop rest))))))
+
+(define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
+ (let ((queryparser (new-QueryParser))
+ (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
+ (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
+ (QueryParser-set-stemmer queryparser stemmer)
+ (when stemming-strategy
+ (QueryParser-set-stemming-strategy queryparser stemming-strategy))
+ (for-each (match-lambda
+ ((field . prefix)
+ (QueryParser-add-prefix queryparser field prefix)))
+ prefixes)
+ (QueryParser-add-rangeprocessor queryparser date-range-processor)
+ (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
+ (let ((query (QueryParser-parse-query queryparser querystring)))
+ (delete-QueryParser queryparser)
+ query)))
+
+(define* (search querystring #:key
+ (pagesize 10)
+ (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
+ ;; Open database for reading. call-with-database automatically
+ ;; closes the database once we're done.
+ (call-with-database dbpath
+ (lambda (db)
+ (let (;; Parse querystring passing a stemmer and suitable
+ ;; prefixes for field search.
+ (query (parse-query* querystring
+ #:stemmer (make-stem "en")
+ #:prefixes '(("submitter" . "A")
+ ("author" . "XA")
+ ("subject" . "S")))))
+ ;; Fold over the results, return bug id.
+ (mset-fold (lambda (item acc)
+ (cons
+ (document-data (mset-item-document item))
+ acc))
+ '()
+ ;; Get an Enquire object from the database with the
+ ;; search results. Then, extract the MSet from the
+ ;; Enquire object.
+ (enquire-mset (enquire db query)
+ #:maximum-items pagesize))))))
+