diff options
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | mumi/xapian.scm | 178 |
2 files changed, 180 insertions, 1 deletions
diff --git a/Makefile.am b/Makefile.am index aeeffee..1f54042 100644 --- a/Makefile.am +++ b/Makefile.am @@ -49,5 +49,6 @@ SOURCES = \ mumi/jobs.scm \ mumi/send-email.scm \ mumi/config.scm \ - mumi/debbugs.scm + mumi/debbugs.scm \ + mumi/xapian.scm diff --git a/mumi/xapian.scm b/mumi/xapian.scm new file mode 100644 index 0000000..f7573c7 --- /dev/null +++ b/mumi/xapian.scm @@ -0,0 +1,178 @@ +;;; mumi -- Mediocre, uh, mail interface +;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net> +;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net> +;;; +;;; This program is free software: you can redistribute it and/or +;;; modify it under the terms of the GNU Affero General Public License +;;; as published by the Free Software Foundation, either version 3 of +;;; the License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; Affero General Public License for more details. +;;; +;;; You should have received a copy of the GNU Affero General Public +;;; License along with this program. If not, see +;;; <http://www.gnu.org/licenses/>. + +(define-module (mumi xapian) + #:use-module (mumi config) + #:use-module (mumi debbugs) + #:use-module (ice-9 format) + #:use-module (ice-9 match) + #:use-module (ice-9 threads) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-11) + #:use-module (srfi srfi-19) + #:use-module (srfi srfi-26) + #:use-module (email email) + #:use-module (xapian wrap) + #:use-module (xapian xapian) + #:use-module (rnrs bytevectors) + #:export (index search)) + +(define (parse-emails files) + (n-par-map 4 + (lambda (file) + (list file + (call-with-input-file file + read-emails-from-bug-log))) + files)) + +(define (author email) + (or (and=> (assoc-ref (email-headers email) 'from) + (match-lambda + ((from . rest) + (format #f "~a ~a" + (assoc-ref from 'name) + (assoc-ref from 'address))) + (from + (format #f "~a ~a" + (assoc-ref from 'name) + (assoc-ref from 'address))))) + "<unknown>")) + +(define* (index files #:key + (dbpath (string-append (%config 'db-dir) "/mumi.xapian"))) + (define (index-chunk files+mails) + (call-with-writable-database + dbpath + (lambda (db) + (for-each + (match-lambda + ((file (? pair? emails)) + (let* ((bugid (basename file ".log")) + (report (first emails)) + (submitter (author report)) + (subjects + (string-join + (delete-duplicates + (filter-map (lambda (email) + (assoc-ref (email-headers email) 'subject)) + emails)))) + (authors + (string-join (delete-duplicates (map author emails)) " ")) + (date + (match (assoc-ref (email-headers report) 'date) + ((? string? s) #f) + (date date))) + (mdate + (match (assoc-ref (email-headers (last emails)) 'date) + ((? string? s) #f) + (date date))) + (text + (string-join (map (match-lambda + ((($ <mime-entity> headers body) ...) + (string-join (filter string? body) "\n")) + ((? bytevector? bv) (utf8->string bv)) + ((? string? s) s)) + (map email-body emails)) + "\n")) + + (idterm (string-append "Q" bugid)) + (doc (make-document #:data bugid + #:terms `((,idterm . 0)) + #:values + `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101")) + (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101"))))) + (term-generator (make-term-generator #:stem (make-stem "en") + #:document doc))) + ;; Index fields with a suitable prefix. This allows for + ;; searching separate fields as in subject:foo, + ;; from:bar, etc. + (index-text! term-generator bugid #:prefix "B") + (index-text! term-generator submitter #:prefix "A") + (index-text! term-generator authors #:prefix "XA") + (index-text! term-generator subjects #:prefix "S") + (index-text! term-generator file #:prefix "F") + + ;; Index subject and body without prefixes for general + ;; search. + (index-text! term-generator subjects) + (increase-termpos! term-generator) + (index-text! term-generator text) + + ;; Add the document to the database. The unique idterm + ;; ensures each object ends up in the database only once + ;; no matter how many times we run the indexer. + (replace-document! db idterm doc))) + (_ #f)) ; ignore + files+mails)))) + + (let ((total (length files))) + (let loop ((files files)) + (let-values (((chunk rest) + (if (>= (length files) 100) + (split-at files 100) + (values files '())))) + (format (current-error-port) + "indexing ~1,2f%~%" + (exact->inexact (* 100 (/ (- total (length rest)) total)))) + (index-chunk (parse-emails chunk)) + (unless (null? rest) + (loop rest)))))) + +(define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '())) + (let ((queryparser (new-QueryParser)) + (date-range-processor (new-DateRangeProcessor 0 "date:" 0)) + (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0))) + (QueryParser-set-stemmer queryparser stemmer) + (when stemming-strategy + (QueryParser-set-stemming-strategy queryparser stemming-strategy)) + (for-each (match-lambda + ((field . prefix) + (QueryParser-add-prefix queryparser field prefix))) + prefixes) + (QueryParser-add-rangeprocessor queryparser date-range-processor) + (QueryParser-add-rangeprocessor queryparser mdate-range-processor) + (let ((query (QueryParser-parse-query queryparser querystring))) + (delete-QueryParser queryparser) + query))) + +(define* (search querystring #:key + (pagesize 10) + (dbpath (string-append (%config 'db-dir) "/mumi.xapian"))) + ;; Open database for reading. call-with-database automatically + ;; closes the database once we're done. + (call-with-database dbpath + (lambda (db) + (let (;; Parse querystring passing a stemmer and suitable + ;; prefixes for field search. + (query (parse-query* querystring + #:stemmer (make-stem "en") + #:prefixes '(("submitter" . "A") + ("author" . "XA") + ("subject" . "S"))))) + ;; Fold over the results, return bug id. + (mset-fold (lambda (item acc) + (cons + (document-data (mset-item-document item)) + acc)) + '() + ;; Get an Enquire object from the database with the + ;; search results. Then, extract the MSet from the + ;; Enquire object. + (enquire-mset (enquire db query) + #:maximum-items pagesize)))))) + |