;;; mumi -- Mediocre, uh, mail interface ;;; Copyright © 2020, 2022 Ricardo Wurmus ;;; Copyright © 2020, 2022 Arun Isaac ;;; ;;; This program is free software: you can redistribute it and/or ;;; modify it under the terms of the GNU Affero General Public License ;;; as published by the Free Software Foundation, either version 3 of ;;; the License, or (at your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Affero General Public License for more details. ;;; ;;; You should have received a copy of the GNU Affero General Public ;;; License along with this program. If not, see ;;; . (define-module (mumi xapian) #:use-module (mumi config) #:use-module (mumi debbugs) #:use-module ((mumi web util) #:select (msgid-hash)) #:use-module (ice-9 format) #:use-module (ice-9 match) #:use-module (ice-9 threads) #:use-module (srfi srfi-1) #:use-module (srfi srfi-11) #:use-module (srfi srfi-19) #:use-module (srfi srfi-26) #:use-module (email email) #:use-module (xapian wrap) #:use-module (xapian xapian) #:use-module (rnrs bytevectors) #:export (index! index-files tokenize search)) (define (parse-emails files) (n-par-map 4 (lambda (file) (list file (call-with-input-file file read-emails-from-bug-log))) files)) (define (author email) (or (and=> (assoc-ref (email-headers email) 'from) (match-lambda ((from . rest) (format #f "~a ~a" (assoc-ref from 'name) (assoc-ref from 'address))) (from (format #f "~a ~a" (assoc-ref from 'name) (assoc-ref from 'address))))) "")) (define* (index-files files #:key (dbpath (string-append (%config 'db-dir) "/mumi.xapian"))) "Parse all given Debbugs log files in FILES as a list of email messages and index their contents in the Xapian database at DBPATH." (define (index-chunk files+mails) (call-with-writable-database dbpath (lambda (db) (for-each (match-lambda ((file (? pair? emails)) (let* ((bugid (basename file ".log")) (report (first emails)) (submitter (author report)) (subjects (string-join (delete-duplicates (filter-map (lambda (email) (assoc-ref (email-headers email) 'subject)) emails)))) (authors (string-join (delete-duplicates (map author emails)) " ")) (date (match (assoc-ref (email-headers report) 'date) ((? string? s) #f) (date date))) (mdate (match (assoc-ref (email-headers (last emails)) 'date) ((? string? s) #f) (date date))) ;; Store the message ids as base64 strings of all ;; messages. (msgids (string-join (map msgid-hash (filter-map (lambda (headers) (assoc-ref headers 'message-id)) (map email-headers emails))) " ")) (text (string-join (map (match-lambda ((($ headers body) ...) (string-join (filter string? body) "\n")) ((? bytevector? bv) (utf8->string bv)) ((? string? s) s)) (map email-body emails)) "\n")) (bug (bug-status bugid)) (idterm (string-append "Q" bugid)) (doc (make-document #:data bugid #:terms `((,idterm . 0)) #:values `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101")) (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")) ;; This is used for collapsing search results (2 . ,(let ((merged (bug-mergedwith bug))) (if merged (string-join (sort (cons bugid (string-split merged #\space)) string<) " ") bugid)))))) (term-generator (make-term-generator #:stem (make-stem "en") #:document doc))) ;; Index fields with a suitable prefix. This allows for ;; searching separate fields as in subject:foo, from:bar, ;; etc. We do not keep track of the within document ;; frequencies of terms that will be used for boolean ;; filtering. We do not generate position information for ;; fields that will not need phrase searching or NEAR ;; searches. (index-text! term-generator bugid #:prefix "B" #:wdf-increment 0 #:positions? #f) (index-text! term-generator submitter #:prefix "A" #:wdf-increment 0) (index-text! term-generator authors #:prefix "XA" #:wdf-increment 0) (index-text! term-generator subjects #:prefix "S") (index-text! term-generator (or (bug-owner bug) "") #:prefix "XO" #:wdf-increment 0) (index-text! term-generator (or (bug-severity bug) "normal") #:prefix "XS" #:wdf-increment 0 #:positions? #f) (index-text! term-generator (or (bug-tags bug) "") #:prefix "XT" #:wdf-increment 0 #:positions? #f) (index-text! term-generator (cond ((bug-done bug) "done") (else "open")) #:prefix "XSTATUS" #:wdf-increment 0 #:positions? #f) (index-text! term-generator file #:prefix "F" #:wdf-increment 0 #:positions? #f) (index-text! term-generator msgids #:prefix "XU" #:wdf-increment 0 #:positions? #f) ;; Index subject and body without prefixes for general ;; search. (index-text! term-generator subjects) (increase-termpos! term-generator) (index-text! term-generator text) ;; Add the document to the database. The unique idterm ;; ensures each object ends up in the database only once ;; no matter how many times we run the indexer. (replace-document! db idterm doc))) (_ #f)) ; ignore files+mails)))) (let ((total (length files))) (let loop ((files files)) (let-values (((chunk rest) (if (>= (length files) 100) (split-at files 100) (values files '())))) (format (current-error-port) "indexing ~1,2f%~%" (exact->inexact (* 100 (/ (- total (length rest)) total)))) (index-chunk (parse-emails chunk)) (unless (null? rest) (loop rest)))))) (define (sanitize-date-range token) (define now (current-time)) (define (hours n) (make-time time-duration 0 (* 60 60 n))) (define (days n) (make-time time-duration 0 (* 60 60 24 n))) (define (range-boundary->date-string boundary*) (define boundary (string-delete #\- boundary*)) (cond ((and (string->number boundary) (<= (string-length boundary) 8)) (format #f "~8,,,'0a" boundary)) (else (let ((date (match boundary ((or "now" "today") now) ("yesterday" (subtract-duration now (days 1))) (_ (call-with-values (lambda () (span char-numeric? (string->list boundary))) (lambda (pre post) (let ((n (string->number (apply string pre))) (unit (apply string post))) (and n (subtract-duration now (match unit ((or "h" "hour" "hours") (hours n)) ((or "d" "day" "days") (days n)) ((or "w" "week" "weeks") (days (* n 7))) ((or "m" "month" "months") (days (* n 30))) ((or "y" "year" "years") (days (* n 365))))))))))))) (and date (date->string (time-utc->date date) "~Y~m~d")))))) (if (or (string-prefix? "date:" token) (string-prefix? "mdate:" token)) (match (string-split token (char-set #\: #\.)) ((prefix begin sep end) (let ((begin* (range-boundary->date-string begin)) (end* (if (or (not end) (string-null? end)) "" (or (range-boundary->date-string end) "")))) (if begin* (string-append prefix ":" begin* ".." end*) (string-append begin " " end)))) ;; Swallow invalid tokens silently to avoid crashing Xapian (invalid (pk invalid ""))) token)) (define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()) (boolean-prefixes '())) (let ((queryparser (new-QueryParser)) (date-range-processor (new-DateRangeProcessor 0 "date:" 0)) (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0))) (QueryParser-set-stemmer queryparser stemmer) (when stemming-strategy (QueryParser-set-stemming-strategy queryparser stemming-strategy)) (for-each (match-lambda ((field . prefix) (QueryParser-add-prefix queryparser field prefix))) prefixes) (for-each (match-lambda ((field . prefix) (QueryParser-add-boolean-prefix queryparser field prefix))) boolean-prefixes) (QueryParser-add-rangeprocessor queryparser date-range-processor) (QueryParser-add-rangeprocessor queryparser mdate-range-processor) (let ((query (QueryParser-parse-query queryparser querystring (logior (QueryParser-FLAG-BOOLEAN) (QueryParser-FLAG-PHRASE) (QueryParser-FLAG-LOVEHATE) (QueryParser-FLAG-BOOLEAN-ANY-CASE) (QueryParser-FLAG-WILDCARD))))) (delete-QueryParser queryparser) query))) (define (tokenize querystring) "Split QUERYSTRING at word boundaries, but keep quoted phrases intact." (let ((intermediate (string-fold (lambda (char result) (match result ;; Phrase! ((#t previous rest) (list (not (eq? char #\")) ; end of phrase? (cons char previous) rest)) ;; Everything else ((#f previous rest) (if (eq? char #\space) ;; end of word (list #f '() (cons (apply string (reverse previous)) rest)) ;; continue word (list (eq? char #\") (cons char previous) rest))))) '(#f () ()) querystring))) ;; The last word is still just a bunch of characters. (match intermediate ((_ last query) (reverse (cons (apply string (reverse last)) query)))))) (define* (search querystring #:key (pagesize 1000) (dbpath (string-append (%config 'db-dir) "/mumi.xapian"))) ;; Open database for reading. call-with-database automatically ;; closes the database once we're done. (call-with-database dbpath (lambda (db) (let* ((querystring* (string-join (map (lambda (token) (cond ((or (string-prefix? "date:" token) (string-prefix? "mdate:" token)) (sanitize-date-range token)) ((string-prefix? "msgid:" token) (let ((msgid (substring token (string-length "msgid:")))) (format #false "msgid:~a" (msgid-hash msgid)))) (else token))) (tokenize querystring)))) ;; Parse querystring passing a stemmer and suitable ;; prefixes for field search. (query (parse-query* querystring* #:stemmer (make-stem "en") #:prefixes '(("subject" . "S") ("submitter" . "A") ("author" . "XA") ("owner" . "XO")) #:boolean-prefixes '(("msgid" . "XU") ("severity" . "XS") ("status" . "XSTATUS") ("tag" . "XT")))) (enq (enquire db query))) ;; Collapse on mergedwith value (Enquire-set-collapse-key enq 2 1) ;; Fold over the results, return bug id. (reverse (mset-fold (lambda (item acc) (cons (document-data (mset-item-document item)) acc)) '() ;; Get an Enquire object from the database with the ;; search results. Then, extract the MSet from the ;; Enquire object. (enquire-mset enq #:maximum-items pagesize))))))) (define* (index! #:key full?) "Index all Debbugs log files corresponding to the selected packages. When FULL? is #T process also archived issues." (let* ((packages (%config 'packages)) (active-numbers (extract-bug-numbers packages)) (archived-numbers (extract-bug-numbers packages #:archived? #t)) (active-files (map bug-id->log-file active-numbers)) (archived-files (map (cut bug-id->log-file <> #:archived? #t) archived-numbers))) (index-files (if full? (append active-files archived-files) active-files))))