f7573c7002e7e31247570d7e54edeae7710e6181
[software/mumi.git] / mumi / xapian.scm
1 ;;; mumi -- Mediocre, uh, mail interface
2 ;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
4 ;;;
5 ;;; This program is free software: you can redistribute it and/or
6 ;;; modify it under the terms of the GNU Affero General Public License
7 ;;; as published by the Free Software Foundation, either version 3 of
8 ;;; the License, or (at your option) any later version.
9 ;;;
10 ;;; This program is distributed in the hope that it will be useful,
11 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;; Affero General Public License for more details.
14 ;;;
15 ;;; You should have received a copy of the GNU Affero General Public
16 ;;; License along with this program. If not, see
17 ;;; <http://www.gnu.org/licenses/>.
18
19 (define-module (mumi xapian)
20 #:use-module (mumi config)
21 #:use-module (mumi debbugs)
22 #:use-module (ice-9 format)
23 #:use-module (ice-9 match)
24 #:use-module (ice-9 threads)
25 #:use-module (srfi srfi-1)
26 #:use-module (srfi srfi-11)
27 #:use-module (srfi srfi-19)
28 #:use-module (srfi srfi-26)
29 #:use-module (email email)
30 #:use-module (xapian wrap)
31 #:use-module (xapian xapian)
32 #:use-module (rnrs bytevectors)
33 #:export (index search))
34
35 (define (parse-emails files)
36 (n-par-map 4
37 (lambda (file)
38 (list file
39 (call-with-input-file file
40 read-emails-from-bug-log)))
41 files))
42
43 (define (author email)
44 (or (and=> (assoc-ref (email-headers email) 'from)
45 (match-lambda
46 ((from . rest)
47 (format #f "~a ~a"
48 (assoc-ref from 'name)
49 (assoc-ref from 'address)))
50 (from
51 (format #f "~a ~a"
52 (assoc-ref from 'name)
53 (assoc-ref from 'address)))))
54 "<unknown>"))
55
56 (define* (index files #:key
57 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
58 (define (index-chunk files+mails)
59 (call-with-writable-database
60 dbpath
61 (lambda (db)
62 (for-each
63 (match-lambda
64 ((file (? pair? emails))
65 (let* ((bugid (basename file ".log"))
66 (report (first emails))
67 (submitter (author report))
68 (subjects
69 (string-join
70 (delete-duplicates
71 (filter-map (lambda (email)
72 (assoc-ref (email-headers email) 'subject))
73 emails))))
74 (authors
75 (string-join (delete-duplicates (map author emails)) " "))
76 (date
77 (match (assoc-ref (email-headers report) 'date)
78 ((? string? s) #f)
79 (date date)))
80 (mdate
81 (match (assoc-ref (email-headers (last emails)) 'date)
82 ((? string? s) #f)
83 (date date)))
84 (text
85 (string-join (map (match-lambda
86 ((($ <mime-entity> headers body) ...)
87 (string-join (filter string? body) "\n"))
88 ((? bytevector? bv) (utf8->string bv))
89 ((? string? s) s))
90 (map email-body emails))
91 "\n"))
92
93 (idterm (string-append "Q" bugid))
94 (doc (make-document #:data bugid
95 #:terms `((,idterm . 0))
96 #:values
97 `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
98 (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")))))
99 (term-generator (make-term-generator #:stem (make-stem "en")
100 #:document doc)))
101 ;; Index fields with a suitable prefix. This allows for
102 ;; searching separate fields as in subject:foo,
103 ;; from:bar, etc.
104 (index-text! term-generator bugid #:prefix "B")
105 (index-text! term-generator submitter #:prefix "A")
106 (index-text! term-generator authors #:prefix "XA")
107 (index-text! term-generator subjects #:prefix "S")
108 (index-text! term-generator file #:prefix "F")
109
110 ;; Index subject and body without prefixes for general
111 ;; search.
112 (index-text! term-generator subjects)
113 (increase-termpos! term-generator)
114 (index-text! term-generator text)
115
116 ;; Add the document to the database. The unique idterm
117 ;; ensures each object ends up in the database only once
118 ;; no matter how many times we run the indexer.
119 (replace-document! db idterm doc)))
120 (_ #f)) ; ignore
121 files+mails))))
122
123 (let ((total (length files)))
124 (let loop ((files files))
125 (let-values (((chunk rest)
126 (if (>= (length files) 100)
127 (split-at files 100)
128 (values files '()))))
129 (format (current-error-port)
130 "indexing ~1,2f%~%"
131 (exact->inexact (* 100 (/ (- total (length rest)) total))))
132 (index-chunk (parse-emails chunk))
133 (unless (null? rest)
134 (loop rest))))))
135
136 (define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
137 (let ((queryparser (new-QueryParser))
138 (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
139 (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
140 (QueryParser-set-stemmer queryparser stemmer)
141 (when stemming-strategy
142 (QueryParser-set-stemming-strategy queryparser stemming-strategy))
143 (for-each (match-lambda
144 ((field . prefix)
145 (QueryParser-add-prefix queryparser field prefix)))
146 prefixes)
147 (QueryParser-add-rangeprocessor queryparser date-range-processor)
148 (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
149 (let ((query (QueryParser-parse-query queryparser querystring)))
150 (delete-QueryParser queryparser)
151 query)))
152
153 (define* (search querystring #:key
154 (pagesize 10)
155 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
156 ;; Open database for reading. call-with-database automatically
157 ;; closes the database once we're done.
158 (call-with-database dbpath
159 (lambda (db)
160 (let (;; Parse querystring passing a stemmer and suitable
161 ;; prefixes for field search.
162 (query (parse-query* querystring
163 #:stemmer (make-stem "en")
164 #:prefixes '(("submitter" . "A")
165 ("author" . "XA")
166 ("subject" . "S")))))
167 ;; Fold over the results, return bug id.
168 (mset-fold (lambda (item acc)
169 (cons
170 (document-data (mset-item-document item))
171 acc))
172 '()
173 ;; Get an Enquire object from the database with the
174 ;; search results. Then, extract the MSet from the
175 ;; Enquire object.
176 (enquire-mset (enquire db query)
177 #:maximum-items pagesize))))))
178