xapian: Collapse results for merged issues.
[software/mumi.git] / mumi / xapian.scm
1 ;;; mumi -- Mediocre, uh, mail interface
2 ;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
4 ;;;
5 ;;; This program is free software: you can redistribute it and/or
6 ;;; modify it under the terms of the GNU Affero General Public License
7 ;;; as published by the Free Software Foundation, either version 3 of
8 ;;; the License, or (at your option) any later version.
9 ;;;
10 ;;; This program is distributed in the hope that it will be useful,
11 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;; Affero General Public License for more details.
14 ;;;
15 ;;; You should have received a copy of the GNU Affero General Public
16 ;;; License along with this program. If not, see
17 ;;; <http://www.gnu.org/licenses/>.
18
19 (define-module (mumi xapian)
20 #:use-module (mumi config)
21 #:use-module (mumi debbugs)
22 #:use-module (ice-9 format)
23 #:use-module (ice-9 match)
24 #:use-module (ice-9 threads)
25 #:use-module (srfi srfi-1)
26 #:use-module (srfi srfi-11)
27 #:use-module (srfi srfi-19)
28 #:use-module (srfi srfi-26)
29 #:use-module (email email)
30 #:use-module (xapian wrap)
31 #:use-module (xapian xapian)
32 #:use-module (rnrs bytevectors)
33 #:export (index! index-files tokenize search))
34
35 (define (parse-emails files)
36 (n-par-map 4
37 (lambda (file)
38 (list file
39 (call-with-input-file file
40 read-emails-from-bug-log)))
41 files))
42
43 (define (author email)
44 (or (and=> (assoc-ref (email-headers email) 'from)
45 (match-lambda
46 ((from . rest)
47 (format #f "~a ~a"
48 (assoc-ref from 'name)
49 (assoc-ref from 'address)))
50 (from
51 (format #f "~a ~a"
52 (assoc-ref from 'name)
53 (assoc-ref from 'address)))))
54 "<unknown>"))
55
56 (define* (index-files files #:key
57 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
58 "Parse all given Debbugs log files in FILES as a list of email
59 messages and index their contents in the Xapian database at DBPATH."
60 (define (index-chunk files+mails)
61 (call-with-writable-database
62 dbpath
63 (lambda (db)
64 (for-each
65 (match-lambda
66 ((file (? pair? emails))
67 (let* ((bugid (basename file ".log"))
68 (report (first emails))
69 (submitter (author report))
70 (subjects
71 (string-join
72 (delete-duplicates
73 (filter-map (lambda (email)
74 (assoc-ref (email-headers email) 'subject))
75 emails))))
76 (authors
77 (string-join (delete-duplicates (map author emails)) " "))
78 (date
79 (match (assoc-ref (email-headers report) 'date)
80 ((? string? s) #f)
81 (date date)))
82 (mdate
83 (match (assoc-ref (email-headers (last emails)) 'date)
84 ((? string? s) #f)
85 (date date)))
86 (text
87 (string-join (map (match-lambda
88 ((($ <mime-entity> headers body) ...)
89 (string-join (filter string? body) "\n"))
90 ((? bytevector? bv) (utf8->string bv))
91 ((? string? s) s))
92 (map email-body emails))
93 "\n"))
94
95 (bug (bug-status bugid))
96 (idterm (string-append "Q" bugid))
97 (doc (make-document #:data bugid
98 #:terms `((,idterm . 0))
99 #:values
100 `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
101 (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101"))
102 ;; This is used for collapsing search results
103 (2 . ,(let ((merged (bug-mergedwith bug)))
104 (if merged
105 (string-join
106 (sort (cons bugid (string-split merged #\space))
107 string<)
108 " ")
109 bugid))))))
110 (term-generator (make-term-generator #:stem (make-stem "en")
111 #:document doc)))
112 ;; Index fields with a suitable prefix. This allows for
113 ;; searching separate fields as in subject:foo,
114 ;; from:bar, etc.
115 (index-text! term-generator bugid #:prefix "B")
116 (index-text! term-generator submitter #:prefix "A")
117 (index-text! term-generator authors #:prefix "XA")
118 (index-text! term-generator subjects #:prefix "S")
119 (index-text! term-generator (or (bug-owner bug) "") #:prefix "XO")
120 (index-text! term-generator (or (bug-severity bug) "normal") #:prefix "XS")
121 (index-text! term-generator (or (bug-tags bug) "") #:prefix "XT")
122 (index-text! term-generator (cond
123 ((bug-done bug) "done")
124 (else "open")) #:prefix "XSTATUS")
125 (index-text! term-generator file #:prefix "F")
126
127 ;; Index subject and body without prefixes for general
128 ;; search.
129 (index-text! term-generator subjects)
130 (increase-termpos! term-generator)
131 (index-text! term-generator text)
132
133 ;; Add the document to the database. The unique idterm
134 ;; ensures each object ends up in the database only once
135 ;; no matter how many times we run the indexer.
136 (replace-document! db idterm doc)))
137 (_ #f)) ; ignore
138 files+mails))))
139
140 (let ((total (length files)))
141 (let loop ((files files))
142 (let-values (((chunk rest)
143 (if (>= (length files) 100)
144 (split-at files 100)
145 (values files '()))))
146 (format (current-error-port)
147 "indexing ~1,2f%~%"
148 (exact->inexact (* 100 (/ (- total (length rest)) total))))
149 (index-chunk (parse-emails chunk))
150 (unless (null? rest)
151 (loop rest))))))
152
153 (define (sanitize-date-range token)
154 (define now (current-time))
155 (define (hours n)
156 (make-time time-duration 0 (* 60 60 n)))
157 (define (days n)
158 (make-time time-duration 0 (* 60 60 24 n)))
159 (define (range-boundary->date-string boundary*)
160 (define boundary (string-delete #\- boundary*))
161 (cond
162 ((and (string->number boundary)
163 (<= (string-length boundary) 8))
164 (format #f "~8,,,'0a" boundary))
165 (else
166 (let ((date (match boundary
167 ((or "now" "today") now)
168 ("yesterday" (subtract-duration now (days 1)))
169 (_
170 (call-with-values
171 (lambda () (span char-numeric? (string->list boundary)))
172 (lambda (pre post)
173 (let ((n (string->number (apply string pre)))
174 (unit (apply string post)))
175 (and n
176 (subtract-duration
177 now
178 (match unit
179 ((or "h" "hour" "hours")
180 (hours n))
181 ((or "d" "day" "days")
182 (days n))
183 ((or "w" "week" "weeks")
184 (days (* n 7)))
185 ((or "m" "month" "months")
186 (days (* n 30)))
187 ((or "y" "year" "years")
188 (days (* n 365)))))))))))))
189 (and date
190 (date->string (time-utc->date date) "~Y~m~d"))))))
191 (if (or (string-prefix? "date:" token)
192 (string-prefix? "mdate:" token))
193 (match (string-split token (char-set #\: #\.))
194 ((prefix begin sep end)
195 (let ((begin* (range-boundary->date-string begin))
196 (end* (if (or (not end) (string-null? end)) ""
197 (or (range-boundary->date-string end) ""))))
198 (if begin*
199 (string-append prefix ":" begin* ".." end*)
200 (string-append begin " " end))))
201 ;; Swallow invalid tokens silently to avoid crashing Xapian
202 (invalid (pk invalid "")))
203 token))
204
205 (define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
206 (let ((queryparser (new-QueryParser))
207 (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
208 (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
209 (QueryParser-set-default-op queryparser (Query-OP-AND))
210 (QueryParser-set-stemmer queryparser stemmer)
211 (when stemming-strategy
212 (QueryParser-set-stemming-strategy queryparser stemming-strategy))
213 (for-each (match-lambda
214 ((field . prefix)
215 (QueryParser-add-prefix queryparser field prefix)))
216 prefixes)
217 (QueryParser-add-rangeprocessor queryparser date-range-processor)
218 (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
219 (let ((query (QueryParser-parse-query queryparser querystring
220 (logior (QueryParser-FLAG-BOOLEAN)
221 (QueryParser-FLAG-PHRASE)
222 (QueryParser-FLAG-LOVEHATE)
223 (QueryParser-FLAG-BOOLEAN-ANY-CASE)
224 (QueryParser-FLAG-WILDCARD)))))
225 (delete-QueryParser queryparser)
226 query)))
227
228 (define (tokenize querystring)
229 "Split QUERYSTRING at word boundaries, but keep quoted phrases
230 intact."
231 (let ((intermediate
232 (string-fold (lambda (char result)
233 (match result
234 ;; Phrase!
235 ((#t previous rest)
236 (list (not (eq? char #\")) ; end of phrase?
237 (cons char previous)
238 rest))
239 ;; Everything else
240 ((#f previous rest)
241 (if (eq? char #\space)
242 ;; end of word
243 (list #f '()
244 (cons (apply string (reverse previous))
245 rest))
246 ;; continue word
247 (list (eq? char #\")
248 (cons char previous) rest)))))
249 '(#f () ())
250 querystring)))
251 ;; The last word is still just a bunch of characters.
252 (match intermediate
253 ((_ last query)
254 (reverse (cons (apply string (reverse last))
255 query))))))
256
257 (define* (search querystring #:key
258 (pagesize 1000)
259 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
260 ;; Open database for reading. call-with-database automatically
261 ;; closes the database once we're done.
262 (call-with-database dbpath
263 (lambda (db)
264 (let* ((querystring*
265 (string-join (map (lambda (token)
266 (if (or (string-prefix? "date:" token)
267 (string-prefix? "mdate:" token))
268 (sanitize-date-range token)
269 token))
270 (tokenize querystring))))
271 ;; Parse querystring passing a stemmer and suitable
272 ;; prefixes for field search.
273 (query (parse-query* querystring*
274 #:stemmer (make-stem "en")
275 #:prefixes '(("submitter" . "A")
276 ("author" . "XA")
277 ("subject" . "S")
278 ("owner" . "XO")
279 ("severity" . "XS")
280 ("tag" . "XT")
281 ("status" . "XSTATUS"))))
282 (enq (enquire db query)))
283 ;; Collapse on mergedwith value
284 (Enquire-set-collapse-key enq 2 1)
285 ;; Fold over the results, return bug id.
286 (mset-fold (lambda (item acc)
287 (cons
288 (document-data (mset-item-document item))
289 acc))
290 '()
291 ;; Get an Enquire object from the database with the
292 ;; search results. Then, extract the MSet from the
293 ;; Enquire object.
294 (enquire-mset enq
295 #:maximum-items pagesize))))))
296
297 (define* (index! #:key full?)
298 "Index all Debbugs log files corresponding to the selected
299 packages. When FULL? is #T process also archived issues."
300 (let* ((packages (%config 'packages))
301 (active-numbers (extract-bug-numbers packages))
302 (archived-numbers (extract-bug-numbers packages #:archived? #t))
303 (active-files
304 (map bug-id->log-file active-numbers))
305 (archived-files
306 (map (cut bug-id->log-file <> #:archived? #t)
307 archived-numbers)))
308 (index-files (if full?
309 (append active-files archived-files)
310 active-files))))