xapian: Allow boolean search without capitalization, allow wildcards.
[software/mumi.git] / mumi / xapian.scm
1 ;;; mumi -- Mediocre, uh, mail interface
2 ;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
4 ;;;
5 ;;; This program is free software: you can redistribute it and/or
6 ;;; modify it under the terms of the GNU Affero General Public License
7 ;;; as published by the Free Software Foundation, either version 3 of
8 ;;; the License, or (at your option) any later version.
9 ;;;
10 ;;; This program is distributed in the hope that it will be useful,
11 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;; Affero General Public License for more details.
14 ;;;
15 ;;; You should have received a copy of the GNU Affero General Public
16 ;;; License along with this program. If not, see
17 ;;; <http://www.gnu.org/licenses/>.
18
19 (define-module (mumi xapian)
20 #:use-module (mumi config)
21 #:use-module (mumi debbugs)
22 #:use-module (ice-9 format)
23 #:use-module (ice-9 match)
24 #:use-module (ice-9 threads)
25 #:use-module (srfi srfi-1)
26 #:use-module (srfi srfi-11)
27 #:use-module (srfi srfi-19)
28 #:use-module (srfi srfi-26)
29 #:use-module (email email)
30 #:use-module (xapian wrap)
31 #:use-module (xapian xapian)
32 #:use-module (rnrs bytevectors)
33 #:export (index! index-files search))
34
35 (define (parse-emails files)
36 (n-par-map 4
37 (lambda (file)
38 (list file
39 (call-with-input-file file
40 read-emails-from-bug-log)))
41 files))
42
43 (define (author email)
44 (or (and=> (assoc-ref (email-headers email) 'from)
45 (match-lambda
46 ((from . rest)
47 (format #f "~a ~a"
48 (assoc-ref from 'name)
49 (assoc-ref from 'address)))
50 (from
51 (format #f "~a ~a"
52 (assoc-ref from 'name)
53 (assoc-ref from 'address)))))
54 "<unknown>"))
55
56 (define* (index-files files #:key
57 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
58 "Parse all given Debbugs log files in FILES as a list of email
59 messages and index their contents in the Xapian database at DBPATH."
60 (define (index-chunk files+mails)
61 (call-with-writable-database
62 dbpath
63 (lambda (db)
64 (for-each
65 (match-lambda
66 ((file (? pair? emails))
67 (let* ((bugid (basename file ".log"))
68 (report (first emails))
69 (submitter (author report))
70 (subjects
71 (string-join
72 (delete-duplicates
73 (filter-map (lambda (email)
74 (assoc-ref (email-headers email) 'subject))
75 emails))))
76 (authors
77 (string-join (delete-duplicates (map author emails)) " "))
78 (date
79 (match (assoc-ref (email-headers report) 'date)
80 ((? string? s) #f)
81 (date date)))
82 (mdate
83 (match (assoc-ref (email-headers (last emails)) 'date)
84 ((? string? s) #f)
85 (date date)))
86 (text
87 (string-join (map (match-lambda
88 ((($ <mime-entity> headers body) ...)
89 (string-join (filter string? body) "\n"))
90 ((? bytevector? bv) (utf8->string bv))
91 ((? string? s) s))
92 (map email-body emails))
93 "\n"))
94
95 (idterm (string-append "Q" bugid))
96 (doc (make-document #:data bugid
97 #:terms `((,idterm . 0))
98 #:values
99 `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
100 (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")))))
101 (term-generator (make-term-generator #:stem (make-stem "en")
102 #:document doc)))
103 ;; Index fields with a suitable prefix. This allows for
104 ;; searching separate fields as in subject:foo,
105 ;; from:bar, etc.
106 (index-text! term-generator bugid #:prefix "B")
107 (index-text! term-generator submitter #:prefix "A")
108 (index-text! term-generator authors #:prefix "XA")
109 (index-text! term-generator subjects #:prefix "S")
110 (index-text! term-generator file #:prefix "F")
111
112 ;; Index subject and body without prefixes for general
113 ;; search.
114 (index-text! term-generator subjects)
115 (increase-termpos! term-generator)
116 (index-text! term-generator text)
117
118 ;; Add the document to the database. The unique idterm
119 ;; ensures each object ends up in the database only once
120 ;; no matter how many times we run the indexer.
121 (replace-document! db idterm doc)))
122 (_ #f)) ; ignore
123 files+mails))))
124
125 (let ((total (length files)))
126 (let loop ((files files))
127 (let-values (((chunk rest)
128 (if (>= (length files) 100)
129 (split-at files 100)
130 (values files '()))))
131 (format (current-error-port)
132 "indexing ~1,2f%~%"
133 (exact->inexact (* 100 (/ (- total (length rest)) total))))
134 (index-chunk (parse-emails chunk))
135 (unless (null? rest)
136 (loop rest))))))
137
138 (define (sanitize-date-range token)
139 (define now (current-time))
140 (define (hours n)
141 (make-time time-duration 0 (* 60 60 n)))
142 (define (days n)
143 (make-time time-duration 0 (* 60 60 24 n)))
144 (define (range-boundary->date-string boundary*)
145 (define boundary (string-delete #\- boundary*))
146 (cond
147 ((and (string->number boundary)
148 (<= (string-length boundary) 8))
149 (format #f "~8,,,'0a" boundary))
150 (else
151 (let ((date (match boundary
152 ((or "now" "today") now)
153 ("yesterday" (subtract-duration now (days 1)))
154 (_
155 (call-with-values
156 (lambda () (span char-numeric? (string->list boundary)))
157 (lambda (pre post)
158 (let ((n (string->number (apply string pre)))
159 (unit (apply string post)))
160 (and n
161 (subtract-duration
162 now
163 (match unit
164 ((or "h" "hour" "hours")
165 (hours n))
166 ((or "d" "day" "days")
167 (days n))
168 ((or "w" "week" "weeks")
169 (days (* n 7)))
170 ((or "m" "month" "months")
171 (days (* n 30)))
172 ((or "y" "year" "years")
173 (days (* n 365)))))))))))))
174 (and date
175 (date->string (time-utc->date date) "~Y~m~d"))))))
176 (if (or (string-prefix? "date:" token)
177 (string-prefix? "mdate:" token))
178 (match (string-split token (char-set #\: #\.))
179 ((prefix begin sep end)
180 (let ((begin* (range-boundary->date-string begin))
181 (end* (if (or (not end) (string-null? end)) ""
182 (or (range-boundary->date-string end) ""))))
183 (if begin*
184 (string-append prefix ":" begin* ".." end*)
185 (string-append begin " " end))))
186 ;; Swallow invalid tokens silently to avoid crashing Xapian
187 (invalid (pk invalid "")))
188 token))
189
190 (define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
191 (let ((queryparser (new-QueryParser))
192 (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
193 (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
194 (QueryParser-set-default-op queryparser (Query-OP-AND))
195 (QueryParser-set-stemmer queryparser stemmer)
196 (when stemming-strategy
197 (QueryParser-set-stemming-strategy queryparser stemming-strategy))
198 (for-each (match-lambda
199 ((field . prefix)
200 (QueryParser-add-prefix queryparser field prefix)))
201 prefixes)
202 (QueryParser-add-rangeprocessor queryparser date-range-processor)
203 (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
204 (let ((query (QueryParser-parse-query queryparser querystring
205 (logior (QueryParser-FLAG-BOOLEAN)
206 (QueryParser-FLAG-PHRASE)
207 (QueryParser-FLAG-LOVEHATE)
208 (QueryParser-FLAG-BOOLEAN-ANY-CASE)
209 (QueryParser-FLAG-WILDCARD)))))
210 (delete-QueryParser queryparser)
211 query)))
212
213 (define (tokenize querystring)
214 "Split QUERYSTRING at word boundaries, but keep quoted phrases
215 intact."
216 (let ((intermediate
217 (string-fold (lambda (char result)
218 (match result
219 ;; Phrase!
220 ((#t previous rest)
221 (list (not (eq? char #\")) ; end of phrase?
222 (cons char previous)
223 rest))
224 ;; Everything else
225 ((#f previous rest)
226 (if (eq? char #\space)
227 ;; end of word
228 (list #f '()
229 (cons (apply string (reverse previous))
230 rest))
231 ;; continue word
232 (list (eq? char #\")
233 (cons char previous) rest)))))
234 '(#f () ())
235 querystring)))
236 ;; The last word is still just a bunch of characters.
237 (match intermediate
238 ((_ last query)
239 (reverse (cons (apply string (reverse last))
240 query))))))
241
242 (define* (search querystring #:key
243 (pagesize 1000)
244 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
245 ;; Open database for reading. call-with-database automatically
246 ;; closes the database once we're done.
247 (call-with-database dbpath
248 (lambda (db)
249 (let* ((querystring*
250 (string-join (map (lambda (token)
251 (if (or (string-prefix? "date:" token)
252 (string-prefix? "mdate:" token))
253 (sanitize-date-range token)
254 token))
255 (tokenize querystring))))
256 ;; Parse querystring passing a stemmer and suitable
257 ;; prefixes for field search.
258 (query (parse-query* querystring*
259 #:stemmer (make-stem "en")
260 #:prefixes '(("submitter" . "A")
261 ("author" . "XA")
262 ("subject" . "S")))))
263 ;; Fold over the results, return bug id.
264 (mset-fold (lambda (item acc)
265 (cons
266 (document-data (mset-item-document item))
267 acc))
268 '()
269 ;; Get an Enquire object from the database with the
270 ;; search results. Then, extract the MSet from the
271 ;; Enquire object.
272 (enquire-mset (enquire db query)
273 #:maximum-items pagesize))))))
274
275 (define* (index! #:key full?)
276 "Index all Debbugs log files corresponding to the selected
277 packages. When FULL? is #T process also archived issues."
278 (let* ((packages (%config 'packages))
279 (active-numbers (extract-bug-numbers packages))
280 (archived-numbers (extract-bug-numbers packages #:archived? #t))
281 (active-files
282 (map bug-id->log-file active-numbers))
283 (archived-files
284 (map (cut bug-id->log-file <> #:archived? #t)
285 archived-numbers)))
286 (index-files (if full?
287 (append active-files archived-files)
288 active-files))))