8546e7821e86365c5ee31a8834010c204c815edf
[software/mumi.git] / mumi / xapian.scm
1 ;;; mumi -- Mediocre, uh, mail interface
2 ;;; Copyright © 2020 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
4 ;;;
5 ;;; This program is free software: you can redistribute it and/or
6 ;;; modify it under the terms of the GNU Affero General Public License
7 ;;; as published by the Free Software Foundation, either version 3 of
8 ;;; the License, or (at your option) any later version.
9 ;;;
10 ;;; This program is distributed in the hope that it will be useful,
11 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;; Affero General Public License for more details.
14 ;;;
15 ;;; You should have received a copy of the GNU Affero General Public
16 ;;; License along with this program. If not, see
17 ;;; <http://www.gnu.org/licenses/>.
18
19 (define-module (mumi xapian)
20 #:use-module (mumi config)
21 #:use-module (mumi debbugs)
22 #:use-module (ice-9 format)
23 #:use-module (ice-9 match)
24 #:use-module (ice-9 threads)
25 #:use-module (srfi srfi-1)
26 #:use-module (srfi srfi-11)
27 #:use-module (srfi srfi-19)
28 #:use-module (srfi srfi-26)
29 #:use-module (email email)
30 #:use-module (xapian wrap)
31 #:use-module (xapian xapian)
32 #:use-module (rnrs bytevectors)
33 #:export (index! index-files search))
34
35 (define (parse-emails files)
36 (n-par-map 4
37 (lambda (file)
38 (list file
39 (call-with-input-file file
40 read-emails-from-bug-log)))
41 files))
42
43 (define (author email)
44 (or (and=> (assoc-ref (email-headers email) 'from)
45 (match-lambda
46 ((from . rest)
47 (format #f "~a ~a"
48 (assoc-ref from 'name)
49 (assoc-ref from 'address)))
50 (from
51 (format #f "~a ~a"
52 (assoc-ref from 'name)
53 (assoc-ref from 'address)))))
54 "<unknown>"))
55
56 (define* (index-files files #:key
57 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
58 "Parse all given Debbugs log files in FILES as a list of email
59 messages and index their contents in the Xapian database at DBPATH."
60 (define (index-chunk files+mails)
61 (call-with-writable-database
62 dbpath
63 (lambda (db)
64 (for-each
65 (match-lambda
66 ((file (? pair? emails))
67 (let* ((bugid (basename file ".log"))
68 (report (first emails))
69 (submitter (author report))
70 (subjects
71 (string-join
72 (delete-duplicates
73 (filter-map (lambda (email)
74 (assoc-ref (email-headers email) 'subject))
75 emails))))
76 (authors
77 (string-join (delete-duplicates (map author emails)) " "))
78 (date
79 (match (assoc-ref (email-headers report) 'date)
80 ((? string? s) #f)
81 (date date)))
82 (mdate
83 (match (assoc-ref (email-headers (last emails)) 'date)
84 ((? string? s) #f)
85 (date date)))
86 (text
87 (string-join (map (match-lambda
88 ((($ <mime-entity> headers body) ...)
89 (string-join (filter string? body) "\n"))
90 ((? bytevector? bv) (utf8->string bv))
91 ((? string? s) s))
92 (map email-body emails))
93 "\n"))
94
95 (idterm (string-append "Q" bugid))
96 (doc (make-document #:data bugid
97 #:terms `((,idterm . 0))
98 #:values
99 `((0 . ,(or (and date (date->string date "~Y~m~d")) "19700101"))
100 (1 . ,(or (and mdate (date->string mdate "~Y~m~d")) "19700101")))))
101 (term-generator (make-term-generator #:stem (make-stem "en")
102 #:document doc)))
103 ;; Index fields with a suitable prefix. This allows for
104 ;; searching separate fields as in subject:foo,
105 ;; from:bar, etc.
106 (index-text! term-generator bugid #:prefix "B")
107 (index-text! term-generator submitter #:prefix "A")
108 (index-text! term-generator authors #:prefix "XA")
109 (index-text! term-generator subjects #:prefix "S")
110 (index-text! term-generator file #:prefix "F")
111
112 ;; Index subject and body without prefixes for general
113 ;; search.
114 (index-text! term-generator subjects)
115 (increase-termpos! term-generator)
116 (index-text! term-generator text)
117
118 ;; Add the document to the database. The unique idterm
119 ;; ensures each object ends up in the database only once
120 ;; no matter how many times we run the indexer.
121 (replace-document! db idterm doc)))
122 (_ #f)) ; ignore
123 files+mails))))
124
125 (let ((total (length files)))
126 (let loop ((files files))
127 (let-values (((chunk rest)
128 (if (>= (length files) 100)
129 (split-at files 100)
130 (values files '()))))
131 (format (current-error-port)
132 "indexing ~1,2f%~%"
133 (exact->inexact (* 100 (/ (- total (length rest)) total))))
134 (index-chunk (parse-emails chunk))
135 (unless (null? rest)
136 (loop rest))))))
137
138 (define (sanitize-date-range token)
139 (define now (current-time))
140 (define (hours n)
141 (make-time time-duration 0 (* 60 60 n)))
142 (define (days n)
143 (make-time time-duration 0 (* 60 60 24 n)))
144 (define (range-boundary->date-string boundary*)
145 (define boundary (string-delete #\- boundary*))
146 (cond
147 ((and (string->number boundary)
148 (<= (string-length boundary) 8))
149 (format #f "~8,,,'0a" boundary))
150 (else
151 (let ((date (match boundary
152 ((or "now" "today") now)
153 ("yesterday" (subtract-duration now (days 1)))
154 (_
155 (call-with-values
156 (lambda () (span char-numeric? (string->list boundary)))
157 (lambda (pre post)
158 (let ((n (string->number (apply string pre)))
159 (unit (apply string post)))
160 (and n
161 (subtract-duration
162 now
163 (match unit
164 ((or "h" "hour" "hours")
165 (hours n))
166 ((or "d" "day" "days")
167 (days n))
168 ((or "w" "week" "weeks")
169 (days (* n 7)))
170 ((or "m" "month" "months")
171 (days (* n 30)))
172 ((or "y" "year" "years")
173 (days (* n 365)))))))))))))
174 (and date
175 (date->string (time-utc->date date) "~Y~m~d"))))))
176 (if (or (string-prefix? "date:" token)
177 (string-prefix? "mdate:" token))
178 (match (string-split token (char-set #\: #\.))
179 ((prefix begin sep end)
180 (let ((begin* (range-boundary->date-string begin))
181 (end* (if (or (not end) (string-null? end)) ""
182 (or (range-boundary->date-string end) ""))))
183 (if begin*
184 (string-append prefix ":" begin* ".." end*)
185 (string-append begin " " end))))
186 ;; Swallow invalid tokens silently to avoid crashing Xapian
187 (invalid (pk invalid "")))
188 token))
189
190 (define* (parse-query* querystring #:key stemmer stemming-strategy (prefixes '()))
191 (let ((queryparser (new-QueryParser))
192 (date-range-processor (new-DateRangeProcessor 0 "date:" 0))
193 (mdate-range-processor (new-DateRangeProcessor 1 "mdate:" 0)))
194 (QueryParser-set-stemmer queryparser stemmer)
195 (when stemming-strategy
196 (QueryParser-set-stemming-strategy queryparser stemming-strategy))
197 (for-each (match-lambda
198 ((field . prefix)
199 (QueryParser-add-prefix queryparser field prefix)))
200 prefixes)
201 (QueryParser-add-rangeprocessor queryparser date-range-processor)
202 (QueryParser-add-rangeprocessor queryparser mdate-range-processor)
203 (let ((query (QueryParser-parse-query queryparser querystring)))
204 (delete-QueryParser queryparser)
205 query)))
206
207 (define* (search querystring #:key
208 (pagesize 1000)
209 (dbpath (string-append (%config 'db-dir) "/mumi.xapian")))
210 ;; Open database for reading. call-with-database automatically
211 ;; closes the database once we're done.
212 (call-with-database dbpath
213 (lambda (db)
214 (let* ((querystring*
215 (string-join (map (lambda (token)
216 (if (or (string-prefix? "date:" token)
217 (string-prefix? "mdate:" token))
218 (sanitize-date-range token)
219 token))
220 (string-tokenize querystring))))
221 ;; Parse querystring passing a stemmer and suitable
222 ;; prefixes for field search.
223 (query (parse-query* querystring*
224 #:stemmer (make-stem "en")
225 #:prefixes '(("submitter" . "A")
226 ("author" . "XA")
227 ("subject" . "S")))))
228 ;; Fold over the results, return bug id.
229 (mset-fold (lambda (item acc)
230 (cons
231 (document-data (mset-item-document item))
232 acc))
233 '()
234 ;; Get an Enquire object from the database with the
235 ;; search results. Then, extract the MSet from the
236 ;; Enquire object.
237 (enquire-mset (enquire db query)
238 #:maximum-items pagesize))))))
239
240 (define* (index! #:key full?)
241 "Index all Debbugs log files corresponding to the selected
242 packages. When FULL? is #T process also archived issues."
243 (let* ((packages (%config 'packages))
244 (active-numbers (extract-bug-numbers packages))
245 (archived-numbers (extract-bug-numbers packages #:archived? #t))
246 (active-files
247 (map bug-id->log-file active-numbers))
248 (archived-files
249 (map (cut bug-id->log-file <> #:archived? #t)
250 archived-numbers)))
251 (index-files (if full?
252 (append active-files archived-files)
253 active-files))))