summaryrefslogtreecommitdiff
path: root/lisp/thingatpt.el
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/thingatpt.el')
-rw-r--r--lisp/thingatpt.el295
1 files changed, 206 insertions, 89 deletions
diff --git a/lisp/thingatpt.el b/lisp/thingatpt.el
index e1e3e8e1e4..9526cb76e7 100644
--- a/lisp/thingatpt.el
+++ b/lisp/thingatpt.el
@@ -232,7 +232,7 @@ The bounds of THING are determined by `bounds-of-thing-at-point'."
(put 'defun 'end-op 'end-of-defun)
(put 'defun 'forward-op 'end-of-defun)
-;; Filenames and URLs www.com/foo%32bar
+;; Filenames
(defvar thing-at-point-file-name-chars "-~/[:alnum:]_.${}#%,:"
"Characters allowable in filenames.")
@@ -248,94 +248,224 @@ The bounds of THING are determined by `bounds-of-thing-at-point'."
(forward-char)
(goto-char (point-min)))))
+;; URIs
+
+;; Consulted by `thing-at-point--bounds-of-well-formed-url', which
+;; wraps the value in a group and requires it to match either at the
+;; start of the candidate text or right after a character that is not
+;; an ASCII letter or digit.
+(defvar thing-at-point-beginning-of-url-regexp nil
+ "Regexp matching the beginning of a well-formed URI.
+If nil, construct the regexp from `thing-at-point-uri-schemes'.")
+
(defvar thing-at-point-url-path-regexp
"[^]\t\n \"'<>[^`{}]*[^]\t\n \"'<>[^`{}.,;]+"
- "A regular expression probably matching the host and filename or e-mail part of a URL.")
+ "Regexp matching the host and filename or e-mail part of a URL.")
(defvar thing-at-point-short-url-regexp
(concat "[-A-Za-z0-9]+\\.[-A-Za-z0-9.]+" thing-at-point-url-path-regexp)
- "A regular expression probably matching a URL without an access scheme.
-Hostname matching is stricter in this case than for
-``thing-at-point-url-regexp''.")
+ "Regexp matching a URI without a scheme component.")
(defvar thing-at-point-uri-schemes
;; Officials from http://www.iana.org/assignments/uri-schemes.html
- '("ftp://" "http://" "gopher://" "mailto:" "news:" "nntp:"
- "telnet://" "wais://" "file:/" "prospero:" "z39.50s:" "z39.50r:"
- "cid:" "mid:" "vemmi:" "service:" "imap:" "nfs:" "acap:" "rtsp:"
- "tip:" "pop:" "data:" "dav:" "opaquelocktoken:" "sip:" "tel:" "fax:"
- "modem:" "ldap:" "https://" "soap.beep:" "soap.beeps:" "urn:" "go:"
- "afs:" "tn3270:" "mailserver:"
- "crid:" "dict:" "dns:" "dtn:" "h323:" "im:" "info:" "ipp:"
- "iris.beep:" "mtqp:" "mupdate:" "pres:" "sips:" "snmp:" "tag:"
- "tftp:" "xmlrpc.beep:" "xmlrpc.beeps:" "xmpp:"
- ;; Compatibility
- "snews:" "irc:" "mms://" "mmsh://")
- "Uniform Resource Identifier (URI) Schemes.")
-
-(defvar thing-at-point-url-regexp
- (concat "\\<\\(" (mapconcat 'identity thing-at-point-uri-schemes "\\|") "\\)"
- thing-at-point-url-path-regexp)
- "A regular expression probably matching a complete URL.")
-
-(defvar thing-at-point-markedup-url-regexp
- "<URL:[^>]+>"
- "A regular expression matching a URL marked up per RFC1738.
-This may contain whitespace (including newlines) .")
+ '("aaa://" "about:" "acap://" "apt:" "bzr://" "bzr+ssh://"
+ "attachment:/" "chrome://" "cid:" "content://" "crid://" "cvs://"
+ "data:" "dav:" "dict://" "doi:" "dns:" "dtn:" "feed:" "file:/"
+ "finger://" "fish://" "ftp://" "geo:" "git://" "go:" "gopher://"
+ "h323:" "http://" "https://" "im:" "imap://" "info:" "ipp:"
+ "irc://" "irc6://" "ircs://" "iris.beep:" "jar:" "ldap://"
+ "ldaps://" "mailto:" "mid:" "mtqp://" "mupdate://" "news:"
+ "nfs://" "nntp://" "opaquelocktoken:" "pop://" "pres:"
+ "resource://" "rmi://" "rsync://" "rtsp://" "rtspu://" "service:"
+ "sftp://" "sip:" "sips:" "smb://" "sms:" "snmp://" "soap.beep://"
+ "soap.beeps://" "ssh://" "svn://" "svn+ssh://" "tag:" "tel:"
+ "telnet://" "tftp://" "tip://" "tn3270://" "udp://" "urn:"
+ "uuid:" "vemmi://" "webcal://" "xri://" "xmlrpc.beep://"
+ "xmlrpc.beeps://" "z39.50r://" "z39.50s://" "xmpp:"
+ ;; Compatibility
+ "fax:" "mms://" "mmsh://" "modem:" "prospero:" "snews:"
+ "wais://")
+ "List of URI schemes recognized by `thing-at-point-url-at-point'.
+Each string in this list should correspond to the start of a
+URI's scheme component, up to and including the trailing // if
+the scheme calls for that to be present.")
+
+;; Matched case-insensitively by `thing-at-point--bounds-of-markedup-url',
+;; which skips the markup lookup entirely when this variable is nil.
+(defvar thing-at-point-markedup-url-regexp "<URL:\\([^<>\n]+\\)>"
+ "Regexp matching a URL marked up per RFC1738.
+This kind of markup was formerly recommended as a way to indicate
+URIs, but as of RFC 3986 it is no longer recommended.
+Subexpression 1 should contain the delimited URL.")
+
+;; Anchored at both ends: `thing-at-point-newsgroup-p' matches it
+;; against a whole candidate string, not against buffer text.
+(defvar thing-at-point-newsgroup-regexp
+ "\\`[[:lower:]]+\\.[-+[:lower:]_0-9.]+\\'"
+ "Regexp matching a newsgroup name.")
+
+;; `thing-at-point-newsgroup-p' compares the text before the first dot
+;; of a candidate name against the members of this list.
+(defvar thing-at-point-newsgroup-heads
+ '("alt" "comp" "gnu" "misc" "news" "sci" "soc" "talk")
+ "List of leading newsgroup name components, such as \"comp\".
+Used by `thing-at-point-newsgroup-p' if gnus is not running.")
+
+;; The value carries no trailing colon; `thing-at-point-url-at-point'
+;; inserts the \":\" itself when it prepends the scheme.
+(defvar thing-at-point-default-mail-uri-scheme "mailto"
+ "Default scheme for ill-formed URIs that look like <foo@example.com>.
+If nil, do not give such URIs a scheme.")
(put 'url 'bounds-of-thing-at-point 'thing-at-point-bounds-of-url-at-point)
-(defun thing-at-point-bounds-of-url-at-point ()
- (let ((strip (thing-at-point-looking-at
- thing-at-point-markedup-url-regexp))) ;; (url "") short
- (if (or strip
- (thing-at-point-looking-at thing-at-point-url-regexp)
- ;; Access scheme omitted?
- ;; (setq short (thing-at-point-looking-at
- ;; thing-at-point-short-url-regexp))
- )
- (let ((beginning (match-beginning 0))
- (end (match-end 0)))
- (when strip
- (setq beginning (+ beginning 5))
- (setq end (- end 1)))
- (cons beginning end)))))
+
+(defun thing-at-point-bounds-of-url-at-point (&optional lax)
+  "Return the bounds of the URI at point as a cons cell (START . END).
+A URI written in `thing-at-point-markedup-url-regexp' markup takes
+precedence.  Failing that, look for a well-formed URI, one that
+begins with `thing-at-point-beginning-of-url-regexp'.  If neither
+is found and the optional argument LAX is non-nil, return the
+bounds of the run of URI-like characters around point, which may
+be an ill-formed URI lacking a scheme.  Otherwise return nil."
+  (or
+   ;; The old <URL:foo> markup wins whenever it is present.
+   (thing-at-point--bounds-of-markedup-url)
+   ;; Otherwise delimit the candidate text in much the same way as
+   ;; `ffap-string-at-point'.  URIs may contain parentheses but may
+   ;; not contain spaces (RFC3986).
+   (let* ((uri-chars "--:=&?$+@-Z_[:alpha:]~#,%;*()!'")
+          (origin (point))
+          (start (save-excursion
+                   (skip-chars-backward uri-chars)
+                   ;; Step over leading characters outside the given
+                   ;; set, but never past the original point.
+                   (skip-chars-forward "^[0-9a-zA-Z]" origin)
+                   (point)))
+          (finish (save-excursion
+                    (skip-chars-forward uri-chars)
+                    ;; Step back over trailing punctuation, but never
+                    ;; before the original point.
+                    (skip-chars-backward ":;.,!?" origin)
+                    (point))))
+     (cond
+      ;; A clause with only a test returns the test's value.
+      ((thing-at-point--bounds-of-well-formed-url start finish origin))
+      (lax (cons start finish))))))
+
+(defun thing-at-point--bounds-of-markedup-url ()
+  "Return the bounds of the URL in <URL:...> markup at point, or nil.
+Search the current line for `thing-at-point-markedup-url-regexp',
+ignoring case, and use the first match that ends after point.  The
+value is a cons cell (START . END) delimiting subexpression 1 of
+that match.  Only the end of the match is compared with point.
+Return nil if there is no such match or if the regexp is nil."
+  (when thing-at-point-markedup-url-regexp
+    (let ((case-fold-search t)
+          (origin (point))
+          (bol (line-beginning-position))
+          (eol (line-end-position))
+          (hit nil))
+      (save-excursion
+        (goto-char bol)
+        ;; Keep searching while nothing has been accepted and the
+        ;; search position has passed neither point nor end of line.
+        (while (not (or hit
+                        (> (point) origin)
+                        (>= (point) eol)))
+          ;; A failed search moves to EOL (third argument 1), which
+          ;; terminates the loop.
+          (when (and (re-search-forward thing-at-point-markedup-url-regexp
+                                        eol 1)
+                     (> (point) origin))
+            (setq hit t))))
+      ;; The match data still describes the accepted match here.
+      (when hit
+        (cons (match-beginning 1) (match-end 1))))))
+
+(defun thing-at-point--bounds-of-well-formed-url (beg end pt)
+ "Return the bounds of a well-formed URI between BEG and END, or nil.
+Search the text from BEG to END for a scheme matching
+`thing-at-point-beginning-of-url-regexp' or, if that is nil, for
+any string in `thing-at-point-uri-schemes'.  If a scheme is found
+and some text follows it, return (URL-BEG . URL-END), where URL-BEG
+is the start of the scheme.  URL-END is END, unless the URI is
+directly preceded by an open parenthesis whose matching close
+parenthesis lies within the bounds, in which case URL-END is the
+position of that close parenthesis.
+PT is accepted but not used by this function."
+ (save-excursion
+ (goto-char beg)
+ (let (url-beg paren-end regexp)
+ (save-restriction
+ ;; Narrowing makes \\` in the regexp below match exactly at BEG.
+ (narrow-to-region beg end)
+ ;; The scheme component must either match at BEG, or be
+ ;; immediately preceded by a character that is not an ASCII
+ ;; letter or digit.
+ (setq regexp (concat "\\(?:\\`\\|[^a-zA-Z0-9]\\)\\("
+ (or thing-at-point-beginning-of-url-regexp
+ (regexp-opt thing-at-point-uri-schemes))
+ "\\)"))
+ (and (re-search-forward regexp end t)
+ ;; URI must have non-empty contents.
+ (< (point) end)
+ (setq url-beg (match-beginning 1))))
+ (when url-beg
+ ;; If there is an open paren before the URI, truncate to the
+ ;; matching close paren.  (Syntax class code 4 is "open
+ ;; parenthesis".)
+ (and (> url-beg (point-min))
+ (eq (car-safe (syntax-after (1- url-beg))) 4)
+ (save-restriction
+ ;; Restrict the scan so that only a close paren before END
+ ;; can be found; a failed scan yields nil and ends the `and'.
+ (narrow-to-region (1- url-beg) (min end (point-max)))
+ (setq paren-end (ignore-errors
+ (scan-lists (1- url-beg) 1 0))))
+ (not (blink-matching-check-mismatch (1- url-beg) paren-end))
+ ;; PAREN-END is just after the close paren; exclude the paren.
+ (setq end (1- paren-end)))
+ (cons url-beg end)))))
(put 'url 'thing-at-point 'thing-at-point-url-at-point)
-(defun thing-at-point-url-at-point ()
- "Return the URL around or before point.
-Search backwards for the start of a URL ending at or after point. If
-no URL found, return nil. The access scheme will be prepended if
-absent: \"mailto:\" if the string contains \"@\", \"ftp://\" if it
-starts with \"ftp\" and not \"ftp:/\", or \"http://\" by default."
-
- (let ((url "") short strip)
- (if (or (setq strip (thing-at-point-looking-at
- thing-at-point-markedup-url-regexp))
- (thing-at-point-looking-at thing-at-point-url-regexp)
- ;; Access scheme omitted?
- (setq short (thing-at-point-looking-at
- thing-at-point-short-url-regexp)))
- (progn
- (setq url (buffer-substring-no-properties (match-beginning 0)
- (match-end 0)))
- (and strip (setq url (substring url 5 -1))) ; Drop "<URL:" & ">"
- ;; strip whitespace
- (while (string-match "[ \t\n\r]+" url)
- (setq url (replace-match "" t t url)))
- (and short (setq url (concat (cond ((string-match "^[a-zA-Z]+:" url)
- ;; already has a URL scheme.
- "")
- ((string-match "@" url)
- "mailto:")
- ;; e.g. ftp.swiss... or ftp-swiss...
- ((string-match "^ftp" url)
- "ftp://")
- (t "http://"))
- url)))
- (if (string-equal "" url)
- nil
- url)))))
+(defun thing-at-point-url-at-point (&optional lax bounds)
+  "Return the URL around or before point.
+If no URL is found, return nil.
+
+If optional argument LAX is non-nil, look for URLs that are not
+well-formed, such as foo@bar or <nobody>.
+
+If optional argument BOUNDS is non-nil, it should be a cons
+cell of the form (START . END), containing the beginning and end
+positions of the URI.  Otherwise, these positions are detected
+automatically from the text around point.
+
+If the scheme component is absent, either because a URI delimited
+with <url:...> lacks one, or because an ill-formed URI was found
+with LAX or BOUNDS, try to add a scheme in the returned URI.
+The scheme is chosen heuristically: \"mailto:\" if the address
+looks like an email address, \"ftp://\" if it starts with
+\"ftp.\", etc.  An address of the form <foo@bar> is returned
+without a scheme if `thing-at-point-default-mail-uri-scheme' is nil."
+  (unless bounds
+    (setq bounds (thing-at-point-bounds-of-url-at-point lax)))
+  (when (and bounds (< (car bounds) (cdr bounds)))
+    (let ((str (buffer-substring-no-properties (car bounds) (cdr bounds))))
+      ;; If there is no scheme component, try to add one.
+      (unless (string-match "\\`[a-zA-Z][-a-zA-Z0-9+.]*:" str)
+        ;; The first alternative that returns non-nil decides the
+        ;; result; the last one rejects the string altogether.
+        (or
+         ;; If the URI has the form <foo@bar>, treat it according to
+         ;; `thing-at-point-default-mail-uri-scheme'.  If there are
+         ;; no angle brackets, it must be mailto.
+         (when (string-match "\\`[^:</>@]+@[-.0-9=&?$+A-Z_a-z~#,%;*]" str)
+           (let ((scheme (if (and (eq (char-before (car bounds)) ?<)
+                                  (eq (char-after (cdr bounds)) ?>))
+                             thing-at-point-default-mail-uri-scheme
+                           "mailto")))
+             ;; Always yield a non-nil value here.  With a nil default
+             ;; scheme the address is kept as is, as the variable's
+             ;; documentation promises, instead of falling through to
+             ;; the final clause and being discarded.
+             (setq str (if scheme (concat scheme ":" str) str))))
+         ;; If the string is like <FOO>, where FOO is an existing user
+         ;; name on the system, treat that as an email address.
+         (and (string-match "\\`[[:alnum:]]+\\'" str)
+              (eq (char-before (car bounds)) ?<)
+              (eq (char-after (cdr bounds)) ?>)
+              ;; "~FOO" expands to a directory only for a known user;
+              ;; otherwise the tilde survives in the expansion.
+              (not (string-match "~" (expand-file-name (concat "~" str))))
+              (setq str (concat "mailto:" str)))
+         ;; If it looks like news.example.com, treat it as news.
+         (if (thing-at-point-newsgroup-p str)
+             (setq str (concat "news:" str)))
+         ;; If it looks like ftp.example.com, treat it as ftp.
+         (if (string-match "\\`ftp\\." str)
+             (setq str (concat "ftp://" str)))
+         ;; If it looks like www.example.com, treat it as http.
+         (if (string-match "\\`www\\." str)
+             (setq str (concat "http://" str)))
+         ;; Otherwise, it just isn't a URI.
+         (setq str nil)))
+      str)))
+
+(defun thing-at-point-newsgroup-p (string)
+  "Return STRING if it looks like a newsgroup name, else nil.
+STRING must match `thing-at-point-newsgroup-regexp'.  It is then
+looked up in the Gnus group tables; if none of them can be
+consulted, it is accepted when the text before its first dot is a
+member of `thing-at-point-newsgroup-heads'."
+  (when (string-match thing-at-point-newsgroup-regexp string)
+    (let ((tables '(gnus-active-hashtb gnus-newsrc-hashtb gnus-killed-hashtb))
+          (fallback thing-at-point-newsgroup-heads)
+          result)
+      (while tables
+        (let ((table (pop tables)))
+          (condition-case nil
+              (progn
+                ;; Signals an error if TABLE is unbound or its value
+                ;; is not usable as an obarray; gnus-gethash is just
+                ;; a macro for intern-soft.
+                (when (and (symbol-value table)
+                           (intern-soft string (symbol-value table)))
+                  ;; Found: remember it and stop looking.
+                  (setq result string
+                        tables nil))
+                ;; Getting here without an error means gnus is
+                ;; running, so the fallback list must be ignored.
+                (setq fallback nil))
+            (error nil))))
+      ;; No table answered: fall back on the well-known name heads.
+      (when (and (not result)
+                 fallback
+                 (string-match "\\`\\([[:lower:]]+\\)\\." string)
+                 (member (substring string 0 (match-end 1)) fallback))
+        (setq result string))
+      result)))
+
+;; Movement operations for URLs, expressed through the generic
+;; thing-at-point movers so that they use the `url' bounds function.
+(put 'url 'end-op (lambda () (end-of-thing 'url)))
+
+;; This must call `beginning-of-thing': with `end-of-thing' here,
+;; moving to the beginning of a URL would go to its end instead.
+(put 'url 'beginning-op (lambda () (beginning-of-thing 'url)))
;; The normal thingatpt mechanism doesn't work for complex regexps.
;; This should work for almost any regexp wherever we are in the
@@ -372,19 +502,6 @@ point."
(goto-char match)
(looking-at regexp)))))
-(put 'url 'end-op
- (lambda ()
- (let ((bounds (thing-at-point-bounds-of-url-at-point)))
- (if bounds
- (goto-char (cdr bounds))
- (error "No URL here")))))
-(put 'url 'beginning-op
- (lambda ()
- (let ((bounds (thing-at-point-bounds-of-url-at-point)))
- (if bounds
- (goto-char (car bounds))
- (error "No URL here")))))
-
;; Email addresses
(defvar thing-at-point-email-regexp
"<?[-+_.~a-zA-Z][-+_.~:a-zA-Z0-9]*@[-.a-zA-Z0-9]+>?"