|
|
- ;;; org-ref-url-utils.el --- Utility functions to scrape DOIs from urls -*- lexical-binding: t; -*-
-
- ;; Copyright (C) 2015 John Kitchin
-
- ;; Author: John Kitchin <jkitchin@andrew.cmu.edu>
- ;; Keywords:
-
- ;; This program is free software; you can redistribute it and/or modify
- ;; it under the terms of the GNU General Public License as published by
- ;; the Free Software Foundation, either version 3 of the License, or
- ;; (at your option) any later version.
-
- ;; This program is distributed in the hope that it will be useful,
- ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;; GNU General Public License for more details.
-
- ;; You should have received a copy of the GNU General Public License
- ;; along with this program. If not, see <http://www.gnu.org/licenses/>.
-
- ;;; Commentary:
-
- ;; Drag a webpage onto a bibtex file to insert a bibtex entry.
-
- ;; This works by scraping DOIs from the content at the URL using patterns in
- ;; `org-ref-doi-regexps'. If one DOI is found, it is added as an entry. If
- ;; multiple DOIs are found, you will get a helm selection buffer to choose what
- ;; you want to add. You can add new patterns to `org-ref-doi-regexps'.
-
- ;; You can press Control to "debug" a URL, which will open a buffer of the
- ;; content with the current DOI patterns highlighted. If you want to get all the
- ;; DOIs at a URL, you can press Meta during the drag-n-drop.
-
- ;; You can also insert formatted bibtex entries using the
- ;; `org-ref-url-html-to-bibtex' command, which converts a web page to
- ;; bibtex or biblatex entry using URL. The org-cliplink package can
- ;; help cleanup HTML code. Installing it is recommended.
-
- ;;; Code:
- (defvar org-ref-bibliography-entry-format)
- (defvar org-ref-get-pdf-filename-function)
- (defvar org-ref-notes-function)
- (defvar org-ref-cite-types)
- (defvar org-cliplink-escape-alist)
-
- (declare-function 'org-ref-get-bibtex-key-and-file "org-ref-core.el")
- (declare-function 'org-ref-find-bibliography "org-ref-core.el")
- (declare-function 'org-ref-key-in-file-p "org-ref-core.el")
- (declare-function 'org-ref-bib-citation "org-ref-core.el")
- (declare-function 'org-ref-get-bibtex-key-under-cursor "org-ref-core.el")
-
- (require 'doi-utils)
- (require 'f)
- (require 's)
- (eval-when-compile
- (require 'cl-lib))
-
- (defgroup org-ref-url nil
- "Customization group for org-ref-url-utils"
- :tag "Org Ref URL"
- :group 'org-ref-url-utils)
-
-
- (defcustom org-ref-doi-regexps
- '("scheme=\"doi\" content=\"\\([^\"]*\\)\""
- "citation_doi\" content=\"\\([^\"]*\\)\""
- "data-doi=\"\\([^\"]*\\)\""
- "content=\"\\([^\"]*\\)\" name=\"citation_doi"
- "objectDOI\" : \"\\([^\"]*\\)\""
- "doi = '\\([^']*\\)'"
- "\"http://dx.doi.org/\\([^\"]*\\)\""
- "/doi/\\([^\"]*\\)\">"
- "doi/full/\\(.*\\)&"
- "doi=\\([^&]*\\)&")
- "List of regexps to match a DOI.
- The doi should be in group 1 so that (match-string 1) contains
- the DOI."
- :type '(repeat regexp)
- :group 'org-ref-url-utils)
-
- (defvar org-ref-url-title-re
- "<title.?+?>\\([[:ascii:][:nonascii:]]*?\\|.+\\)</title>"
- "Regular expression for matching title.")
-
- (defvar org-ref-url-author-re
- "<meta name=\"author\" content=\"\\(.+\\)\"\s?/?>"
- "Regular expression for matching author.")
-
- (defvar org-ref-url-date-re
- "<[a-z].+ class=\\(.?+date.[^>]*\\)>\\([[:ascii:][:nonascii:]]*?\\)</[a-z].+>"
- "Regular expression for matching date.")
-
- (defvar org-ref-url-bibtex-template
- "@misc{key,
- title = {${:title}},
- author = {${:author}},
- howpublished = {${:url}},
- year = {${:year}},
- note = {Online; accessed ${:urldate}}
- }"
- "BibTeX entry template for online sources.")
-
- (defvar org-ref-url-biblatex-template
- "@online{key,
- title = {${:title}},
- author = {${:author}},
- url = {${:url}}
- year = {${:year}},
- urldate = {Online; accessed ${:urldate}}
- }"
- "Biblatex entry template for online sources.")
-
-
- ;;* Scrape DOIs from a URL
-
- (defun org-ref-url-scrape-dois (url)
- "Scrape all dois from a URL matching a pattern in `org-ref-doi-regexps'.
- Returns a list of collected DOIs in the order found."
- (let ((dois '()))
- (with-current-buffer (url-retrieve-synchronously url)
- (cl-loop for doi-pattern in org-ref-doi-regexps
- do
- (goto-char (point-min))
- (while (re-search-forward doi-pattern nil t)
- (cl-pushnew (match-string 1) dois :test #'equal)))
- (reverse dois))))
-
-
- (defun org-ref-url-add-doi-entries (_)
- "Add all entries for CANDIDATE in `helm-marked-candidates'.
- This is used in a helm selection command in `org-ref-url-dnd-protocol'."
- (cl-loop for doi in (helm-marked-candidates)
- do
- (doi-utils-add-bibtex-entry-from-doi
- doi
- (buffer-file-name))
- ;; this removes two blank lines before each entry.
- (bibtex-beginning-of-entry)
- (delete-char -2)))
-
-
- (defun org-ref-url-dnd-protocol (url action)
- "Protocol function for use in `dnd-protocol-alist'.
- We scrape DOIs from the url first. If there is one, we add it. If
- there is more than one, we offer a helm buffer of selections. If
- no DOI is found, we create a misc entry, with a prompt for a key."
- ;; make sure we are on a bib-file
- (if (and (buffer-file-name)
- (f-ext? (buffer-file-name) "bib"))
- (let ((dois (org-ref-url-scrape-dois url)))
- (cond
- ;; One doi found. Assume it is what we want.
- ((= 1 (length dois))
- (doi-utils-add-bibtex-entry-from-doi
- (car dois)
- (buffer-file-name))
- action)
- ;; Multiple DOIs found
- ((> (length dois) 1)
- (helm :sources
- `((name . "Select a DOI")
- (candidates . ,(let ((dois '()))
- (with-current-buffer (url-retrieve-synchronously url)
- (cl-loop for doi-pattern in org-ref-doi-regexps
- do
- (goto-char (point-min))
- (while (re-search-forward doi-pattern nil t)
- (cl-pushnew
- ;; Cut off the doi, sometimes
- ;; false matches are long.
- (cons (format "%40s | %s"
- (substring
- (match-string 1)
- 0 (min
- (length (match-string 1))
- 40))
- doi-pattern)
- (match-string 1))
- dois
- :test #'equal)))
- (reverse dois))))
- (action . org-ref-url-add-doi-entries)))
- action)
- ;; No DOIs found, add a misc entry.
- (t
- (goto-char (point-max))
- (insert (format "\n@misc{,
- url = {%s},
- note = {Last accessed %s}
- }"
- url
- (current-time-string)))
- (bibtex-clean-entry)
- action)))
- ;; pass back to dnd. Copied from `org-download-dnd'. Apparently
- ;; returning nil does not do this.
- (let ((dnd-protocol-alist
- (rassq-delete-all
- 'org-ref-url-dnd-protocol
- (copy-alist dnd-protocol-alist))))
- (dnd-handle-one-url nil action url))))
-
-
- (add-to-list 'dnd-protocol-alist '("^https?" . org-ref-url-dnd-protocol))
-
- ;;* Enable a DOI to be dragged onto a bibtex buffer
- (defun org-ref-doi-dnd-protocol (doi action)
- "Protocol for when a doi is dragged onto a bibtex file.
- A doi will be either doi:10.xxx or 10.xxx."
- (if (and (buffer-file-name)
- (f-ext? (buffer-file-name) "bib"))
- (let ((doi (dnd-unescape-uri doi)))
- ;; Get the actual doi now
- (string-match "\\(?:DOI\\|doi\\)?:? *\\(10.*\\)" doi)
- (setq doi (match-string 1 doi))
- (when doi
- (doi-add-bibtex-entry doi (buffer-file-name))
- (save-buffer)
- action))
- ;; not on a bib file
- (let ((dnd-protocol-alist
- (rassq-delete-all
- 'org-ref-url-dnd-protocol
- (copy-alist dnd-protocol-alist))))
- (dnd-handle-one-url nil action doi))))
-
- (add-to-list 'dnd-protocol-alist '("^doi" . org-ref-doi-dnd-protocol))
- (add-to-list 'dnd-protocol-alist '("^10" . org-ref-doi-dnd-protocol))
-
-
- ;;* Debug URL in a buffer with C-dnd
-
- ;; You can use this to see if there are any DOIs in a URL, and to use re-builder
- ;; to add new patterns to `org-ref-doi-regexps'.
- ;;;###autoload
- (defun org-ref-url-debug-url (url)
- "Open a buffer to URL with all doi patterns highlighted."
- (interactive)
- (switch-to-buffer
- (url-retrieve-synchronously url))
- (highlight-regexp
- (mapconcat 'identity org-ref-doi-regexps "\\|")))
-
-
- ;;;###autoload
- (defun org-ref-url-dnd-debug (event)
- "Drag-n-drop function to debug a url."
- (interactive "e")
- (org-ref-url-debug-url (cadr (car (last event)))))
-
-
- (define-key bibtex-mode-map (kbd "<C-drag-n-drop>") 'org-ref-url-dnd-debug)
-
- ;;* Add all DOI bibtex entries with M-dnd
-
- (defun org-ref-url-add-all-doi-entries (url)
- "Add all DOI bibtex entries for URL."
- (cl-loop for doi in (org-ref-url-scrape-dois url)
- do
- (ignore-errors
- (doi-utils-add-bibtex-entry-from-doi
- doi
- (buffer-file-name))
- ;; this removes two blank lines before each entry.
- (bibtex-beginning-of-entry)
- (delete-char -2))))
-
-
- ;;;###autoload
- (defun org-ref-url-dnd-all (event)
- "Drag-n-drop function to get all DOI bibtex entries for a url.
- You probably do not want to do this since the DOI patterns are
- not perfect, and some hits are not actually DOIs."
- (interactive "e")
- (org-ref-url-add-all-doi-entries (cadr (car (last event)))))
-
- (define-key bibtex-mode-map (kbd "<M-drag-n-drop>") 'org-ref-url-dnd-all)
-
-
- ;; Get first DOI if there is one with s-dnd
- (defun org-ref-url-add-first-doi-entry (url)
- "Add first DOI bibtex entry for URL if there is one."
- (let* ((dois (org-ref-url-scrape-dois url))
- (doi (car dois)))
- (if doi
- (progn
- (doi-utils-add-bibtex-entry-from-doi
- doi
- (buffer-file-name))
- ;; this removes two blank lines before each entry.
- (bibtex-beginning-of-entry)
- (delete-char -2))
- ;; no doi, add misc
- (goto-char (point-max))
- (insert (format "\n@misc{,
- url = {%s},
- note = {Last accessed %s}
- }"
- url
- (current-time-string)))
- (bibtex-clean-entry))))
-
-
- ;;;###autoload
- (defun org-ref-url-dnd-first (event)
- "Drag-n-drop function to download the first DOI in a url."
- (interactive "e")
- (org-ref-url-add-first-doi-entry (cadr (car (last event)))))
-
- (define-key bibtex-mode-map (kbd "<s-drag-n-drop>") 'org-ref-url-dnd-first)
-
-
- ;; HTML to BibTeX functions
-
- (defun org-ref-url-html-replace (string)
- "Replace HTML entities in STRING with their unicode equivalent."
- (let (result
- (case-fold-search nil))
- (with-temp-buffer
- (insert string)
- (mapc (lambda (char)
- (goto-char (point-min))
- (while (re-search-forward (car char) nil t)
- (replace-match (cdr char))))
- org-cliplink-escape-alist)
- (setq result (buffer-substring (point-min) (point-max))))
- result))
-
-
- (defun org-ref-url-add-nil (list)
- "Add nil to all missing keys in LIST."
- (let (newlist)
- (mapc (lambda (key)
- (unless (alist-get key list)
- (push (cons key "nil") newlist)))
- (list :title :author :url :urldate :year))
- (append list newlist)))
-
-
- (defun org-ref-url-html-read (url)
- "Read URL content and return fields.
- Fields include author, title, url, urldate, and year."
- ;; Start with fields we already know
- (let ((fields `((:url . ,url)
- (:urldate . ,(format-time-string "%d %B %Y")))))
- (with-current-buffer
- (url-retrieve-synchronously url t t)
-
- ;; find pubdate
- (goto-char (point-min))
- (when (re-search-forward org-ref-url-date-re nil t)
- (let ((string (match-string 2)))
- (when (string-match "\\([0-9]\\{4\\}\\)" string)
- (push (cons :year (match-string 1 string)) fields))))
-
- ;; find author
- (goto-char (point-min))
- (when (re-search-forward org-ref-url-author-re nil t)
- (push (cons :author (match-string 1)) fields))
-
- ;; find title
- (goto-char (point-min))
- (when (re-search-forward org-ref-url-title-re nil t)
- (push (cons :title
- (s-trim (decode-coding-string (match-string 1) 'utf-8)))
- fields)))
-
- ;; Finally add nil value to missing fields
- (org-ref-url-add-nil fields)))
-
-
- ;;;###autoload
- (defun org-ref-url-html-to-bibtex (bibfile &optional url)
- "Convert URL to a bibtex or biblatex entry in BIBFILE.
- If URL is the first in the kill ring, use it. Otherwise, prompt for
- one in the minibuffer."
- (interactive (if (-contains? (org-ref-find-bibliography) (buffer-file-name))
- (list (buffer-file-name))
- (list (completing-read "Bibtex file: " (org-ref-find-bibliography)))))
- (let ((url (if url url
- (if (s-match "^http" (current-kill 0 'do-not-move))
- (format "%s" (current-kill 0 'do-not-move))
- (read-from-minibuffer "URL: ")))))
- (with-current-buffer
- (find-file-noselect bibfile)
- ;; Maybe check dialect if set as local variable
- (let* ((dialect bibtex-dialect)
- (alist (org-ref-url-html-read url))
- (entry (s-format
- ;; Check dialect and format entry accordingly
- (if (eq dialect 'biblatex)
- org-ref-url-biblatex-template
- org-ref-url-bibtex-template)
- 'aget alist)))
- (goto-char (point-max))
- ;; Place new entry one line after the last entry.
- (while (not (looking-back "^}\n" 2))
- (delete-char -1))
- (insert "\n")
- (insert (if (require 'org-cliplink nil 'noerror)
- ;; Sanitize values by replacing html entities
- (org-ref-url-html-replace entry)
- entry))
- (bibtex-beginning-of-entry)
- (org-ref-clean-bibtex-entry)))))
-
- (provide 'org-ref-url-utils)
- ;;; org-ref-url-utils.el ends here
|