;;; org-ref-url-utils.el --- Utility functions to scrape DOIs from urls  -*- lexical-binding: t; -*-

;; Copyright (C) 2015 John Kitchin

;; Author: John Kitchin
;; Keywords:

;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

;; Drag a webpage onto a bibtex file to insert a bibtex entry.

;; This works by scraping DOIs from the content at the URL using the patterns
;; in `org-ref-doi-regexps'.  If one DOI is found, it is added as an entry.  If
;; multiple DOIs are found, you get a helm selection buffer to choose what you
;; want to add.  You can add new patterns to `org-ref-doi-regexps'.

;; You can hold Control during the drag-n-drop to "debug" a URL, which opens a
;; buffer of the content with the current DOI patterns highlighted.  If you
;; want to add all the DOIs found at a URL, hold Meta during the drag-n-drop.

;; You can also insert a formatted bibtex entry with the
;; `org-ref-url-html-to-bibtex' command, which converts a web page to a bibtex
;; or biblatex entry from its URL.  The org-cliplink package, if installed, is
;; used to clean up HTML entities in the generated entry; installing it is
;; recommended.

;;; Code:
(defvar org-ref-bibliography-entry-format)
(defvar org-ref-get-pdf-filename-function)
(defvar org-ref-notes-function)
(defvar org-ref-cite-types)
(defvar org-cliplink-escape-alist)

(declare-function org-ref-get-bibtex-key-and-file "org-ref-core.el")
(declare-function org-ref-find-bibliography "org-ref-core.el")
(declare-function org-ref-key-in-file-p "org-ref-core.el")
(declare-function org-ref-bib-citation "org-ref-core.el")
(declare-function org-ref-get-bibtex-key-under-cursor "org-ref-core.el")

(require 'doi-utils)
(require 'f)
(require 's)

(eval-when-compile
  (require 'cl-lib))

(defgroup org-ref-url nil
  "Customization group for org-ref-url-utils."
  :tag "Org Ref URL"
  :group 'org-ref)

(defcustom org-ref-doi-regexps
  '("scheme=\"doi\" content=\"\\([^\"]*\\)\""
    "citation_doi\" content=\"\\([^\"]*\\)\""
    "data-doi=\"\\([^\"]*\\)\""
    "content=\"\\([^\"]*\\)\" name=\"citation_doi"
    "objectDOI\" : \"\\([^\"]*\\)\""
    "doi = '\\([^']*\\)'"
    "\"http://dx.doi.org/\\([^\"]*\\)\""
    "/doi/\\([^\"]*\\)\">"
    "doi/full/\\(.*\\)&"
    "doi=\\([^&]*\\)&")
  "List of regexps to match a DOI.
The DOI should be in group 1 so that (match-string 1) contains
the DOI."
  :type '(repeat regexp)
  :group 'org-ref-url)
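;; A minimal sketch of extending the DOI scraping, assuming a hypothetical
;; publisher page that carries the DOI in a "data-article-doi" attribute (the
;; attribute name and URL below are made up for illustration).  Evaluate forms
;; like these from your init file or interactively, not in this library:
;;
;;   ;; add a site-specific pattern; group 1 must capture the DOI
;;   (add-to-list 'org-ref-doi-regexps "data-article-doi=\"\\([^\"]*\\)\"")
;;
;;   ;; see what the current patterns pick up on a page
;;   (org-ref-url-scrape-dois "https://publisher.example/article/123")
;;
;; `M-x re-builder' in the buffer produced by `org-ref-url-debug-url' is a
;; convenient way to develop such a pattern before adding it.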
(defvar org-ref-url-title-re
  "<title[^>]*>\\([[:ascii:][:nonascii:]]*?\\|.+\\)</title>"
  "Regular expression for matching title.")

(defvar org-ref-url-author-re
  "<meta name=\"author\" content=\"\\(.+\\)\" ?/?>"
  "Regular expression for matching author.")

(defvar org-ref-url-date-re
  "<[a-z].+ class=\\(.?+date.[^>]*\\)>\\([[:ascii:][:nonascii:]]*?\\)</[a-z].+>"
  "Regular expression for matching date.")

(defvar org-ref-url-bibtex-template "@misc{key,
  title = {${:title}},
  author = {${:author}},
  howpublished = {${:url}},
  year = {${:year}},
  note = {Online; accessed ${:urldate}}
}"
  "BibTeX entry template for online sources.")

(defvar org-ref-url-biblatex-template "@online{key,
  title = {${:title}},
  author = {${:author}},
  url = {${:url}},
  year = {${:year}},
  urldate = {Online; accessed ${:urldate}}
}"
  "Biblatex entry template for online sources.")

;;* Scrape DOIs from a URL
(defun org-ref-url-scrape-dois (url)
  "Scrape all DOIs from URL matching a pattern in `org-ref-doi-regexps'.
Return a list of the collected DOIs in the order found."
  (let ((dois '()))
    (with-current-buffer (url-retrieve-synchronously url)
      (cl-loop for doi-pattern in org-ref-doi-regexps
               do
               (goto-char (point-min))
               (while (re-search-forward doi-pattern nil t)
                 (cl-pushnew (match-string 1) dois :test #'equal)))
      (reverse dois))))


(defun org-ref-url-add-doi-entries (_)
  "Add a bibtex entry for each DOI in `helm-marked-candidates'.
The selected candidate is ignored.  This is used as a helm action
in `org-ref-url-dnd-protocol'."
  (cl-loop for doi in (helm-marked-candidates)
           do
           (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name))
           ;; this removes two blank lines before each entry.
           (bibtex-beginning-of-entry)
           (delete-char -2)))


(defun org-ref-url-dnd-protocol (url action)
  "Handle URL dropped onto a buffer with drag-n-drop ACTION.
For use in `dnd-protocol-alist'.  We scrape DOIs from URL first.
If there is one, we add it.  If there is more than one, we offer
a helm buffer of selections.  If no DOI is found, we create a
misc entry, with a prompt for a key."
  ;; make sure we are on a bib-file
  (if (and (buffer-file-name)
           (f-ext? (buffer-file-name) "bib"))
      (let ((dois (org-ref-url-scrape-dois url)))
        (cond
         ;; One doi found. Assume it is what we want.
         ((= 1 (length dois))
          (doi-utils-add-bibtex-entry-from-doi (car dois) (buffer-file-name))
          action)
         ;; Multiple DOIs found
         ((> (length dois) 1)
          (helm :sources
                `((name . "Select a DOI")
                  (candidates . ,(let ((dois '()))
                                   (with-current-buffer (url-retrieve-synchronously url)
                                     (cl-loop for doi-pattern in org-ref-doi-regexps
                                              do
                                              (goto-char (point-min))
                                              (while (re-search-forward doi-pattern nil t)
                                                (cl-pushnew
                                                 ;; Cut off the doi, sometimes
                                                 ;; false matches are long.
                                                 (cons (format "%40s | %s"
                                                               (substring
                                                                (match-string 1)
                                                                0 (min (length (match-string 1)) 40))
                                                               doi-pattern)
                                                       (match-string 1))
                                                 dois
                                                 :test #'equal)))
                                     (reverse dois))))
                  (action . org-ref-url-add-doi-entries)))
          action)
         ;; No DOIs found, add a misc entry.
         (t
          (goto-char (point-max))
          (insert (format "\n@misc{,
  url = {%s},
  note = {Last accessed %s}
}"
                          url
                          (current-time-string)))
          (bibtex-clean-entry)
          action)))
    ;; pass back to dnd. Copied from `org-download-dnd'. Apparently
    ;; returning nil does not do this.
    (let ((dnd-protocol-alist
           (rassq-delete-all
            'org-ref-url-dnd-protocol
            (copy-alist dnd-protocol-alist))))
      (dnd-handle-one-url nil action url))))


(add-to-list 'dnd-protocol-alist '("^https?" . org-ref-url-dnd-protocol))
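;; The same machinery can be exercised without drag-n-drop; a minimal sketch,
;; assuming point is in a .bib buffer and using a hypothetical URL:
;;
;;   (let ((dois (org-ref-url-scrape-dois "https://publisher.example/article/123")))
;;     (when (= 1 (length dois))
;;       (doi-utils-add-bibtex-entry-from-doi (car dois) (buffer-file-name))))
;;
;; This mirrors the single-DOI branch of `org-ref-url-dnd-protocol' above.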
(buffer-file-name) "bib")) (let ((doi (dnd-unescape-uri doi))) ;; Get the actual doi now (string-match "\\(?:DOI\\|doi\\)?:? *\\(10.*\\)" doi) (setq doi (match-string 1 doi)) (when doi (doi-add-bibtex-entry doi (buffer-file-name)) (save-buffer) action)) ;; not on a bib file (let ((dnd-protocol-alist (rassq-delete-all 'org-ref-url-dnd-protocol (copy-alist dnd-protocol-alist)))) (dnd-handle-one-url nil action doi)))) (add-to-list 'dnd-protocol-alist '("^doi" . org-ref-doi-dnd-protocol)) (add-to-list 'dnd-protocol-alist '("^10" . org-ref-doi-dnd-protocol)) ;;* Debug URL in a buffer with C-dnd ;; You can use this to see if there are any DOIs in a URL, and to use re-builder ;; to add new patterns to `org-ref-doi-regexps'. ;;;###autoload (defun org-ref-url-debug-url (url) "Open a buffer to URL with all doi patterns highlighted." (interactive) (switch-to-buffer (url-retrieve-synchronously url)) (highlight-regexp (mapconcat 'identity org-ref-doi-regexps "\\|"))) ;;;###autoload (defun org-ref-url-dnd-debug (event) "Drag-n-drop function to debug a url." (interactive "e") (org-ref-url-debug-url (cadr (car (last event))))) (define-key bibtex-mode-map (kbd "") 'org-ref-url-dnd-debug) ;;* Add all DOI bibtex entries with M-dnd (defun org-ref-url-add-all-doi-entries (url) "Add all DOI bibtex entries for URL." (cl-loop for doi in (org-ref-url-scrape-dois url) do (ignore-errors (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name)) ;; this removes two blank lines before each entry. (bibtex-beginning-of-entry) (delete-char -2)))) ;;;###autoload (defun org-ref-url-dnd-all (event) "Drag-n-drop function to get all DOI bibtex entries for a url. You probably do not want to do this since the DOI patterns are not perfect, and some hits are not actually DOIs." (interactive "e") (org-ref-url-add-all-doi-entries (cadr (car (last event))))) (define-key bibtex-mode-map (kbd "") 'org-ref-url-dnd-all) ;; Get first DOI if there is one with s-dnd (defun org-ref-url-add-first-doi-entry (url) "Add first DOI bibtex entry for URL if there is one." (let* ((dois (org-ref-url-scrape-dois url)) (doi (car dois))) (if doi (progn (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name)) ;; this removes two blank lines before each entry. (bibtex-beginning-of-entry) (delete-char -2)) ;; no doi, add misc (goto-char (point-max)) (insert (format "\n@misc{, url = {%s}, note = {Last accessed %s} }" url (current-time-string))) (bibtex-clean-entry)))) ;;;###autoload (defun org-ref-url-dnd-first (event) "Drag-n-drop function to download the first DOI in a url." (interactive "e") (org-ref-url-add-first-doi-entry (cadr (car (last event))))) (define-key bibtex-mode-map (kbd "") 'org-ref-url-dnd-first) ;; HTML to BibTeX functions (defun org-ref-url-html-replace (string) "Replace HTML entities in STRING with their unicode equivalent." (let (result (case-fold-search nil)) (with-temp-buffer (insert string) (mapc (lambda (char) (goto-char (point-min)) (while (re-search-forward (car char) nil t) (replace-match (cdr char)))) org-cliplink-escape-alist) (setq result (buffer-substring (point-min) (point-max)))) result)) (defun org-ref-url-add-nil (list) "Add nil to all missing keys in LIST." (let (newlist) (mapc (lambda (key) (unless (alist-get key list) (push (cons key "nil") newlist))) (list :title :author :url :urldate :year)) (append list newlist))) (defun org-ref-url-html-read (url) "Read URL content and return fields. Fields include author, title, url, urldate, and year." ;; Start with fields we already know (let ((fields `((:url . 
(defun org-ref-url-html-read (url)
  "Read URL content and return an alist of fields.
Fields include :author, :title, :url, :urldate, and :year."
  ;; Start with fields we already know
  (let ((fields `((:url . ,url)
                  (:urldate . ,(format-time-string "%d %B %Y")))))
    (with-current-buffer (url-retrieve-synchronously url t t)
      ;; find pubdate
      (goto-char (point-min))
      (when (re-search-forward org-ref-url-date-re nil t)
        (let ((string (match-string 2)))
          (when (string-match "\\([0-9]\\{4\\}\\)" string)
            (push (cons :year (match-string 1 string)) fields))))
      ;; find author
      (goto-char (point-min))
      (when (re-search-forward org-ref-url-author-re nil t)
        (push (cons :author (match-string 1)) fields))
      ;; find title
      (goto-char (point-min))
      (when (re-search-forward org-ref-url-title-re nil t)
        (push (cons :title (s-trim (decode-coding-string (match-string 1) 'utf-8)))
              fields)))
    ;; Finally add nil values for missing fields
    (org-ref-url-add-nil fields)))


;;;###autoload
(defun org-ref-url-html-to-bibtex (bibfile &optional url)
  "Convert URL to a bibtex or biblatex entry in BIBFILE.
If URL is nil and the most recent kill is a URL, use it.
Otherwise, prompt for one in the minibuffer."
  (interactive
   (if (-contains? (org-ref-find-bibliography) (buffer-file-name))
       (list (buffer-file-name))
     (list (completing-read "Bibtex file: " (org-ref-find-bibliography)))))
  (let ((url (if url
                 url
               (if (s-match "^http" (current-kill 0 'do-not-move))
                   (format "%s" (current-kill 0 'do-not-move))
                 (read-from-minibuffer "URL: ")))))
    (with-current-buffer (find-file-noselect bibfile)
      ;; Maybe check dialect if set as local variable
      (let* ((dialect bibtex-dialect)
             (alist (org-ref-url-html-read url))
             (entry (s-format
                     ;; Check dialect and format entry accordingly
                     (if (eq dialect 'biblatex)
                         org-ref-url-biblatex-template
                       org-ref-url-bibtex-template)
                     'aget alist)))
        (goto-char (point-max))
        ;; Place new entry one line after the last entry.
        (while (not (looking-back "^}\n" 2))
          (delete-char -1))
        (insert "\n")
        (insert (if (require 'org-cliplink nil 'noerror)
                    ;; Sanitize values by replacing html entities
                    (org-ref-url-html-replace entry)
                  entry))
        (bibtex-beginning-of-entry)
        (org-ref-clean-bibtex-entry)))))

(provide 'org-ref-url-utils)
;;; org-ref-url-utils.el ends here