Klimi's new dotfiles with stow.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

408 lines
13 KiB

;;; org-ref-url-utils.el --- Utility functions to scrape DOIs from urls -*- lexical-binding: t; -*-
;; Copyright (C) 2015 John Kitchin
;; Author: John Kitchin <jkitchin@andrew.cmu.edu>
;; Keywords:
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; Drag a webpage onto a bibtex file to insert a bibtex entry.
;; This works by scraping DOIs from the content at the URL using patterns in
;; `org-ref-doi-regexps'. If one DOI is found, it is added as an entry. If
;; multiple DOIs are found, you will get a helm selection buffer to choose what
;; you want to add. You can add new patterns to `org-ref-doi-regexps'.
;; You can press Control to "debug" a URL, which will open a buffer of the
;; content with the current DOI patterns highlighted. If you want to get all the
;; DOIs at a URL, you can press Meta during the drag-n-drop.
;; You can also insert formatted bibtex entries using the
;; `org-ref-url-html-to-bibtex' command, which converts a web page to
;; bibtex or biblatex entry using URL. The org-cliplink package can
;; help cleanup HTML code. Installing it is recommended.
;;; Code:
(defvar org-ref-bibliography-entry-format)
(defvar org-ref-get-pdf-filename-function)
(defvar org-ref-notes-function)
(defvar org-ref-cite-types)
(defvar org-cliplink-escape-alist)
(declare-function 'org-ref-get-bibtex-key-and-file "org-ref-core.el")
(declare-function 'org-ref-find-bibliography "org-ref-core.el")
(declare-function 'org-ref-key-in-file-p "org-ref-core.el")
(declare-function 'org-ref-bib-citation "org-ref-core.el")
(declare-function 'org-ref-get-bibtex-key-under-cursor "org-ref-core.el")
(require 'doi-utils)
(require 'f)
(require 's)
(eval-when-compile
(require 'cl-lib))
(defgroup org-ref-url nil
"Customization group for org-ref-url-utils"
:tag "Org Ref URL"
:group 'org-ref-url-utils)
(defcustom org-ref-doi-regexps
'("scheme=\"doi\" content=\"\\([^\"]*\\)\""
"citation_doi\" content=\"\\([^\"]*\\)\""
"data-doi=\"\\([^\"]*\\)\""
"content=\"\\([^\"]*\\)\" name=\"citation_doi"
"objectDOI\" : \"\\([^\"]*\\)\""
"doi = '\\([^']*\\)'"
"\"http://dx.doi.org/\\([^\"]*\\)\""
"/doi/\\([^\"]*\\)\">"
"doi/full/\\(.*\\)&"
"doi=\\([^&]*\\)&amp")
"List of regexps to match a DOI.
The doi should be in group 1 so that (match-string 1) contains
the DOI."
:type '(repeat regexp)
:group 'org-ref-url-utils)
(defvar org-ref-url-title-re
"<title.?+?>\\([[:ascii:][:nonascii:]]*?\\|.+\\)</title>"
"Regular expression for matching title.")
(defvar org-ref-url-author-re
"<meta name=\"author\" content=\"\\(.+\\)\"\s?/?>"
"Regular expression for matching author.")
(defvar org-ref-url-date-re
"<[a-z].+ class=\\(.?+date.[^>]*\\)>\\([[:ascii:][:nonascii:]]*?\\)</[a-z].+>"
"Regular expression for matching date.")
(defvar org-ref-url-bibtex-template
"@misc{key,
title = {${:title}},
author = {${:author}},
howpublished = {${:url}},
year = {${:year}},
note = {Online; accessed ${:urldate}}
}"
"BibTeX entry template for online sources.")
(defvar org-ref-url-biblatex-template
"@online{key,
title = {${:title}},
author = {${:author}},
url = {${:url}}
year = {${:year}},
urldate = {Online; accessed ${:urldate}}
}"
"Biblatex entry template for online sources.")
;;* Scrape DOIs from a URL
(defun org-ref-url-scrape-dois (url)
"Scrape all dois from a URL matching a pattern in `org-ref-doi-regexps'.
Returns a list of collected DOIs in the order found."
(let ((dois '()))
(with-current-buffer (url-retrieve-synchronously url)
(cl-loop for doi-pattern in org-ref-doi-regexps
do
(goto-char (point-min))
(while (re-search-forward doi-pattern nil t)
(cl-pushnew (match-string 1) dois :test #'equal)))
(reverse dois))))
(defun org-ref-url-add-doi-entries (_)
"Add all entries for CANDIDATE in `helm-marked-candidates'.
This is used in a helm selection command in `org-ref-url-dnd-protocol'."
(cl-loop for doi in (helm-marked-candidates)
do
(doi-utils-add-bibtex-entry-from-doi
doi
(buffer-file-name))
;; this removes two blank lines before each entry.
(bibtex-beginning-of-entry)
(delete-char -2)))
(defun org-ref-url-dnd-protocol (url action)
"Protocol function for use in `dnd-protocol-alist'.
We scrape DOIs from the url first. If there is one, we add it. If
there is more than one, we offer a helm buffer of selections. If
no DOI is found, we create a misc entry, with a prompt for a key."
;; make sure we are on a bib-file
(if (and (buffer-file-name)
(f-ext? (buffer-file-name) "bib"))
(let ((dois (org-ref-url-scrape-dois url)))
(cond
;; One doi found. Assume it is what we want.
((= 1 (length dois))
(doi-utils-add-bibtex-entry-from-doi
(car dois)
(buffer-file-name))
action)
;; Multiple DOIs found
((> (length dois) 1)
(helm :sources
`((name . "Select a DOI")
(candidates . ,(let ((dois '()))
(with-current-buffer (url-retrieve-synchronously url)
(cl-loop for doi-pattern in org-ref-doi-regexps
do
(goto-char (point-min))
(while (re-search-forward doi-pattern nil t)
(cl-pushnew
;; Cut off the doi, sometimes
;; false matches are long.
(cons (format "%40s | %s"
(substring
(match-string 1)
0 (min
(length (match-string 1))
40))
doi-pattern)
(match-string 1))
dois
:test #'equal)))
(reverse dois))))
(action . org-ref-url-add-doi-entries)))
action)
;; No DOIs found, add a misc entry.
(t
(goto-char (point-max))
(insert (format "\n@misc{,
url = {%s},
note = {Last accessed %s}
}"
url
(current-time-string)))
(bibtex-clean-entry)
action)))
;; pass back to dnd. Copied from `org-download-dnd'. Apparently
;; returning nil does not do this.
(let ((dnd-protocol-alist
(rassq-delete-all
'org-ref-url-dnd-protocol
(copy-alist dnd-protocol-alist))))
(dnd-handle-one-url nil action url))))
(add-to-list 'dnd-protocol-alist '("^https?" . org-ref-url-dnd-protocol))
;;* Enable a DOI to be dragged onto a bibtex buffer
(defun org-ref-doi-dnd-protocol (doi action)
"Protocol for when a doi is dragged onto a bibtex file.
A doi will be either doi:10.xxx or 10.xxx."
(if (and (buffer-file-name)
(f-ext? (buffer-file-name) "bib"))
(let ((doi (dnd-unescape-uri doi)))
;; Get the actual doi now
(string-match "\\(?:DOI\\|doi\\)?:? *\\(10.*\\)" doi)
(setq doi (match-string 1 doi))
(when doi
(doi-add-bibtex-entry doi (buffer-file-name))
(save-buffer)
action))
;; not on a bib file
(let ((dnd-protocol-alist
(rassq-delete-all
'org-ref-url-dnd-protocol
(copy-alist dnd-protocol-alist))))
(dnd-handle-one-url nil action doi))))
(add-to-list 'dnd-protocol-alist '("^doi" . org-ref-doi-dnd-protocol))
(add-to-list 'dnd-protocol-alist '("^10" . org-ref-doi-dnd-protocol))
;;* Debug URL in a buffer with C-dnd
;; You can use this to see if there are any DOIs in a URL, and to use re-builder
;; to add new patterns to `org-ref-doi-regexps'.
;;;###autoload
(defun org-ref-url-debug-url (url)
"Open a buffer to URL with all doi patterns highlighted."
(interactive)
(switch-to-buffer
(url-retrieve-synchronously url))
(highlight-regexp
(mapconcat 'identity org-ref-doi-regexps "\\|")))
;;;###autoload
(defun org-ref-url-dnd-debug (event)
"Drag-n-drop function to debug a url."
(interactive "e")
(org-ref-url-debug-url (cadr (car (last event)))))
(define-key bibtex-mode-map (kbd "<C-drag-n-drop>") 'org-ref-url-dnd-debug)
;;* Add all DOI bibtex entries with M-dnd
(defun org-ref-url-add-all-doi-entries (url)
"Add all DOI bibtex entries for URL."
(cl-loop for doi in (org-ref-url-scrape-dois url)
do
(ignore-errors
(doi-utils-add-bibtex-entry-from-doi
doi
(buffer-file-name))
;; this removes two blank lines before each entry.
(bibtex-beginning-of-entry)
(delete-char -2))))
;;;###autoload
(defun org-ref-url-dnd-all (event)
"Drag-n-drop function to get all DOI bibtex entries for a url.
You probably do not want to do this since the DOI patterns are
not perfect, and some hits are not actually DOIs."
(interactive "e")
(org-ref-url-add-all-doi-entries (cadr (car (last event)))))
(define-key bibtex-mode-map (kbd "<M-drag-n-drop>") 'org-ref-url-dnd-all)
;; Get first DOI if there is one with s-dnd
(defun org-ref-url-add-first-doi-entry (url)
"Add first DOI bibtex entry for URL if there is one."
(let* ((dois (org-ref-url-scrape-dois url))
(doi (car dois)))
(if doi
(progn
(doi-utils-add-bibtex-entry-from-doi
doi
(buffer-file-name))
;; this removes two blank lines before each entry.
(bibtex-beginning-of-entry)
(delete-char -2))
;; no doi, add misc
(goto-char (point-max))
(insert (format "\n@misc{,
url = {%s},
note = {Last accessed %s}
}"
url
(current-time-string)))
(bibtex-clean-entry))))
;;;###autoload
(defun org-ref-url-dnd-first (event)
"Drag-n-drop function to download the first DOI in a url."
(interactive "e")
(org-ref-url-add-first-doi-entry (cadr (car (last event)))))
(define-key bibtex-mode-map (kbd "<s-drag-n-drop>") 'org-ref-url-dnd-first)
;; HTML to BibTeX functions
(defun org-ref-url-html-replace (string)
"Replace HTML entities in STRING with their unicode equivalent."
(let (result
(case-fold-search nil))
(with-temp-buffer
(insert string)
(mapc (lambda (char)
(goto-char (point-min))
(while (re-search-forward (car char) nil t)
(replace-match (cdr char))))
org-cliplink-escape-alist)
(setq result (buffer-substring (point-min) (point-max))))
result))
(defun org-ref-url-add-nil (list)
"Add nil to all missing keys in LIST."
(let (newlist)
(mapc (lambda (key)
(unless (alist-get key list)
(push (cons key "nil") newlist)))
(list :title :author :url :urldate :year))
(append list newlist)))
(defun org-ref-url-html-read (url)
"Read URL content and return fields.
Fields include author, title, url, urldate, and year."
;; Start with fields we already know
(let ((fields `((:url . ,url)
(:urldate . ,(format-time-string "%d %B %Y")))))
(with-current-buffer
(url-retrieve-synchronously url t t)
;; find pubdate
(goto-char (point-min))
(when (re-search-forward org-ref-url-date-re nil t)
(let ((string (match-string 2)))
(when (string-match "\\([0-9]\\{4\\}\\)" string)
(push (cons :year (match-string 1 string)) fields))))
;; find author
(goto-char (point-min))
(when (re-search-forward org-ref-url-author-re nil t)
(push (cons :author (match-string 1)) fields))
;; find title
(goto-char (point-min))
(when (re-search-forward org-ref-url-title-re nil t)
(push (cons :title
(s-trim (decode-coding-string (match-string 1) 'utf-8)))
fields)))
;; Finally add nil value to missing fields
(org-ref-url-add-nil fields)))
;;;###autoload
(defun org-ref-url-html-to-bibtex (bibfile &optional url)
"Convert URL to a bibtex or biblatex entry in BIBFILE.
If URL is the first in the kill ring, use it. Otherwise, prompt for
one in the minibuffer."
(interactive (if (-contains? (org-ref-find-bibliography) (buffer-file-name))
(list (buffer-file-name))
(list (completing-read "Bibtex file: " (org-ref-find-bibliography)))))
(let ((url (if url url
(if (s-match "^http" (current-kill 0 'do-not-move))
(format "%s" (current-kill 0 'do-not-move))
(read-from-minibuffer "URL: ")))))
(with-current-buffer
(find-file-noselect bibfile)
;; Maybe check dialect if set as local variable
(let* ((dialect bibtex-dialect)
(alist (org-ref-url-html-read url))
(entry (s-format
;; Check dialect and format entry accordingly
(if (eq dialect 'biblatex)
org-ref-url-biblatex-template
org-ref-url-bibtex-template)
'aget alist)))
(goto-char (point-max))
;; Place new entry one line after the last entry.
(while (not (looking-back "^}\n" 2))
(delete-char -1))
(insert "\n")
(insert (if (require 'org-cliplink nil 'noerror)
;; Sanitize values by replacing html entities
(org-ref-url-html-replace entry)
entry))
(bibtex-beginning-of-entry)
(org-ref-clean-bibtex-entry)))))
(provide 'org-ref-url-utils)
;;; org-ref-url-utils.el ends here