Klimi's new dotfiles with stow.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

408 lines
13 KiB

5 years ago
  1. ;;; org-ref-url-utils.el --- Utility functions to scrape DOIs from urls -*- lexical-binding: t; -*-
  2. ;; Copyright (C) 2015 John Kitchin
  3. ;; Author: John Kitchin <jkitchin@andrew.cmu.edu>
  4. ;; Keywords:
  5. ;; This program is free software; you can redistribute it and/or modify
  6. ;; it under the terms of the GNU General Public License as published by
  7. ;; the Free Software Foundation, either version 3 of the License, or
  8. ;; (at your option) any later version.
  9. ;; This program is distributed in the hope that it will be useful,
  10. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. ;; GNU General Public License for more details.
  13. ;; You should have received a copy of the GNU General Public License
  14. ;; along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ;;; Commentary:
  16. ;; Drag a webpage onto a bibtex file to insert a bibtex entry.
  17. ;; This works by scraping DOIs from the content at the URL using patterns in
  18. ;; `org-ref-doi-regexps'. If one DOI is found, it is added as an entry. If
  19. ;; multiple DOIs are found, you will get a helm selection buffer to choose what
  20. ;; you want to add. You can add new patterns to `org-ref-doi-regexps'.
  21. ;; You can press Control to "debug" a URL, which will open a buffer of the
  22. ;; content with the current DOI patterns highlighted. If you want to get all the
  23. ;; DOIs at a URL, you can press Meta during the drag-n-drop.
  24. ;; You can also insert formatted bibtex entries using the
  25. ;; `org-ref-url-html-to-bibtex' command, which converts a web page to
  26. ;; bibtex or biblatex entry using URL. The org-cliplink package can
  27. ;; help clean up HTML code. Installing it is recommended.
  28. ;;; Code:
  ;; Variables owned by other libraries.  They are declared here without a
  ;; value only so the byte compiler treats them as special variables.
  (defvar org-ref-bibliography-entry-format)
  (defvar org-ref-get-pdf-filename-function)
  (defvar org-ref-notes-function)
  (defvar org-ref-cite-types)
  (defvar org-cliplink-escape-alist)

  ;; `declare-function' is a macro whose first argument is the UNQUOTED
  ;; function name.  A quoted symbol is read as the list (quote NAME), which
  ;; never matches a real function, so the declaration silently does nothing.
  (declare-function org-ref-get-bibtex-key-and-file "org-ref-core.el")
  (declare-function org-ref-find-bibliography "org-ref-core.el")
  (declare-function org-ref-key-in-file-p "org-ref-core.el")
  (declare-function org-ref-bib-citation "org-ref-core.el")
  (declare-function org-ref-get-bibtex-key-under-cursor "org-ref-core.el")

  (require 'doi-utils)
  (require 'f)
  (require 's)
  ;; Required at run time, not only at compile time: when this file is loaded
  ;; uncompiled, `cl-pushnew' with a :test keyword expands to a call to the
  ;; `cl-adjoin' function.
  (require 'cl-lib)
  ;; `bibtex-mode-map' and `dnd-protocol-alist' are modified at top level
  ;; below, so the libraries defining them must be loaded first.
  (require 'bibtex)
  (require 'dnd)
  ;; Customize group for this library; it is registered as a member of the
  ;; `org-ref-url-utils' group.
  (defgroup org-ref-url nil
    "Customization group for org-ref-url-utils"
    :group 'org-ref-url-utils
    :tag "Org Ref URL")
  (defcustom org-ref-doi-regexps
    '("scheme=\"doi\" content=\"\\([^\"]*\\)\""
      "citation_doi\" content=\"\\([^\"]*\\)\""
      "data-doi=\"\\([^\"]*\\)\""
      "content=\"\\([^\"]*\\)\" name=\"citation_doi"
      "objectDOI\" : \"\\([^\"]*\\)\""
      "doi = '\\([^']*\\)'"
      "\"http://dx.doi.org/\\([^\"]*\\)\""
      "/doi/\\([^\"]*\\)\">"
      "doi/full/\\(.*\\)&"
      "doi=\\([^&]*\\)&amp")
    "List of regexps to match a DOI.
The doi should be in group 1 so that (match-string 1) contains
the DOI.  The patterns are searched one after another over the
raw content retrieved from a URL."
    :type '(repeat regexp)
    ;; File the option under the group this library actually defines with
    ;; `defgroup' so that it shows up when customizing `org-ref-url'.
    :group 'org-ref-url)
  ;; Group 1 is the text between the opening <title ...> tag and </title>.
  ;; The first alternative is built from character classes, so it can also
  ;; match titles that contain newlines.
  (defvar org-ref-url-title-re
    "<title.?+?>\\([[:ascii:][:nonascii:]]*?\\|.+\\)</title>"
    "Regular expression for matching title.
Group 1 holds the title text; `org-ref-url-html-read' trims and
decodes it.")
  ;; Group 1 is the value of the content attribute of <meta name="author">.
  ;; Note that "\s" inside an Emacs Lisp string is a literal space character,
  ;; so the tail of the pattern allows an optional space and an optional "/"
  ;; before the closing ">".
  (defvar org-ref-url-author-re
    "<meta name=\"author\" content=\"\\(.+\\)\"\s?/?>"
    "Regular expression for matching author.
Group 1 holds the author string.")
  ;; Matches an element whose class attribute contains "date".
  ;; Group 1 is the class attribute text, group 2 is the element content.
  ;; `org-ref-url-html-read' searches group 2 for a four-digit year.
  (defvar org-ref-url-date-re
    "<[a-z].+ class=\\(.?+date.[^>]*\\)>\\([[:ascii:][:nonascii:]]*?\\)</[a-z].+>"
    "Regular expression for matching date.
Group 2 holds the content of the matched element.")
  ;; Skeleton used by `org-ref-url-html-to-bibtex' when the bibliography buffer
  ;; is not in the biblatex dialect.  The ${:name} placeholders are filled in
  ;; by `s-format' from the alist returned by `org-ref-url-html-read'.
  (defvar org-ref-url-bibtex-template
    "@misc{key,
title = {${:title}},
author = {${:author}},
howpublished = {${:url}},
year = {${:year}},
note = {Online; accessed ${:urldate}}
}"
    "BibTeX entry template for online sources.
The placeholders ${:title}, ${:author}, ${:url}, ${:year} and
${:urldate} are substituted when an entry is created.")
  ;; Skeleton used by `org-ref-url-html-to-bibtex' when the bibliography buffer
  ;; uses the biblatex dialect.  The ${:name} placeholders are filled in by
  ;; `s-format' from the alist returned by `org-ref-url-html-read'.
  ;; Every field except the last must end with a comma; the url field used to
  ;; lack one, which produced an entry that does not parse.
  (defvar org-ref-url-biblatex-template
    "@online{key,
title = {${:title}},
author = {${:author}},
url = {${:url}},
year = {${:year}},
urldate = {Online; accessed ${:urldate}}
}"
    "Biblatex entry template for online sources.
The placeholders ${:title}, ${:author}, ${:url}, ${:year} and
${:urldate} are substituted when an entry is created.")
  91. ;;* Scrape DOIs from a URL
  (defun org-ref-url-scrape-dois (url)
    "Scrape all DOIs from URL matching a pattern in `org-ref-doi-regexps'.
The patterns are tried in list order, each one over the whole
retrieved content.  Return a list of the distinct DOIs (group 1 of
each match) in the order they were found.  Return nil when URL
yields no content.  The retrieval buffer is killed before
returning."
    (let ((buffer (url-retrieve-synchronously url))
          (dois '()))
      ;; `url-retrieve-synchronously' returns nil when there is no data.
      (when (buffer-live-p buffer)
        (unwind-protect
            (with-current-buffer buffer
              (dolist (doi-pattern org-ref-doi-regexps)
                (goto-char (point-min))
                (while (re-search-forward doi-pattern nil t)
                  (let ((doi (match-string 1)))
                    (unless (member doi dois)
                      (push doi dois))))))
          ;; Do not leave one hidden buffer behind per scraped URL.
          (kill-buffer buffer)))
      (nreverse dois)))
  (defun org-ref-url-add-doi-entries (_)
    "Insert a bibtex entry for every DOI in `helm-marked-candidates'.
The argument is ignored.  Entries are added to the file visited by
the current buffer.  This is the helm action used by
`org-ref-url-dnd-protocol'."
    (dolist (doi (helm-marked-candidates))
      (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name))
      ;; Drop the two characters (blank lines) that precede the new entry.
      (bibtex-beginning-of-entry)
      (delete-char -2)))
  (defun org-ref-url--doi-candidates (url)
    "Return helm candidates for the DOIs scraped from URL.
Each candidate is a cons cell (DISPLAY . DOI), where DISPLAY shows
the DOI (cut to 40 characters) and the pattern from
`org-ref-doi-regexps' that matched it.  Candidates are returned in
the order found, without duplicates.  Return nil when URL yields
no content.  The retrieval buffer is killed before returning."
    (let ((buffer (url-retrieve-synchronously url))
          (candidates '()))
      ;; `url-retrieve-synchronously' returns nil when there is no data.
      (when (buffer-live-p buffer)
        (unwind-protect
            (with-current-buffer buffer
              (dolist (doi-pattern org-ref-doi-regexps)
                (goto-char (point-min))
                (while (re-search-forward doi-pattern nil t)
                  (let* ((doi (match-string 1))
                         (candidate
                          (cons (format "%40s | %s"
                                        ;; Cut off the doi, sometimes
                                        ;; false matches are long.
                                        (substring doi 0 (min (length doi) 40))
                                        doi-pattern)
                                doi)))
                    (unless (member candidate candidates)
                      (push candidate candidates))))))
          (kill-buffer buffer)))
      (nreverse candidates)))

  (defun org-ref-url-dnd-protocol (url action)
    "Protocol function for use in `dnd-protocol-alist'.
URL is the dropped url and ACTION the drop action; ACTION is
returned after the drop has been handled in a bib file.

We scrape DOIs from the url first.  If there is one, we add it.  If
there is more than one, we offer a helm buffer of selections.  If
no DOI is found, we create a misc entry, with a prompt for a key.

The url is retrieved only once, and the retrieval buffer is not
kept around.  When the current buffer is not visiting a .bib file
the url is handed back to the remaining dnd handlers."
    ;; make sure we are on a bib-file
    (if (and (buffer-file-name)
             (f-ext? (buffer-file-name) "bib"))
        (let* ((candidates (org-ref-url--doi-candidates url))
               ;; The same DOI can be hit by several patterns; keep the first.
               (dois (delete-dups (mapcar #'cdr candidates))))
          (cond
           ;; One doi found. Assume it is what we want.
           ((= 1 (length dois))
            (doi-utils-add-bibtex-entry-from-doi
             (car dois)
             (buffer-file-name))
            action)
           ;; Multiple DOIs found
           ((> (length dois) 1)
            (helm :sources
                  `((name . "Select a DOI")
                    (candidates . ,candidates)
                    (action . org-ref-url-add-doi-entries)))
            action)
           ;; No DOIs found, add a misc entry.
           (t
            (goto-char (point-max))
            (insert (format "\n@misc{,
url = {%s},
note = {Last accessed %s}
}"
                            url
                            (current-time-string)))
            (bibtex-clean-entry)
            action)))
      ;; pass back to dnd. Copied from `org-download-dnd'. Apparently
      ;; returning nil does not do this.
      (let ((dnd-protocol-alist
             (rassq-delete-all
              'org-ref-url-dnd-protocol
              (copy-alist dnd-protocol-alist))))
        (dnd-handle-one-url nil action url))))

  (add-to-list 'dnd-protocol-alist '("^https?" . org-ref-url-dnd-protocol))
  175. ;;* Enable a DOI to be dragged onto a bibtex buffer
  (defun org-ref-doi-dnd-protocol (doi action)
    "Protocol for when a doi is dragged onto a bibtex file.
A doi will be either doi:10.xxx or 10.xxx.

When the current buffer visits a .bib file and DOI contains a doi,
add its entry, save the buffer and return ACTION.  Return nil when
no doi can be extracted from DOI.  When the buffer is not a bib
file, hand DOI back to the remaining dnd handlers."
    (if (and (buffer-file-name)
             (f-ext? (buffer-file-name) "bib"))
        (let ((doi (dnd-unescape-uri doi)))
          ;; Get the actual doi now.  Only use the match data when the
          ;; match succeeded; otherwise it would be left over from some
          ;; unrelated earlier search.
          (when (string-match "\\(?:DOI\\|doi\\)?:? *\\(10.*\\)" doi)
            (doi-add-bibtex-entry (match-string 1 doi) (buffer-file-name))
            (save-buffer)
            action))
      ;; not on a bib file.  Remove THIS handler before re-dispatching;
      ;; otherwise dnd would select it again and recurse forever.
      (let ((dnd-protocol-alist
             (rassq-delete-all
              'org-ref-doi-dnd-protocol
              (copy-alist dnd-protocol-alist))))
        (dnd-handle-one-url nil action doi))))

  (add-to-list 'dnd-protocol-alist '("^doi" . org-ref-doi-dnd-protocol))
  (add-to-list 'dnd-protocol-alist '("^10" . org-ref-doi-dnd-protocol))
  197. ;;* Debug URL in a buffer with C-dnd
  198. ;; You can use this to see if there are any DOIs in a URL, and to use re-builder
  199. ;; to add new patterns to `org-ref-doi-regexps'.
  200. ;;;###autoload
  (defun org-ref-url-debug-url (url)
    "Open a buffer to URL with all doi patterns highlighted.
Interactively, prompt for URL.  The buffer shows the raw retrieved
content, with every match of a pattern in `org-ref-doi-regexps'
highlighted.  Signal a `user-error' when URL yields no content."
    ;; URL is a required argument, so the interactive spec has to supply it.
    (interactive "sURL: ")
    (let ((buffer (url-retrieve-synchronously url)))
      ;; `switch-to-buffer' with nil would silently pick some other buffer.
      (unless (buffer-live-p buffer)
        (user-error "Could not retrieve %s" url))
      (switch-to-buffer buffer)
      (highlight-regexp
       (mapconcat #'identity org-ref-doi-regexps "\\|"))))
  208. ;;;###autoload
  (defun org-ref-url-dnd-debug (event)
    "Debug the url carried by the drag-n-drop EVENT.
The url is the second element of the last item of EVENT; it is
passed to `org-ref-url-debug-url'."
    (interactive "e")
    (let* ((payload (car (last event)))
           (url (nth 1 payload)))
      (org-ref-url-debug-url url)))

  ;; Control + drop on a bibtex buffer opens the debug view.
  (define-key bibtex-mode-map (kbd "<C-drag-n-drop>") #'org-ref-url-dnd-debug)
  214. ;;* Add all DOI bibtex entries with M-dnd
  (defun org-ref-url-add-all-doi-entries (url)
    "Add a bibtex entry for every DOI scraped from URL.
Entries go into the file visited by the current buffer.  Errors
raised while adding a single entry are ignored, and the remaining
DOIs are still processed."
    (dolist (doi (org-ref-url-scrape-dois url))
      (ignore-errors
        (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name))
        ;; Drop the two characters (blank lines) that precede the new entry.
        (bibtex-beginning-of-entry)
        (delete-char -2))))
  226. ;;;###autoload
  (defun org-ref-url-dnd-all (event)
    "Add bibtex entries for all DOIs at the url in drag-n-drop EVENT.
The url is the second element of the last item of EVENT.  You
probably do not want to do this since the DOI patterns are not
perfect, and some hits are not actually DOIs."
    (interactive "e")
    (let* ((payload (car (last event)))
           (url (nth 1 payload)))
      (org-ref-url-add-all-doi-entries url)))

  ;; Meta + drop on a bibtex buffer adds every scraped DOI.
  (define-key bibtex-mode-map (kbd "<M-drag-n-drop>") #'org-ref-url-dnd-all)
  234. ;; Get first DOI if there is one with s-dnd
  (defun org-ref-url-add-first-doi-entry (url)
    "Add a bibtex entry for the first DOI scraped from URL.
The entry goes into the file visited by the current buffer.  When
no DOI is found, append a @misc entry holding URL and the current
time to the end of the buffer and run `bibtex-clean-entry' on it."
    (let ((doi (car (org-ref-url-scrape-dois url))))
      (cond
       (doi
        (doi-utils-add-bibtex-entry-from-doi doi (buffer-file-name))
        ;; Drop the two characters (blank lines) that precede the new entry.
        (bibtex-beginning-of-entry)
        (delete-char -2))
       ;; no doi, add misc
       (t
        (goto-char (point-max))
        (insert (format "\n@misc{,
url = {%s},
note = {Last accessed %s}
}"
                        url
                        (current-time-string)))
        (bibtex-clean-entry)))))
  256. ;;;###autoload
  (defun org-ref-url-dnd-first (event)
    "Add the first DOI found at the url in drag-n-drop EVENT.
The url is the second element of the last item of EVENT; it is
passed to `org-ref-url-add-first-doi-entry'."
    (interactive "e")
    (let* ((payload (car (last event)))
           (url (nth 1 payload)))
      (org-ref-url-add-first-doi-entry url)))

  ;; Super + drop on a bibtex buffer adds only the first scraped DOI.
  (define-key bibtex-mode-map (kbd "<s-drag-n-drop>") #'org-ref-url-dnd-first)
  262. ;; HTML to BibTeX functions
  (defun org-ref-url-html-replace (string)
    "Replace HTML entities in STRING with their unicode equivalent.
Each element of `org-cliplink-escape-alist' is treated as a cons
cell (REGEXP . REPLACEMENT).  Every match of REGEXP, searched case
sensitively, is replaced by REPLACEMENT exactly as written.
Return the resulting string."
    (let ((case-fold-search nil))
      (with-temp-buffer
        (insert string)
        (dolist (escape org-cliplink-escape-alist)
          (goto-char (point-min))
          (while (re-search-forward (car escape) nil t)
            ;; FIXEDCASE and LITERAL are both non-nil: without LITERAL a
            ;; replacement of "&" stands for the matched text itself (so
            ;; "&amp;" would never change) and "\\" starts an escape;
            ;; without FIXEDCASE the replacement's case could be altered
            ;; to mimic the matched text.
            (replace-match (cdr escape) t t)))
        (buffer-string))))
  (defun org-ref-url-add-nil (list)
    "Return LIST extended with a \"nil\" entry for each missing field.
The fields checked are :title, :author, :url, :urldate and :year.
A field counts as missing when `alist-get' returns nil for it; it
is then added with the string \"nil\" as its value.  The added
entries come after the original elements of LIST."
    (let ((missing '()))
      (dolist (key '(:title :author :url :urldate :year))
        (when (null (alist-get key list))
          (push (cons key "nil") missing)))
      (append list missing)))
  (defun org-ref-url-html-read (url)
    "Read URL content and return fields.
The result is an alist with the keys :url, :urldate, :year,
:author and :title.  :url is URL itself and :urldate is today's
date; the other fields are scraped from the retrieved content
with `org-ref-url-date-re', `org-ref-url-author-re' and
`org-ref-url-title-re'.  Fields that cannot be found, including
all scraped fields when URL yields no content, are filled in by
`org-ref-url-add-nil'.  The retrieval buffer is killed before
returning."
    ;; Start with fields we already know
    (let ((fields `((:url . ,url)
                    (:urldate . ,(format-time-string "%d %B %Y"))))
          (buffer (url-retrieve-synchronously url t t)))
      ;; `url-retrieve-synchronously' returns nil when there is no data.
      (when (buffer-live-p buffer)
        (unwind-protect
            (with-current-buffer buffer
              ;; find pubdate
              (goto-char (point-min))
              (when (re-search-forward org-ref-url-date-re nil t)
                (let ((string (match-string 2)))
                  (when (string-match "\\([0-9]\\{4\\}\\)" string)
                    (push (cons :year (match-string 1 string)) fields))))
              ;; find author.  Decode it the same way as the title so that
              ;; non-ASCII names do not end up as raw bytes.
              (goto-char (point-min))
              (when (re-search-forward org-ref-url-author-re nil t)
                (push (cons :author
                            (decode-coding-string (match-string 1) 'utf-8))
                      fields))
              ;; find title
              (goto-char (point-min))
              (when (re-search-forward org-ref-url-title-re nil t)
                (push (cons :title
                            (s-trim (decode-coding-string (match-string 1) 'utf-8)))
                      fields)))
          ;; Do not leave one hidden buffer behind per url read.
          (kill-buffer buffer)))
      ;; Finally add nil value to missing fields
      (org-ref-url-add-nil fields)))
  310. ;;;###autoload
  (defun org-ref-url-html-to-bibtex (bibfile &optional url)
    "Convert URL to a bibtex or biblatex entry in BIBFILE.
If URL is nil and the first entry in the kill ring starts with
\"http\", use that.  Otherwise, prompt for one in the minibuffer.

Interactively, BIBFILE is the current file when it is one of the
files returned by `org-ref-find-bibliography'; otherwise it is
read with completion from that list.

The entry is built from `org-ref-url-biblatex-template' when
`bibtex-dialect' in BIBFILE's buffer is `biblatex', and from
`org-ref-url-bibtex-template' otherwise.  It is appended after
the existing content, separated by one blank line, and then
cleaned with `org-ref-clean-bibtex-entry'."
    (interactive (if (-contains? (org-ref-find-bibliography) (buffer-file-name))
                     (list (buffer-file-name))
                   (list (completing-read "Bibtex file: " (org-ref-find-bibliography)))))
    (let ((url (or url
                   ;; `current-kill' signals an error when the kill ring is
                   ;; empty; fall back to prompting in that case.
                   (let ((kill (ignore-errors (current-kill 0 'do-not-move))))
                     (if (and kill (s-match "^http" kill))
                         (format "%s" kill)
                       (read-from-minibuffer "URL: "))))))
      (with-current-buffer
          (find-file-noselect bibfile)
        ;; Maybe check dialect if set as local variable
        (let* ((dialect bibtex-dialect)
               (alist (org-ref-url-html-read url))
               (entry (s-format
                       ;; Check dialect and format entry accordingly
                       (if (eq dialect 'biblatex)
                           org-ref-url-biblatex-template
                         org-ref-url-bibtex-template)
                       'aget alist)))
          (goto-char (point-max))
          ;; Place new entry one line after the last entry.  Only trailing
          ;; whitespace is removed: deleting backwards until a line holding
          ;; just "}" is found would eat real content (or hit the beginning
          ;; of the buffer) whenever the file does not end in "}\n".
          (skip-chars-backward " \t\r\n")
          (delete-region (point) (point-max))
          (unless (bobp)
            (insert "\n\n"))
          (insert (if (require 'org-cliplink nil 'noerror)
                      ;; Sanitize values by replacing html entities
                      (org-ref-url-html-replace entry)
                    entry))
          (bibtex-beginning-of-entry)
          (org-ref-clean-bibtex-entry)))))
  344. (provide 'org-ref-url-utils)
  345. ;;; org-ref-url-utils.el ends here