forked from skeeto/elfeed
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xml-query.el
231 lines (183 loc) · 8.06 KB
/
xml-query.el
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
;;; xml-query.el --- query engine complimenting the xml package
;; This is free and unencumbered software released into the public domain.
;;; Commentary:
;; This provides a very rudimentary, jQuery-like, XML selector
;; s-expression language. It operates on the output of the xml
;; package, such as `xml-parse-region' and `xml-parse-file'. It was
;; written to support Elfeed.
;; See the docstring for `xml-query-all'.
;; The macro forms, `xml-query*' and `xml-query-all*', are an order of
;; magnitude faster, but only work on static selectors and need the
;; namespaces to be pre-stripped.
;; Examples:
;; This query grabs the top-level paragraph content from XHTML.
;; (xml-query-all '(html body p *) xhtml)
;; This query extracts all the links from an Atom feed.
;; (xml-query-all '(feed entry link [rel "alternate"] :href) xml)
;;; Code:
(require 'cl-lib)
(defun xml-query-strip-ns (tag)
"Remove the namespace, if any, from TAG."
(when (symbolp tag)
(let ((name (symbol-name tag)))
(if (cl-find ?\: name)
(intern (replace-regexp-in-string "^.+:" "" name))
tag))))
(defun xml-query--tag-all (match xml)
(cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
when (or (eq tag match) (eq (xml-query-strip-ns tag) match))
collect (cons tag (cons attribs content))))
(defun xml-query--attrib-all (attrib value xml)
(cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
when (equal (cdr (assoc attrib attribs)) value)
collect (cons tag (cons attribs content))))
(defun xml-query--keyword (matcher xml)
(cl-loop with match = (intern (substring (symbol-name matcher) 1))
for (tag attribs . content) in (cl-remove-if-not #'listp xml)
when (cdr (assoc match attribs))
collect it))
(defun xml-query--symbol (matcher xml)
(xml-query--tag-all matcher xml))
(defun xml-query--vector (matcher xml)
(let ((attrib (aref matcher 0))
(value (aref matcher 1)))
(xml-query--attrib-all attrib value xml)))
(defun xml-query--list (matchers xml)
(cl-loop for matcher in matchers
append (xml-query-all (if (listp matcher)
matcher
(list matcher)) xml)))
(defun xml-query--append (xml)
(cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
append content))
(defun xml-query--stringp (thing)
"Return non-nil of THING is a non-blank string."
(and (stringp thing) (string-match "[^ \t\r\n]" thing)))
(defun xml-query-all (query xml)
"Given a list of tags, XML, apply QUERY and return a list of
matching tags.
A query is a list of matchers.
- SYMBOL: filters to matching tags
- LIST: each element is a full sub-query, whose results are concatenated
- VECTOR: filters to tags with matching attribute, [tag attrib value]
- KEYWORD: filters to an attribute value (must be last)
- * (an asterisk symbol): filters to content strings (must be last)
For example, to find all the 'alternate' link URL in a typical
Atom feed:
(xml-query-all '(feed entry link [rel \"alternate\"] :href) xml)"
(if (null query)
xml
(cl-destructuring-bind (matcher . rest) query
(cond
((keywordp matcher) (xml-query--keyword matcher xml))
((eq matcher '*)
(cl-remove-if-not #'xml-query--stringp (xml-query--append xml)))
(:else
(let ((matches
(cl-etypecase matcher
(symbol (xml-query--symbol matcher xml))
(vector (xml-query--vector matcher xml))
(list (xml-query--list matcher xml)))))
(cond
((null rest) matches)
((and (or (symbolp (car rest))
(listp (car rest)))
(not (keywordp (car rest)))
(not (eq '* (car rest))))
(xml-query-all (cdr query) (xml-query--append matches)))
(:else (xml-query-all rest matches)))))))))
(defun xml-query (query xml)
"Like `xml-query-all' but only return the first result."
(let ((result (xml-query-all query xml)))
(if (xml-query--stringp result)
result
(car (xml-query-all query xml)))))
;; Macro alternatives:
;; This is a slightly less capable alternative with significantly
;; better performance (x10 speedup) that requires a static selector.
;; The selector is compiled into Lisp code via macro at compile-time,
;; which is then carried through to byte-code by the compiler. In
;; byte-code form, the macro performs no function calls other than
;; `throw' in the case of `xml-query*', where it's invoked less than
;; once per evaluation (only on success).
;; Queries are compiled tail-to-head with a result handler at the
;; deepest level. The generated code makes multiple bindings of the
;; variable "v" as it dives deeper into the query, using the layers of
;; bindings as a breadcrumb stack.
;; For `xml-query*', which has a single result, the whole expression
;; is wrapped in a catch, and the first successful match is thrown to
;; it from the result handler.
;; For `xml-query-all*', the result is pushed into an output list.
(defun xml-query--compile-tag (tag subexp subloop-p)
`(when (and (consp v) (eq (car v) ',tag))
,(if subloop-p
`(dolist (v (cddr v))
,subexp)
subexp)))
(defun xml-query--compile-attrib (pair subexp subloop-p)
`(let ((value (cdr (assq ',(aref pair 0) (cadr v)))))
(when (equal value ,(aref pair 1))
,(if subloop-p
`(dolist (v (cddr v))
,subexp)
subexp))))
(defun xml-query--compile-keyword (keyword subexp)
(let ((attrib (intern (substring (symbol-name keyword) 1))))
`(let ((v (cdr (assq ',attrib (cadr v)))))
(when v
,subexp))))
(defun xml-query--compile-star (subexp)
`(when (and (stringp v) (string-match "[^ \t\r\n]" v))
,subexp))
(defun xml-query--compile-top (query input subexp)
(let* ((rquery (reverse query))
(prev nil))
(while rquery
(let ((matcher (pop rquery))
;; Should the next item loop over its children?
(subloop-p (and (not (null prev))
(not (keywordp prev))
(symbolp prev))))
(cond
((eq '* matcher)
(setf subexp (xml-query--compile-star subexp)))
((keywordp matcher)
(setf subexp (xml-query--compile-keyword matcher subexp)))
((symbolp matcher)
(setf subexp (xml-query--compile-tag matcher subexp subloop-p)))
((vectorp matcher)
(setf subexp (xml-query--compile-attrib matcher subexp subloop-p)))
((error "Bad query: %S" query)))
(setf prev matcher)))
`(dolist (v ,input)
,subexp)))
(defun xml-query--compile (query input)
(let ((tag (make-symbol "done")))
`(catch ',tag
,(xml-query--compile-top query input `(throw ',tag v)))))
(defmacro xml-query* (query sexp)
"Like `xml-query' but generate code to execute QUERY on SEXP.
Unlike `xml-query', QUERY must be a static, compile-time
s-expression. See `xml-query-all*' for more information.
QUERY is *not* evaluated, so it should not be quoted."
(xml-query--compile query sexp))
(defun xml-query-all--compile (query input)
(let ((output (make-symbol "output")))
`(let ((,output ()))
,(xml-query--compile-top query input `(push v ,output))
(nreverse ,output))))
(defmacro xml-query-all* (query sexp)
"Like `xml-query-all' but generate code to execute QUERY on SEXP.
Unlike `xml-query-all', QUERY must be a static, compile-time
s-expression. This macro compiles the query into actual code. The
result is faster since the query will be compiled into byte-code
rather than \"interpreted\" at run time.
Also unlike `xml-query-all', the parsed XML s-expression must
also have its namespace pre-stripped. This is accomplished by
setting the optional PARSE-NS argument of `xml-parse-region' to
symbol-qnames.
Sub-expression lists are not supported by this macro.
QUERY is *not* evaluated, so it should not be quoted."
(xml-query-all--compile query sexp))
(provide 'xml-query)
;;; xml-query.el ends here