From 71307deccc112811e1c204a5c030741298d53abd Mon Sep 17 00:00:00 2001 From: Micah Dubinko Date: Tue, 30 Dec 2008 05:52:39 +0000 Subject: [PATCH] RDFa library for XQuery git-svn-id: http://developer.marklogic.com/svn/commons/trunk@925 e04f4502-82db-0310-b1af-f799f365da79 --- rdfa/LICENSE.txt | 203 ++++++++++++++++++++++ rdfa/rdfa.xqy | 391 ++++++++++++++++++++++++++++++++++++++++++ rdfa/rdfa_extract.xqy | 17 ++ 3 files changed, 611 insertions(+) create mode 100644 rdfa/LICENSE.txt create mode 100644 rdfa/rdfa.xqy create mode 100644 rdfa/rdfa_extract.xqy diff --git a/rdfa/LICENSE.txt b/rdfa/LICENSE.txt new file mode 100644 index 0000000..6b0b127 --- /dev/null +++ b/rdfa/LICENSE.txt @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/rdfa/rdfa.xqy b/rdfa/rdfa.xqy new file mode 100644 index 0000000..5ccf313 --- /dev/null +++ b/rdfa/rdfa.xqy @@ -0,0 +1,391 @@ +xquery version "1.0"; + +(: + : Copyright (c) 2008 Mark Logic Corporation. All rights reserved. + : + : Licensed under the Apache License, Version 2.0 (the "License"); + : you may not use this file except in compliance with the License. + : You may obtain a copy of the License at + : + : http://www.apache.org/licenses/LICENSE-2.0 + : + : Unless required by applicable law or agreed to in writing, software + : distributed under the License is distributed on an "AS IS" BASIS, + : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + : See the License for the specific language governing permissions and + : limitations under the License. 
+ :) + +(: default element namespace="http://www.w3.org/1999/xhtml" :) + +module namespace ml = "http://marklogic.com/ns/rdfa-impl#"; +declare default function namespace "http://www.w3.org/2005/xpath-functions"; +declare namespace html = "http://www.w3.org/1999/xhtml"; + +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; +(: $dfvocab is the namespace applied to CURIEs that have no prefix; $default-base is the base URI of last resort; $htmlrels lists the unprefixed link types that ml:curie-is-valid accepts :) +declare variable $dfvocab := "http://www.w3.org/1999/xhtml/vocab#"; +declare variable $default-base := ("http://BASE.URI"); +declare variable $htmlrels := ( "alternate", + "appendix", + "bookmark", + "cite", + "chapter", + "contents", + "copyright", + "glossary", + "help", + "icon", + "index", + "last", + "license", + "meta", + "next", + "p3pv1", + "prev", + "role", + "section", + "start", + "stylesheet", + "subsection", + "up" ); + +(: we don't have a guarantee that base-uri will be set in non-database docs +, so we have to explicitly pass it in. The base URI used is html:head/html:base/@href when present, else a non-empty $url, else $default-base; every element of $doc is then visited and its @property, @rel, @rev and @typeof attributes are each handed to the matching function below :) +declare function ml:parse_rdfa($doc as node(), $url as xs:string) as element(rdf:RDF) { + + +{ + let $base := if ($doc//html:head/html:base/@href) + then $doc//html:head/html:base/@href + else if ($url) + then $url + else $default-base + for $node in $doc//* + return ( + if ($node/@property) + then ml:property($node, string($node/@property), $base) + else (), + + if ($node/@rel) + then ml:relrev($node, string($node/@rel), "rel", $base) + else (), + + if ($node/@rev) + then ml:relrev($node, string($node/@rev), "rev", $base) + else (), + + if ($node/@typeof) + then ml:typeof($node, string($node/@typeof), $base) + else () + ) +} + +}; + +(: for triples created from this very $node, what is the subject? 
:) +declare function ml:subject($node as node(), $base as xs:string) { (: precedence: @about, @src, the base URI for head/body, a "typeof" blank node for @typeof, otherwise whatever the parent chain establishes via ml:subject-ancestor :) + if ($node/@about) + then ml:safe-resolve-uri-or-curie($node/@about, $node, $base) + else if ($node/@src) + then ml:safe-resolve-uri($node/@src, $base) + else if (local-name($node) = ("head", "body")) + then $base + else if ($node/@typeof) + then ml:generate-bnode-id($node, "typeof") + else ml:subject-ancestor($node/.., $base) +}; + +(: looking up the ancestor chain, what is the subject? Checked in order on each ancestor: @resource, @href, a blank node for @rel/@rev, @about, @src, a "typeof" blank node for @typeof; otherwise recurse to the parent, and fall back to $base when there is no parent :) +declare function ml:subject-ancestor($node as node(), $base as xs:string) { + if ($node/@resource) + then ml:safe-resolve-uri-or-curie($node/@resource, $node, $base) + else if ($node/@href) + then ml:safe-resolve-uri($node/@href, $base) + else if ($node/(@rel | @rev)) + then ml:generate-bnode-id($node) + else if ($node/@about) + then ml:safe-resolve-uri-or-curie($node/@about, $node, $base) + else if ($node/@src) + then ml:safe-resolve-uri($node/@src, $base) + else if ($node/@typeof) + then ml:generate-bnode-id($node, "typeof") + else if ($node/..) 
+ then ml:subject-ancestor($node/.., $base) + else $base +}; +(: Handles @property: one result per whitespace-separated token of $val. The value is an XML literal (child nodes deep-copied) when @datatype expands to rdf:XMLLiteral, or when there is no @datatype and the node has non-text children; otherwise it is the string of @content if present, else of the node. A non-empty, non-XMLLiteral @datatype is expanded and emitted as rdf:datatype, and the nearest in-scope @xml:lang is copied onto the property element. NOTE(review): $prefix and $nsuri are bound but not used below, and "where 1" filters nothing - confirm they can be dropped :) +declare function ml:property($node as node(), $val as xs:string, $base as xs:string) as element()* { + for $prop in if (normalize-space($val) eq "") then () else tokenize($val, "\s+") + let $prefix := substring-before($prop, ":") + let $nsuri := namespace-uri-for-prefix($prefix, $node) + let $isXML := ($node/@datatype and ml:expand-curie($node/@datatype, $node) eq "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" or + (not($node/@datatype) and $node/node() and $node/(node() except text()) )) + let $effective-dt := if ($node/@datatype and $node/@datatype ne "") + then + if ($isXML) + then false() (: "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" :) + else ml:expand-curie(($node/@datatype,"")[1], $node) + else false() + let $lang := ($node/ancestor-or-self::*/@xml:lang)[position() eq last()] + let $subj := ml:subject($node, $base) + where 1 + return + if ($subj) + then + + { + if (starts-with($subj, "_")) + then attribute rdf:nodeID { $subj } + else attribute rdf:about { $subj }, + + element { ml:curie-to-qname($prop, $node) } + { + if ($effective-dt) + then attribute rdf:datatype {$effective-dt} + else (), + + $lang, + + (: proper XML Literal?? 
:) + if ($isXML) + then (attribute rdf:parseType { "Literal" } , for $n in $node/node() return ml:deep-copy($n) ) + else string(if ($node/@content) then $node/@content else $node) + } + } + + else () +}; +(: Dispatch for @rel/@rev; $relorrev is compared against "rel" downstream, any other value is treated as the reverse direction. With @resource or @href on the node the triple is completed immediately; otherwise hanging triples are produced from descendants :) +declare function ml:relrev($node as node(), $val as xs:string, $relorrev, $base as xs:string) as element()* { + if ($node/@resource or $node/@href) + then ml:relrev-immed($node, $val, $relorrev, $base) + else (ml:relrev-hanging($node, $val, $relorrev, $base), ml:relrev-hanging-bnode($node, $val, $relorrev, $base)) +}; + +(: Generate an immediate triple (or several, if @rel or @rev is a space-sep list). The object is @resource (URI or CURIE) if present, else @href; subject and object are swapped unless $relorrev is "rel". Tokens that fail ml:curie-is-valid are skipped. NOTE(review): $prefix is bound but not used :) +declare function ml:relrev-immed($node as node(), $val as xs:string, $relorrev, $base as xs:string) as element()* { + for $relv in if (normalize-space($val) eq "") then () else tokenize($val, "\s+") + let $prefix := substring-before($relv, ":") + let $locobj := if ($node/@resource) + then ml:safe-resolve-uri-or-curie($node/@resource, $node, $base) + else ml:safe-resolve-uri($node/@href, $base) + + let $locsbj := ml:subject($node, $base) + let $effective-sbj := if ($relorrev eq "rel") then $locsbj else $locobj + let $effective-obj := if ($relorrev eq "rel") then $locobj else $locsbj + where ml:curie-is-valid($relv, $node) + return + if ($locsbj and $locobj) + then + + { + if (starts-with($effective-sbj, "_")) + then attribute rdf:nodeID { $effective-sbj } + else attribute rdf:about { $effective-sbj }, + + element { ml:curie-to-qname($relv, $node) } + { + if (starts-with($effective-obj, "_")) + then attribute rdf:nodeID { $effective-obj } + else attribute rdf:resource { $effective-obj } + } + } + + else () +}; + +(: Generate potentially multiple, hanging triples: one per valid token and per node returned by ml:hanging-descendants. The far end is taken from the descendant's @about, @src or @typeof; a descendant with @rel/@rev (and none of those) yields no triple here; then @resource, @href, else a blank node. Subject and object are swapped unless $relorrev is "rel" :) +declare function ml:relrev-hanging($node as node(), $val as xs:string, $relorrev, $base as xs:string) as element()* { + for $relv in if (normalize-space($val) eq "") then () else tokenize($val, "\s+") + let $prefix := substring-before($relv, ":") + let $locsbj := ml:subject($node, $base) + for $tpl in 
ml:hanging-descendants($node) + let $locobj := if ($tpl/@about) + then ml:safe-resolve-uri-or-curie($tpl/@about, $tpl, $base) + else if ($tpl/@src) + then ml:safe-resolve-uri($tpl/@src, $base) + else if ($tpl/@typeof) + then ml:generate-bnode-id($tpl, "typeof") + else if ($tpl/(@rel | @rev)) + then () + else if ($tpl/@resource) + then ml:safe-resolve-uri-or-curie($tpl/@resource, $tpl, $base) + else if ($tpl/@href) + then ml:safe-resolve-uri($tpl/@href, $base) + else ml:generate-bnode-id($tpl) + let $effective-sbj := if ($relorrev eq "rel") then $locsbj else $locobj + let $effective-obj := if ($relorrev eq "rel") then $locobj else $locsbj + where ml:curie-is-valid($relv, $node) + return + if ($locsbj and $locobj) + then + + { + if (starts-with($effective-sbj, "_")) + then attribute rdf:nodeID { $effective-sbj } + else attribute rdf:about { $effective-sbj }, + + element { ml:curie-to-qname($relv, $node) } + { + if (starts-with($effective-obj, "_")) + then attribute rdf:nodeID { $effective-obj } + else attribute rdf:resource { $effective-obj } + } + } + + else () +}; + +(: this is to generate the 1 (or none) bnode reference that all @rel/@rev/@property completers share; the far end is always ml:generate-bnode-id($node), and nothing is emitted unless ml:hanging-bnode($node) finds at least one completing descendant :) +declare function ml:relrev-hanging-bnode($node as node(), $val as xs:string, $relorrev, $base as xs:string) as element()* { + for $relv in if (normalize-space($val) eq "") then () else tokenize($val, "\s+") + let $locsbj := ml:subject($node, $base) + let $locobj := ml:generate-bnode-id($node) + let $effective-sbj := if ($relorrev eq "rel") then $locsbj else $locobj + let $effective-obj := if ($relorrev eq "rel") then $locobj else $locsbj + where ml:curie-is-valid($relv, $node) + return + if (ml:hanging-bnode($node)) + then + { + if (starts-with($effective-sbj, "_")) + then attribute rdf:nodeID { $effective-sbj } + else attribute rdf:about { $effective-sbj }, + + element { ml:curie-to-qname($relv, $node) } + { + if (starts-with($effective-obj, "_")) + then attribute rdf:nodeID { $effective-obj } + 
 else attribute rdf:resource { $effective-obj } + } + } + + else () +}; +(: Descendants of $node carrying @src, @about, @typeof, @href or @resource, keeping only those with no such attribute on any ancestor that lies below $node :) +declare function ml:hanging-descendants($node as node()) as node()* { + (: find all descendant nodes with hanging-triple-completing-via-new-node attributes... :) + $node//*[@src or @about or @typeof or @href or @resource] + [count(($node//* intersect ./ancestor::*)/(@src | @about | @typeof | @href | @resource)) eq 0] + (: but exclude stuff we've already seen, and stuff more than one level deep + (the deeper stuff is "yet to be seen") :) +}; +(: Descendants of $node carrying @rel, @rev or @property, keeping only those with no such attribute on any ancestor that lies below $node :) +declare function ml:hanging-bnode($node as node()) as node()* { + (: find all descendant nodes with hanging-triple-completing-via-the-same-bnode attributes... :) + $node//*[@rel or @rev or @property] + [count(($node//* intersect ./ancestor::*)/(@rel | @rev | @property)) eq 0] + (: but exclude stuff we've already seen, and stuff more than one level deep + (the deeper stuff is "yet to be seen") :) +}; +(: Handles @typeof: one result per whitespace-separated token of $val, each expanded as a CURIE. Subject precedence: @about, @src, the base URI for head/body, @resource or @href when the node has no @rel/@rev, else a "typeof" blank node :) +declare function ml:typeof($node as node(), $val as xs:string, $base as xs:string) as element()* { + for $type in if (normalize-space($val) eq "") then () else tokenize($val, "\s+") + let $locsbj := if ($node/@about) + then ml:safe-resolve-uri-or-curie($node/@about, $node, $base) + else if ($node/@src) + then ml:safe-resolve-uri($node/@src, $base) + else if (local-name($node) = ("head", "body")) + then $base + else if ($node/@resource and not($node/(@rel | @rev))) + then ml:safe-resolve-uri-or-curie($node/@resource, $node, $base) + else if ($node/@href and not($node/(@rel | @rev))) + then ml:safe-resolve-uri($node/@href, $base) + else ml:generate-bnode-id($node, "typeof") + let $rsc := ml:expand-curie($type, $node) + return + if ($locsbj and $rsc) + then + + { + if (starts-with($locsbj, "_")) + then attribute rdf:nodeID { $locsbj } + else attribute rdf:about { $locsbj } + } + + + else () +}; + +(: compensate for the lack of generate-id function (XSLT has it good here!) 
:) +declare function ml:generate-id($node as element()) as xs:string { (: id is "node" + number of preceding elements + local name + number of ancestor elements :) + concat("node", count($node/preceding::*), local-name($node), count($node/ancestor::*) ) +}; +(: One-argument form: blank-node id with an empty suffix :) +declare function ml:generate-bnode-id($node as element()) as xs:string { + ml:generate-bnode-id($node, "") +}; +(: Blank-node id: "_:b" followed by ml:generate-id($node) and then $extra :) +declare function ml:generate-bnode-id($node as element(), $extra as xs:string) as xs:string { + concat("_:b", ml:generate-id($node), $extra) +}; + +(: curie parts: 1:prefix, 2:suffix, 3:uri 3 will be missing for invalid CURIEs. An empty prefix maps to $dfvocab; any other prefix is looked up in the namespaces in scope on $context :) +declare function ml:curie-parse($curie as xs:string, $context as element()) as xs:string* { + let $prefix := substring-before($curie, ":") + let $nsuri := if ($prefix eq "") then $dfvocab else namespace-uri-for-prefix($prefix, $context) + let $suffix := if ($nsuri eq $dfvocab) + then if (starts-with($curie, ":")) + then substring-after($curie, ":") + else $curie + else substring-after($curie, ":") + return ($prefix, $suffix, $nsuri) +}; +(: True when the prefix is "_", or the CURIE falls in $dfvocab with a suffix listed in $htmlrels, or it has a non-empty prefix that resolved to a non-empty namespace URI :) +declare function ml:curie-is-valid($curie as xs:string, $context as element()) as xs:boolean { + let $parts := ml:curie-parse($curie, $context) + return ($parts[1] eq "_") or ($parts[3] eq $dfvocab and $parts[2] = $htmlrels) or ($parts[1] ne "" and $parts[3] ne "") +}; +(: Returns namespace URI followed by suffix; a CURIE whose prefix is "_" is returned unchanged :) +declare function ml:expand-curie($curie as xs:string, $context as element()) as xs:string { + let $parts := ml:curie-parse($curie, $context) + return if ($parts[1] eq "_") then $curie else concat($parts[3], $parts[2]) +}; + +(: RDF serialization requires a qname, which might not match exactly with a CURIE. The local name is whatever follows the last '#', '|' or '/' of the expanded CURIE and the text before it becomes the namespace. NOTE(review): $elem-part is spliced into a regex unescaped - confirm local names never contain regex metacharacters :) +declare function ml:curie-to-qname($curie as xs:string, $context as element()) as xs:QName? { + let $expanded := ml:expand-curie($curie, $context) + let $elem-part := replace ($expanded , '^.*[#|/]','') + let $ns-part := replace($expanded, concat('^(.*)', $elem-part,'.*'),'$1') + return fn:QName($ns-part, $elem-part) +}; + +(: there is some spec ambiguity on how fn:resolve-uri() should behave with a +zero-length input. 
We sidestep it by handling it explicitly here :) +declare function ml:safe-resolve-uri($rel as xs:string, $base as xs:string) as xs:string { (: "" yields $base, a reference starting with "#" is appended to $base, anything else goes through resolve-uri :) + if ($rel eq "") + then $base + else if (starts-with($rel, "#")) + then concat($base, $rel) + else resolve-uri($rel, $base) +}; +(: A value wrapped in [ ] is treated as a CURIE: expanded when ml:curie-is-valid accepts it, the empty sequence otherwise. Any other value is resolved as a URI against $base :) +declare function ml:safe-resolve-uri-or-curie($val as xs:string, $context as element(), $base as xs:string) as xs:string? { + if (starts-with($val, "[") and ends-with($val, "]")) + then let $curie := substring-after(substring-before($val, "]"), "[") + return if (ml:curie-is-valid($curie, $context)) + then ml:expand-curie($curie, $context) + else () + else ml:safe-resolve-uri($val, $base) +}; + +(: return a deep copy of the node and all children: elements are rebuilt with the same name and attributes and their element children are copied recursively; every other kind of node is returned as-is :) +declare function ml:deep-copy($node as node()) as node() { + + typeswitch($node) + case element() return + element { node-name($node) } + { + $node/@*, + for $child in $node/node() + return + if ($child instance of element()) + then ml:deep-copy($child) + else $child + } + case attribute() return $node + case text() return $node + default return $node +}; + diff --git a/rdfa/rdfa_extract.xqy b/rdfa/rdfa_extract.xqy new file mode 100644 index 0000000..30e2a03 --- /dev/null +++ b/rdfa/rdfa_extract.xqy @@ -0,0 +1,17 @@ +xquery version "1.0-ml"; + +(: Copyright 2002-2008 Mark Logic Corporation. All Rights Reserved. :) + +import module namespace rdfa = "http://marklogic.com/ns/rdfa-impl#" at "rdfa.xqy"; + +declare variable $url := xdmp:get-request-field('url'); +(: Fetches the document named by the "url" request field, sets an application/rdf+xml Content-type header and returns the result of rdfa:parse_rdfa for it. NOTE(review): $url comes straight from the request and is passed to xdmp:document-get - confirm it is validated or restricted before deployment :) +let $doc := xdmp:document-get($url, + + xml + full + ) +return ( +xdmp:add-response-header("Content-type", "application/rdf+xml"), +rdfa:parse_rdfa($doc, $url) +) \ No newline at end of file