From 7f7d490c3a44ac7f54326c64cbfcd7d9cc856111 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Feb 2015 16:17:49 -0500 Subject: [PATCH 01/22] Renaming plugin to JsonExport --- ckanext/datajson/__init__.py | 2 +- ckanext/datajson/build_datajsonld.py | 8 +++---- ckanext/datajson/plugin.py | 34 ++++++++++++++-------------- setup.py | 2 +- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/ckanext/datajson/__init__.py b/ckanext/datajson/__init__.py index f20145e9..d5261f69 100644 --- a/ckanext/datajson/__init__.py +++ b/ckanext/datajson/__init__.py @@ -6,6 +6,6 @@ import pkgutil __path__ = pkgutil.extend_path(__path__, __name__) -from plugin import DataJsonPlugin +from plugin import JsonExportPlugin from harvester_datajson import DataJsonHarvester from harvester_cmsdatanavigator import CmsDataNavigatorHarvester diff --git a/ckanext/datajson/build_datajsonld.py b/ckanext/datajson/build_datajsonld.py index 9f0c073d..fb88f6dc 100644 --- a/ckanext/datajson/build_datajsonld.py +++ b/ckanext/datajson/build_datajsonld.py @@ -4,10 +4,10 @@ from sqlalchemy.util import OrderedDict def dataset_to_jsonld(dataset): - from plugin import DataJsonPlugin + from plugin import JsonExportPlugin ret = OrderedDict([ - ("@id", DataJsonPlugin.site_url + "/dataset/" + dataset["identifier"]), + ("@id", JsonExportPlugin.site_url + "/dataset/" + dataset["identifier"]), ("@type", "dcat:Dataset"), ]) @@ -20,9 +20,9 @@ def dataset_to_jsonld(dataset): return ret def distribution_to_jsonld(distribution): - from plugin import DataJsonPlugin + from plugin import JsonExportPlugin ret = OrderedDict([ - ("@id", DataJsonPlugin.site_url + "/resource/" + distribution["identifier"]), + ("@id", JsonExportPlugin.site_url + "/resource/" + distribution["identifier"]), ("@type", "dcat:Distribution"), ]) apply_jsonld_metadata_mapping(distribution, ret) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 99d5d403..c755c377 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -38,7 +38,7 @@ def get_validator(): from build_datajsonld import dataset_to_jsonld -class DataJsonPlugin(p.SingletonPlugin): +class JsonExportPlugin(p.SingletonPlugin): p.implements(p.interfaces.IConfigurer) p.implements(p.interfaces.IRoutes, inherit=True) @@ -48,14 +48,14 @@ def update_config(self, config): # to know how to set the paths. 
# TODO commenting out enterprise data inventory for right now - # DataJsonPlugin.route_edata_path = config.get("ckanext.enterprisedatajson.path", "/enterprisedata.json") - DataJsonPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True' - DataJsonPlugin.route_path = config.get("ckanext.datajson.path", "/data.json") - DataJsonPlugin.route_ld_path = config.get("ckanext.datajsonld.path", - re.sub(r"\.json$", ".jsonld", DataJsonPlugin.route_path)) - DataJsonPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url")) - DataJsonPlugin.ld_title = config.get("ckan.site_title", "Catalog") - DataJsonPlugin.site_url = config.get("ckan.site_url") + # JsonExportPlugin.route_edata_path = config.get("ckanext.enterprisedatajson.path", "/enterprisedata.json") + JsonExportPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True' + JsonExportPlugin.route_path = config.get("ckanext.datajson.path", "/data.json") + JsonExportPlugin.route_ld_path = config.get("ckanext.datajsonld.path", + re.sub(r"\.json$", ".jsonld", JsonExportPlugin.route_path)) + JsonExportPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url")) + JsonExportPlugin.ld_title = config.get("ckan.site_title", "Catalog") + JsonExportPlugin.site_url = config.get("ckan.site_url") # Adds our local templates directory. It's smart. It knows it's # relative to the path of *this* file. Wow. @@ -65,13 +65,13 @@ def before_map(self, m): return m def after_map(self, m): - if DataJsonPlugin.route_enabled: + if JsonExportPlugin.route_enabled: # /data.json and /data.jsonld (or other path as configured by user) - m.connect('datajson', DataJsonPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController', + m.connect('datajson', JsonExportPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_json') # TODO commenting out enterprise data inventory for right now - # m.connect('enterprisedatajson', DataJsonPlugin.route_edata_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_enterprise') - #m.connect('datajsonld', DataJsonPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') + # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_enterprise') + #m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') # TODO DWC update action # /data/{org}/data.json @@ -112,11 +112,11 @@ def generate_output(self, format): ("foaf", "http://xmlns.com/foaf/0.1/"), ]) ), - ("@id", DataJsonPlugin.ld_id), + ("@id", JsonExportPlugin.ld_id), ("@type", "dcat:Catalog"), - ("dcterms:title", DataJsonPlugin.ld_title), - ("rdfs:label", DataJsonPlugin.ld_title), - ("foaf:homepage", DataJsonPlugin.site_url), + ("dcterms:title", JsonExportPlugin.ld_title), + ("rdfs:label", JsonExportPlugin.ld_title), + ("foaf:homepage", JsonExportPlugin.site_url), ("dcat:dataset", [dataset_to_jsonld(d) for d in data]), ]) diff --git a/setup.py b/setup.py index 4431576f..ae35f925 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ entry_points=\ """ [ckan.plugins] - datajson=ckanext.datajson:DataJsonPlugin + datajson=ckanext.datajson:JsonExportPlugin datajson_harvest=ckanext.datajson:DataJsonHarvester cmsdatanav_harvest=ckanext.datajson:CmsDataNavigatorHarvester """, From d4d7cbd2a62aa096e8c2f54bf12e9889b03bab9e Mon Sep 17 
00:00:00 2001 From: Alex Perfilov Date: Wed, 25 Feb 2015 15:34:40 -0500 Subject: [PATCH 02/22] merge schemas --- ckanext/datajson/plugin.py | 2 +- .../federal-v1.1/catalog.json | 0 .../federal-v1.1/dataset.json | 10 +- .../pod_schema/non-federal-v1.1/catalog.json | 58 ++ .../non-federal-v1.1/dataset-non-federal.json | 569 ++++++++++++++++++ .../pod_schema/non-federal/single_entry.json | 415 +++++++++++++ ckanext/datajson/pod_schema/single_entry.json | 416 +++++++++++++ .../schema/1_0_final/single_entry.json | 207 ------- 8 files changed, 1464 insertions(+), 213 deletions(-) rename ckanext/datajson/{schema => pod_schema}/federal-v1.1/catalog.json (100%) rename ckanext/datajson/{schema => pod_schema}/federal-v1.1/dataset.json (99%) create mode 100644 ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json create mode 100644 ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json create mode 100644 ckanext/datajson/pod_schema/non-federal/single_entry.json create mode 100644 ckanext/datajson/pod_schema/single_entry.json delete mode 100644 ckanext/datajson/schema/1_0_final/single_entry.json diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index c755c377..c2747c99 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -16,7 +16,7 @@ def get_validator(): import os from jsonschema import Draft4Validator, FormatChecker - schema_path = os.path.join(os.path.dirname(__file__), 'schema', 'federal-v1.1', 'dataset.json') + schema_path = os.path.join(os.path.dirname(__file__), 'pod_schema', 'federal-v1.1', 'dataset.json') with open(schema_path, 'r') as file: schema = json.loads(file.read()) return Draft4Validator(schema, format_checker=FormatChecker()) diff --git a/ckanext/datajson/schema/federal-v1.1/catalog.json b/ckanext/datajson/pod_schema/federal-v1.1/catalog.json similarity index 100% rename from ckanext/datajson/schema/federal-v1.1/catalog.json rename to ckanext/datajson/pod_schema/federal-v1.1/catalog.json diff --git a/ckanext/datajson/schema/federal-v1.1/dataset.json b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json similarity index 99% rename from ckanext/datajson/schema/federal-v1.1/dataset.json rename to ckanext/datajson/pod_schema/federal-v1.1/dataset.json index 8fdfce22..06fb984c 100644 --- a/ckanext/datajson/schema/federal-v1.1/dataset.json +++ b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json @@ -365,7 +365,7 @@ "title": { "title": "Title", "description": "Human-readable name of the asset. 
Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string", + "type": "string", "minLength": 1 } }, @@ -387,12 +387,12 @@ "enum": [ "vcard:Contact" ] - }, + }, "fn": { "title": "Contact Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "hasEmail": { "title": "Email", @@ -446,7 +446,7 @@ { "type": "null" } - ] + ] }, "format": { "title": "Format", @@ -562,7 +562,7 @@ "title": "Publisher Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "subOrganizationOf": { "title": "Parent Organization", diff --git a/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json b/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json new file mode 100644 index 00000000..95fcd75c --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json @@ -0,0 +1,58 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/catalog.json#", + "title": "Project Open Data Catalog", + "description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.", + "type": "object", + "dependencies": { + "@type": [ + "@context" + ] + }, + "required": [ + "conformsTo", + "dataset" + ], + "properties": { + "@context": { + "title": "Metadata Context", + "description": "URL or JSON object for the JSON-LD Context that defines the schema used", + "type": "string", + "format": "uri" + }, + "@id": { + "title": "Metadata Catalog ID", + "description": "IRI for the JSON-LD Node Identifier of the Catalog. This should be the URL of the data.json file itself.", + "type": "string", + "format": "uri" + }, + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be dcat:Catalog for the Catalog", + "enum": [ + "dcat:Catalog" + ] + }, + "conformsTo": { + "description": "Version of Schema", + "title": "Version of Schema", + "enum": [ + "https://project-open-data.cio.gov/v1.1/schema" + ] + }, + "describedBy": { + "description": "URL for the JSON Schema file that defines the schema used", + "title": "Data Dictionary", + "type": "string", + "format": "uri" + }, + "dataset": { + "type": "array", + "items": { + "$ref": "dataset-non-federal.json", + "minItems": 1, + "uniqueItems": true + } + } + } +} \ No newline at end of file diff --git a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json new file mode 100644 index 00000000..b0a7f846 --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json @@ -0,0 +1,569 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/dataset-non-federal.json#", + "title": "Project Open Data Dataset", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "title", + "description", + "publisher", + "contactPoint", + "identifier", + "accessLevel" + ], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. 
This should be dcat:Dataset for each Dataset", + "enum": [ + "dcat:Dataset" + ] + }, + "accessLevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "rights": { + "title": "Rights", + "description": "This may include information regarding access or restrictions based on privacy, security, or other policies. This should also provide an explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data asset is not \"public,\" if applicable. Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accrualPeriodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "irregular" + ] + }, + { + "type": "string", + "pattern": "^R\\/P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "null" + } + ] + }, + "bureauCode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + "contactPoint": { + "$ref": "#/definitions/vcard-non-federal" + }, + "describedBy": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedByType": { + "title": "Data Dictionary Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + }, + "conformsTo": { + "title": "Data Standard", + "description": "URI used to identify a standardized specification the dataset conforms to", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataQuality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string", + "minLength": 1 + }, + "distribution": { + "title": "Distribution", + "description": "A container for the array of Distribution objects", + "anyOf": [ + { + "type": "array", + "items": { + "$ref": "distribution.json", + "minItems": 1, + "uniqueItems": true + } + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "minLength": 1 + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + "landingPage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + 
"title": "License", + "description": "The license dataset or API is published with. See Open Licenses for more information.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + } + ] + }, + "primaryITInvestmentUII": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programCode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + "publisher": { + "$ref": "organization.json" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemOfRecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "null" + } + ] + }, + "isPartOf": { + "title": "Collection", + "description": "The collection of which the dataset is a subset", + "anyOf": [ + { + "type": "string", + "minLength": 1 + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string", + "minLength": 1 + } + }, + "definitions": { + "vcard-non-federal": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/vcard-non-federal.json#", + "title": "Project Open Data ContactPoint vCard", + "description": "A Dataset ContactPoint as a vCard object", + "type": "object", + "required": [ + "fn" + ], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. 
This should be vcard:Contact for contactPoint", + "enum": [ + "vcard:Contact" + ] + }, + "fn": { + "title": "Contact Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "hasEmail": { + "title": "Email", + "description": "Email address for the contact", + "pattern": "^mailto:([\\w.-]+@[\\w.-]+\\.[\\w.-]+)?$", + "type": "string" + } + } + }, + "distribution": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#", + "title": "Project Open Data Distribution", + "description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.", + "type": "object", + "dependencies": { + "downloadURL": { + "properties": { + "mediaType": { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + } + }, + "required": [ + "mediaType" + ] + } + }, + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be dcat:Distribution for each Distribution", + "enum": [ + "dcat:Distribution" + ] + }, + "downloadURL": { + "title": "Download URL", + "description": "URL providing direct access to a downloadable file of a dataset", + "type": "string", + "format": "uri" + }, + "mediaType": { + "title": "Media Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s downloadURL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + }, + "format": { + "title": "Format", + "description": "A human-readable description of the file format of a distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "accessURL": { + "title": "Access URL", + "description": "URL providing indirect access to a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "conformsTo": { + "title": "Data Standard", + "description": "URL providing indirect access to a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedBy": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the distribution found at the downloadURL", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedByType": { + "title": "Data Dictionary Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + } + } + }, + "organization": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#", + "title": "Project Open Data Organization", + "description": "A Dataset Publisher Organization as a foaf:Agent object", + "type": "object", + "required": [ + "name" + ], + 
"properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be org:Organization for each publisher", + "enum": [ + "org:Organization" + ] + }, + "name": { + "title": "Publisher Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "subOrganizationOf": { + "title": "Parent Organization", + "$ref": "organization.json" + } + } + } + } +} \ No newline at end of file diff --git a/ckanext/datajson/pod_schema/non-federal/single_entry.json b/ckanext/datajson/pod_schema/non-federal/single_entry.json new file mode 100644 index 00000000..4ab4b311 --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal/single_entry.json @@ -0,0 +1,415 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": ["title", "description", "license", "publisher", "contactPoint", "identifier", "accessLevel"], + "properties": { + "accessLevel": { + "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": ["public", "restricted public", "non-public"] + }, + "accessLevelComment": { + "title":"Access Level Comment", + "description":"An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength":255 + }, + { + "type": "null" + } + ] + }, + "accrualPeriodicity": { + "title":"Frequency", + "description":"Frequency with which dataset is published.", + "anyOf": [ + { + "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", + "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", + "Three times a year", "Weekly", "Completely irregular"] + }, + { + "type": "null" + } + ] + }, + "bureauCode": { + "title":"Bureau Code", + "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "contactPoint": { + "title":"Contact Name", + "description":"Contact person’s name for the asset.", + "type": "string" + }, + "dataDictionary": { + "title":"Data Dictionary", + "description":"URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataQuality": { + "title":"Data Quality", + "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title" : "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title":"Distribution", + "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": ["accessURL", "format"], + "properties": { + "accessURL": { + "title":"Download URL", + "description":"URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title":"Format", + "description":"The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title":"Unique Identifier", + "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title":"Release Date", + "description":"Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": 
"^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + + }, + "landingPage": { + "title":"Homepage URL", + "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title":"Language", + "description":"The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title":"License", + "description":"The license dataset or API is published with. 
See Open Licenses for more information.", + "type": "string", + "minLength": 1 + }, + "mbox": { + "title":"Contact Email", + "description":"Contact person’s email address.", + "anyOf": [ + { + "type": "string", + "format": "email" + }, + { + "type": "null" + }, + { + "type": "string" + } + ] + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "PrimaryITInvestmentUII": { + "title":"Primary IT Investment UII", + "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programCode": { + "title":"Program Code", + "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. 
Use the format of 015:001", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "publisher": { + "title":"Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title":"Related Documents", + "description":"Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title":"Spatial", + "description":"The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemOfRecords": { + "title":"System of Records", + "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title":"Temporal", + "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + 
"type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title":"Category", + "description":"Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webService": { + "title":"Endpoint", + "description":"Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + } + } +} diff --git a/ckanext/datajson/pod_schema/single_entry.json b/ckanext/datajson/pod_schema/single_entry.json new file mode 100644 index 00000000..52dcda77 --- /dev/null +++ b/ckanext/datajson/pod_schema/single_entry.json @@ -0,0 +1,416 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": ["bureaucode", "programcode", "title", "description", "keyword", "modified", "publisher", "contactpoint", "mbox", "identifier", "accesslevel"], + "properties": { + "accesslevel": { + "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": ["public", "restricted public", "non-public"] + }, + "accesslevelcomment": { + "title":"Access Level Comment", + "description":"An explanation for the selected \"accesslevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. 
Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength":255 + }, + { + "type": "null" + } + ] + }, + "accessurl": { + "title":"Download URL", + "description":"URL providing direct access to the downloadable distribution of a dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "accrualperiodicity": { + "title":"Frequency", + "description":"Frequency with which dataset is published.", + "anyOf": [ + { + "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", + "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", + "Three times a year", "Weekly", "Completely irregular"] + }, + { + "type": "null" + } + ] + }, + "bureaucode": { + "title":"Bureau Code", + "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + "contactpoint": { + "title":"Contact Name", + "description":"Contact person’s name for the asset.", + "type": "string" + }, + "datadictionary": { + "title":"Data Dictionary", + "description":"URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataquality": { + "title":"Data Quality", + "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title" : "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title":"Distribution", + "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": ["accessurl", "format"], + "properties": { + "accessurl": { + "title":"Download URL", + "description":"URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title":"Format", + "description":"The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "format": { + "title":"Format", + "description":"The file format or API type of the distribution.", + "anyOf": [ + { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title":"Unique Identifier", + "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title":"Release Date", + "description":"Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + "landingpage": { + "title":"Homepage URL", + "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title":"Language", + "description":"The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": 
"^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title":"License", + "description":"The license dataset or API is published with. See Open Licenses for more information.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "mbox": { + "title":"Contact Email", + "description":"Contact person’s email address.", + "type": "string", + "format": "email" + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "primaryitinvestmentuii": { + "title":"Primary IT Investment UII", + "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programcode": { + "title":"Program Code", + "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + "publisher": { + "title":"Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title":"Related Documents", + "description":"Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title":"Spatial", + "description":"The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemofrecords": { + "title":"System of Records", + "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title":"Temporal", + "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title":"Category", + "description":"Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webservice": { + "title":"Endpoint", + "description":"Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + } + } +} diff --git a/ckanext/datajson/schema/1_0_final/single_entry.json b/ckanext/datajson/schema/1_0_final/single_entry.json deleted file mode 100644 index 4567f43c..00000000 --- a/ckanext/datajson/schema/1_0_final/single_entry.json +++ /dev/null @@ -1,207 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", - "title": "Common Core Metadata Schema", - "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", - "type": "object", - "required": ["title", "description", "keyword", "modified", "publisher", "contactPoint", "mbox", "identifier", "accessLevel"], - "properties": { - "accessLevel": { - "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. 
Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", - "title": "Public Access Level", - "enum": ["public", "restricted public", "non-public"] - }, - "accessLevelComment": { - "title":"Access Level Comment", - "description":"An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", - "type": "string", - "maxLength":255 - }, - "accrualPeriodicity": { - "title":"Frequency", - "description":"Frequency with which dataset is published.", - "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", - "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", - "Three times a year", "Weekly", "Completely irregular"] - }, - "bureauCode": { - "title":"Bureau Code", - "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{2}" - }, - "minItems": 1, - "uniqueItems": true - }, - "contactPoint": { - "title":"Contact Name", - "description":"Contact person’s name for the asset.", - "type": "string" - }, - "dataDictionary": { - "title":"Data Dictionary", - "description":"URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", - "type": "string", - "format": "uri" - }, - "dataQuality": { - "title":"Data Quality", - "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", - "type": "boolean" - }, - "description": { - "title" : "Description", - "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", - "type": "string" - }, - "distribution": { - "title":"Distribution", - "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", - "type": "array", - "items": { - "type": "object", - "properties": { - "accessURL": { - "title":"Download URL", - "description":"URL providing direct access to the downloadable distribution of a dataset.", - "type": "string", - "format": "uri" - }, - "format": { - "title":"Format", - "description":"The file format or API type of the distribution.", - "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", - "type": "string" - } - } - }, - "minItems": 1, - "uniqueItems": true - }, - "identifier": { - "title":"Unique Identifier", - "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", - "type": "string", - "pattern": "[\\w]+" - }, - "issued": { - "title":"Release Date", - "description":"Date of formal issuance.", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "keyword": { - "title": "Tags", - 
"description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "uniqueItems": true - }, - "landingPage": { - "title":"Homepage URL", - "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", - "type": "string", - "format": "uri" - }, - "language": { - "title":"Language", - "description":"The language of the dataset.", - "type": "array", - "items": { - "type": "string", - "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" - } - }, - "license": { - "title":"License", - "description":"The license dataset or API is published with. See Open Licenses for more information.", - "type": "string" - }, - "mbox": { - "title":"Contact Email", - "description":"Contact person’s email address.", - "type": "string", - "format": "email" - }, - "modified": { - "title": "Last Update", - "description": "Most recent date on which the dataset was changed, updated or modified.", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "PrimaryITInvestmentUII": { - "title":"Primary IT Investment UII", - "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", - "type": "string" - }, - "programCode": { - "title":"Program Code", - "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{3}" - }, - "minItems": 1, - "uniqueItems": true - }, - "publisher": { - "title":"Publisher", - "description": "The publishing entity.", - "type": "string" - }, - "references": { - "title":"Related Documents", - "description":"Related documents such as technical information about a dataset, developer documentation, etc.", - "type": "array", - "items": { - "type": "string", - "format": "uri" - }, - "minItems": 1, - "uniqueItems": true - }, - "spatial": { - "title":"Spatial", - "description":"The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", - "type": "string" - }, - "systemOfRecords": { - "title":"System of Records", - "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", - "type": "string" - }, - "temporal": { - "title":"Temporal", - "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "theme": { - "title":"Category", - "description":"Main thematic category of the dataset.", - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "uniqueItems": true - }, - "title": { - "title": "Title", - "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string" - }, - "webService": { - "title":"Endpoint", - "description":"Endpoint of web service to access dataset.", - "type": "string", - "format": "uri" - } - } -} From 7d6f7acf8afb3256dbda140406d63b51ccafa1f4 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Wed, 25 Feb 2015 16:10:32 -0500 Subject: [PATCH 03/22] renaming inventory datajson plugin to datajson_export --- ckanext/datajson/plugin.py | 43 +++++++++++++++++++++++--------------- setup.py | 3 ++- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index c2747c99..0daae0a9 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -1,13 +1,14 @@ -import ckan.plugins as p +import json +import logging +import StringIO +import ckan.plugins as p from ckan.lib.base import BaseController, render, c import ckan.model as model from pylons import request, response import ckan.lib.dictization.model_dictize as model_dictize -import json, re -import logging +import re from jsonschema.exceptions import best_match -import StringIO logger = logging.getLogger('datajson') @@ -38,6 +39,11 @@ def get_validator(): from build_datajsonld import dataset_to_jsonld +class DataJsonPlugin(p.SingletonPlugin): + p.implements(p.interfaces.IConfigurer) + p.implements(p.interfaces.IRoutes, inherit=True) + + class JsonExportPlugin(p.SingletonPlugin): p.implements(p.interfaces.IConfigurer) p.implements(p.interfaces.IRoutes, inherit=True) @@ -52,7 +58,7 @@ def update_config(self, config): JsonExportPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True' JsonExportPlugin.route_path = config.get("ckanext.datajson.path", "/data.json") JsonExportPlugin.route_ld_path = config.get("ckanext.datajsonld.path", - re.sub(r"\.json$", ".jsonld", JsonExportPlugin.route_path)) + re.sub(r"\.json$", ".jsonld", JsonExportPlugin.route_path)) JsonExportPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url")) 
JsonExportPlugin.ld_title = config.get("ckan.site_title", "Catalog") JsonExportPlugin.site_url = config.get("ckan.site_url") @@ -71,7 +77,7 @@ def after_map(self, m): action='generate_json') # TODO commenting out enterprise data inventory for right now # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_enterprise') - #m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') + # m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') # TODO DWC update action # /data/{org}/data.json @@ -147,7 +153,8 @@ def validator(self): e) + ". Try using JSONLint.com."])) except Exception as e: c.errors.append(( - "Internal Error", ["Something bad happened while trying to load and parse the file: " + unicode(e)])) + "Internal Error", + ["Something bad happened while trying to load and parse the file: " + unicode(e)])) if body: try: @@ -163,8 +170,8 @@ def generate_pdl(self): # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/data.json", request.path) - #If user is not editor or admin of the organization then don't allow pdl download - if p.toolkit.check_access('package_create', {'model': model,'user':c.user}, {'owner_org': match.group(1)}): + # If user is not editor or admin of the organization then don't allow pdl download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): if match: # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -179,8 +186,8 @@ def generate_edi(self): # DWC this is a hack, as I couldn't get to the request parameters. 
For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/edi.json", request.path) - #If user is not editor or admin of the organization then don't allow edi download - if p.toolkit.check_access('package_create', {'model': model,'user':c.user}, {'owner_org': match.group(1)}): + # If user is not editor or admin of the organization then don't allow edi download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): if match: # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -227,7 +234,7 @@ def make_edi(owner_org): output = [] for pkg in packages: - #if pkg['owner_org'] == owner_org: + # if pkg['owner_org'] == owner_org: datajson_entry = make_datajson_entry(pkg) if datajson_entry and is_valid(datajson_entry): output.append(datajson_entry) @@ -241,7 +248,7 @@ def make_edi(owner_org): logger.removeHandler(eh) stream.close() - #return json.dumps(output) + # return json.dumps(output) return write_zip(output, error, zip_name='edi') @@ -258,7 +265,7 @@ def make_pdl(owner_org): packages = get_packages(owner_org) output = [] - #Create data.json only using public datasets, datasets marked non-public are not exposed + # Create data.json only using public datasets, datasets marked non-public are not exposed for pkg in packages: extras = dict([(x['key'], x['value']) for x in pkg['extras']]) try: @@ -281,13 +288,14 @@ def make_pdl(owner_org): logger.removeHandler(eh) stream.close() - #return json.dumps(output) + # return json.dumps(output) return write_zip(output, error, zip_name='pdl') + def get_packages(owner_org): # Build the data.json file. packages = get_all_group_packages(group_id=owner_org) - #get packages for sub-agencies. + # get packages for sub-agencies. sub_agency = model.Group.get(owner_org) if 'sub-agencies' in sub_agency.extras.col.target and \ sub_agency.extras.col.target['sub-agencies'].state == 'active': @@ -300,6 +308,7 @@ def get_packages(owner_org): return packages + def get_all_group_packages(group_id): """ Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages. 
@@ -337,7 +346,7 @@ def write_zip(data, error=None, zip_name='data'): if data: zf.writestr('data.json', json.dumps(make_datajson_catalog(data), ensure_ascii=False).encode('utf8')) - #Write the error log + # Write the error log if error: zf.writestr('errorlog.txt', error.encode('utf8')) diff --git a/setup.py b/setup.py index ae35f925..09c5f542 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ entry_points=\ """ [ckan.plugins] - datajson=ckanext.datajson:JsonExportPlugin + datajson=ckanext.datajson:DataJsonPlugin + datajson_export=ckanext.datajson:JsonExportPlugin datajson_harvest=ckanext.datajson:DataJsonHarvester cmsdatanav_harvest=ckanext.datajson:CmsDataNavigatorHarvester """, From 706e61722c337763fdc8ef907eded56c86f71550 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Wed, 25 Feb 2015 16:50:50 -0500 Subject: [PATCH 04/22] revert requirements renamings --- ckanext/datajson/__init__.py | 1 + requirements.txt => pip-requirements.txt | 1 + 2 files changed, 2 insertions(+) rename requirements.txt => pip-requirements.txt (76%) diff --git a/ckanext/datajson/__init__.py b/ckanext/datajson/__init__.py index d5261f69..b68480c0 100644 --- a/ckanext/datajson/__init__.py +++ b/ckanext/datajson/__init__.py @@ -7,5 +7,6 @@ __path__ = pkgutil.extend_path(__path__, __name__) from plugin import JsonExportPlugin +from plugin import DataJsonPlugin from harvester_datajson import DataJsonHarvester from harvester_cmsdatanavigator import CmsDataNavigatorHarvester diff --git a/requirements.txt b/pip-requirements.txt similarity index 76% rename from requirements.txt rename to pip-requirements.txt index 441b63d5..4f5e07df 100644 --- a/requirements.txt +++ b/pip-requirements.txt @@ -1,3 +1,4 @@ pyyaml lepl jsonschema +rfc3987 \ No newline at end of file From 737dcfac0e46bc6f75652840ff77eb6aac931e07 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Wed, 25 Feb 2015 17:20:56 -0500 Subject: [PATCH 05/22] get closer to catalog branch --- ckanext/datajson/build_datajson.py | 4 +- ckanext/datajson/plugin.py | 386 +++++++++++++++-------------- 2 files changed, 197 insertions(+), 193 deletions(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index de7be113..17b1c8f5 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -13,7 +13,7 @@ # TODO this file is pretty sloppy, needs cleanup and redundancies removed -def make_datajson_catalog(datasets): +def make_datajson_export_catalog(datasets): catalog = OrderedDict([ ('conformsTo', 'https://project-open-data.cio.gov/v1.1/schema'), # requred ('describedBy', 'https://project-open-data.cio.gov/v1.1/schema/catalog.json'), # optional @@ -24,7 +24,7 @@ def make_datajson_catalog(datasets): return catalog -def make_datajson_entry(package): +def make_datajson_export_entry(package): # extras is a list of dicts [{},{}, {}]. 
For each dict, extract the key, value entries into a new dict extras = dict([(x['key'], x['value']) for x in package['extras']]) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 0daae0a9..6997079c 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -1,39 +1,24 @@ -import json import logging import StringIO +import json import ckan.plugins as p from ckan.lib.base import BaseController, render, c -import ckan.model as model from pylons import request, response -import ckan.lib.dictization.model_dictize as model_dictize import re +import ckan.model as model +import ckan.lib.dictization.model_dictize as model_dictize from jsonschema.exceptions import best_match -logger = logging.getLogger('datajson') - - -def get_validator(): - import os - from jsonschema import Draft4Validator, FormatChecker - - schema_path = os.path.join(os.path.dirname(__file__), 'pod_schema', 'federal-v1.1', 'dataset.json') - with open(schema_path, 'r') as file: - schema = json.loads(file.read()) - return Draft4Validator(schema, format_checker=FormatChecker()) - - logger.warn('Unable to create validator') - return None - -validator = get_validator() +logger = logging.getLogger('datajson') try: from collections import OrderedDict # 2.7 except ImportError: from sqlalchemy.util import OrderedDict -from build_datajson import make_datajson_entry, make_datajson_catalog +from build_datajson import make_datajson_export_entry, make_datajson_export_catalog # from build_enterprisedatajson import make_enterprisedatajson_entry from build_datajsonld import dataset_to_jsonld @@ -73,29 +58,30 @@ def before_map(self, m): def after_map(self, m): if JsonExportPlugin.route_enabled: # /data.json and /data.jsonld (or other path as configured by user) - m.connect('datajson', JsonExportPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController', + m.connect('datajson_export', JsonExportPlugin.route_path, + controller='ckanext.datajson.plugin:JsonExportController', action='generate_json') # TODO commenting out enterprise data inventory for right now - # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_enterprise') - # m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') + # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, controller='ckanext.datajson.plugin:JsonExportController', action='generate_enterprise') + # m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:JsonExportController', action='generate_jsonld') # TODO DWC update action # /data/{org}/data.json m.connect('public_data_listing', '/organization/{org}/data.json', - controller='ckanext.datajson.plugin:DataJsonController', action='generate_pdl') + controller='ckanext.datajson.plugin:JsonExportController', action='generate_pdl') # TODO DWC update action # /data/{org}/edi.json m.connect('enterprise_data_inventory', '/organization/{org}/edi.json', - controller='ckanext.datajson.plugin:DataJsonController', action='generate_edi') + controller='ckanext.datajson.plugin:JsonExportController', action='generate_edi') # /pod/validate - # m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:DataJsonController', action='validator') + # m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:JsonExportController', action='validator') return m 
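With these routes wired up, the export endpoints can be exercised over plain HTTP. A minimal sketch in Python 2 to match the codebase (the host and organization names are hypothetical; the paths are the defaults set in update_config):

    import json
    import urllib2

    # Site-wide export, served by the 'datajson_export' route.
    datasets = json.load(urllib2.urlopen("http://ckan.example.gov/data.json"))

    # Per-organization listing, served by 'public_data_listing'. generate_pdl
    # responds with a zip attachment (data.json plus errorlog.txt) rather than
    # bare JSON, and only for editors/admins of the organization.
    pdl = urllib2.urlopen("http://ckan.example.gov/organization/example-org/data.json").read()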
-class DataJsonController(BaseController): +class JsonExportController(BaseController): def generate_output(self, format): # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -106,7 +92,7 @@ def generate_output(self, format): # TODO special processing for enterprise # output - data = make_json() + data = self.make_json() if format == 'json-ld': # Convert this to JSON-LD. @@ -179,7 +165,7 @@ def generate_pdl(self): # allow caching of response (e.g. by Apache) del response.headers["Cache-Control"] del response.headers["Pragma"] - return make_pdl(match.group(1)) + return self.make_pdl(match.group(1)) return "Invalid organization id" def generate_edi(self): @@ -195,169 +181,187 @@ def generate_edi(self): # allow caching of response (e.g. by Apache) del response.headers["Cache-Control"] del response.headers["Pragma"] - return make_edi(match.group(1)) + return self.make_edi(match.group(1)) return "Invalid organization id" -def make_json(): - # Build the data.json file. - packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) - output = [] - # Create data.json only using public and public-restricted datasets, datasets marked non-public are not exposed - for pkg in packages: - extras = dict([(x['key'], x['value']) for x in pkg['extras']]) - try: - if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_entry(pkg) - if datajson_entry: - output.append(datajson_entry) - else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - except KeyError: - logger.warn("Dataset id=[%s], title=[%s] missing required 'public_access_level' field", pkg.get('id', None), - pkg.get('title', None)) - pass - return output - - -def make_edi(owner_org): - # Error handler for creating error log - stream = StringIO.StringIO() - eh = logging.StreamHandler(stream) - eh.setLevel(logging.WARN) - formatter = logging.Formatter('%(asctime)s - %(message)s') - eh.setFormatter(formatter) - logger.addHandler(eh) - - # Build the data.json file. - packages = get_packages(owner_org) - - output = [] - for pkg in packages: - # if pkg['owner_org'] == owner_org: - datajson_entry = make_datajson_entry(pkg) - if datajson_entry and is_valid(datajson_entry): - output.append(datajson_entry) - else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - - # Get the error log - eh.flush() - error = stream.getvalue() - eh.close() - logger.removeHandler(eh) - stream.close() - - # return json.dumps(output) - return write_zip(output, error, zip_name='edi') - - -def make_pdl(owner_org): - # Error handler for creating error log - stream = StringIO.StringIO() - eh = logging.StreamHandler(stream) - eh.setLevel(logging.WARN) - formatter = logging.Formatter('%(asctime)s - %(message)s') - eh.setFormatter(formatter) - logger.addHandler(eh) - - # Build the data.json file. 
- packages = get_packages(owner_org) - - output = [] - # Create data.json only using public datasets, datasets marked non-public are not exposed - for pkg in packages: - extras = dict([(x['key'], x['value']) for x in pkg['extras']]) - try: - if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_entry(pkg) - if datajson_entry and is_valid(datajson_entry): - output.append(datajson_entry) - else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - - except KeyError: - logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", - pkg.get('id', None), pkg.get('title', None)) - pass - - # Get the error log - eh.flush() - error = stream.getvalue() - eh.close() - logger.removeHandler(eh) - stream.close() - - # return json.dumps(output) - return write_zip(output, error, zip_name='pdl') - - -def get_packages(owner_org): - # Build the data.json file. - packages = get_all_group_packages(group_id=owner_org) - # get packages for sub-agencies. - sub_agency = model.Group.get(owner_org) - if 'sub-agencies' in sub_agency.extras.col.target and \ - sub_agency.extras.col.target['sub-agencies'].state == 'active': - sub_agencies = sub_agency.extras.col.target['sub-agencies'].value - sub_agencies_list = sub_agencies.split(",") - for sub in sub_agencies_list: - sub_packages = get_all_group_packages(group_id=sub) - for sub_package in sub_packages: - packages.append(sub_package) - - return packages - - -def get_all_group_packages(group_id): - """ - Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages. - """ - result = [] - for pkg_rev in model.Group.get(group_id).packages(with_private=True, context={'user_is_admin': True}): - result.append(model_dictize.package_dictize(pkg_rev, {'model': model})) - - return result - - -def is_valid(instance): - """ - Validates a data.json entry against the project open data's JSON schema. Log a warning message on validation error - """ - error = best_match(validator.iter_errors(instance)) - if error: - logger.warn("Validation failed, best guess of error = %s", error) - return False - return True - - -def write_zip(data, error=None, zip_name='data'): - """ - Data: a python object to write to the data.json - Error: unicode string representing the content of the error log. - zip_name: the name to use for the zip file - """ - import zipfile - - o = StringIO.StringIO() - zf = zipfile.ZipFile(o, mode='w') - - # Write the data file - if data: - zf.writestr('data.json', json.dumps(make_datajson_catalog(data), ensure_ascii=False).encode('utf8')) - - # Write the error log - if error: - zf.writestr('errorlog.txt', error.encode('utf8')) - - zf.close() - o.seek(0) - - binary = o.read() - o.close() - - response.content_type = 'application/octet-stream' - response.content_disposition = 'attachment; filename="%s.zip"' % zip_name - - return binary + def make_json(self): + # Build the data.json file. 
+ packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) + output = [] + # Create data.json only using public and public-restricted datasets, datasets marked non-public are not exposed + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + try: + if not (re.match(r'[Nn]on-public', extras['public_access_level'])): + datajson_entry = make_datajson_export_entry(pkg) + if datajson_entry: + output.append(datajson_entry) + else: + logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None)) + except KeyError: + logger.warn("Dataset id=[%s], title=[%s] missing required 'public_access_level' field", + pkg.get('id', None), + pkg.get('title', None)) + pass + return output + + + def make_edi(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. + packages = self.get_packages(owner_org) + + output = [] + for pkg in packages: + # if pkg['owner_org'] == owner_org: + datajson_entry = make_datajson_export_entry(pkg) + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, zip_name='edi') + + + def make_pdl(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. + packages = self.get_packages(owner_org) + + output = [] + # Create data.json only using public datasets, datasets marked non-public are not exposed + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + try: + if not (re.match(r'[Nn]on-public', extras['public_access_level'])): + datajson_entry = make_datajson_export_entry(pkg) + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None)) + + except KeyError: + logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", + pkg.get('id', None), pkg.get('title', None)) + pass + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, zip_name='pdl') + + + def get_packages(self, owner_org): + # Build the data.json file. + packages = self.get_all_group_packages(group_id=owner_org) + # get packages for sub-agencies. 
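+        # ('sub-agencies' is a comma-separated list of group ids kept in the organization's extras)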
+        sub_agency = model.Group.get(owner_org)
+        if 'sub-agencies' in sub_agency.extras.col.target and \
+                sub_agency.extras.col.target['sub-agencies'].state == 'active':
+            sub_agencies = sub_agency.extras.col.target['sub-agencies'].value
+            sub_agencies_list = sub_agencies.split(",")
+            for sub in sub_agencies_list:
+                sub_packages = self.get_all_group_packages(group_id=sub)
+                for sub_package in sub_packages:
+                    packages.append(sub_package)
+
+        return packages
+
+
+    def get_all_group_packages(self, group_id):
+        """
+        Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages.
+        """
+        result = []
+        for pkg_rev in model.Group.get(group_id).packages(with_private=True, context={'user_is_admin': True}):
+            result.append(model_dictize.package_dictize(pkg_rev, {'model': model}))
+
+        return result
+
+
+    def is_valid(self, instance):
+        """
+        Validates a data.json entry against the project open data's JSON schema. Log a warning message on validation error
+        """
+        error = best_match(validator.iter_errors(instance))
+        if error:
+            logger.warn("Validation failed, best guess of error = %s", error)
+            return False
+        return True
+
+
+    def write_zip(self, data, error=None, zip_name='data'):
+        """
+        Data: a python object to write to the data.json
+        Error: unicode string representing the content of the error log.
+        zip_name: the name to use for the zip file
+        """
+        import zipfile
+
+        o = StringIO.StringIO()
+        zf = zipfile.ZipFile(o, mode='w')
+
+        # Write the data file
+        if data:
+            zf.writestr('data.json', json.dumps(make_datajson_export_catalog(data), ensure_ascii=False).encode('utf8'))
+
+        # Write the error log
+        if error:
+            zf.writestr('errorlog.txt', error.encode('utf8'))
+
+        zf.close()
+        o.seek(0)
+
+        binary = o.read()
+        o.close()
+
+        response.content_type = 'application/octet-stream'
+        response.content_disposition = 'attachment; filename="%s.zip"' % zip_name
+
+        return binary
+
+
+def get_validator():
+    import os
+    from jsonschema import Draft4Validator, FormatChecker
+
+    schema_path = os.path.join(os.path.dirname(__file__), 'pod_schema', 'federal-v1.1', 'dataset.json')
+    try:
+        with open(schema_path, 'r') as file:
+            schema = json.loads(file.read())
+        return Draft4Validator(schema, format_checker=FormatChecker())
+    except (IOError, ValueError):
+        # Without a readable, parseable schema file no validator can be built.
+        logger.warn('Unable to create validator')
+        return None
+
+validator = get_validator()
\ No newline at end of file
From 28208d33f07fd156ff690b2fe8da127482f1e8c2 Mon Sep 17 00:00:00 2001
From: Alex Perfilov
Date: Thu, 12 Mar 2015 15:40:13 -0400
Subject: [PATCH 06/22] replaced schema path to catalog structure

---
 ckanext/datajson/build_datajson.py | 843 ++++++++++--------
 ckanext/datajson/build_datajsonld.py | 29 +-
 ckanext/datajson/plugin.py | 14 +-
 .../pod_schema/federal-v1.1/dataset.json | 10 +-
 .../non-federal-v1.1/dataset-non-federal.json | 8 +-
 5 files changed, 517 insertions(+), 387 deletions(-)

diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py
index 17b1c8f5..d4e472eb 100644
--- a/ckanext/datajson/build_datajson.py
+++ b/ckanext/datajson/build_datajson.py
@@ -11,299 +11,64 @@ log = logging.getLogger('datajson')
 
-# TODO this file is pretty sloppy, needs cleanup and redundancies removed
-
-def make_datajson_export_catalog(datasets):
-    catalog = OrderedDict([
-        ('conformsTo', 'https://project-open-data.cio.gov/v1.1/schema'),  # requred
-        ('describedBy', 'https://project-open-data.cio.gov/v1.1/schema/catalog.json'),  # optional
-        ('@context', 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld'),  # optional
-
('@type', 'dcat:Catalog'), # optional - ('dataset', datasets), # required - ]) - return catalog - - -def make_datajson_export_entry(package): - # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict - extras = dict([(x['key'], x['value']) for x in package['extras']]) - - parent_dataset_id = extras.get('parent_dataset') - if parent_dataset_id: - parent = model.Package.get(parent_dataset_id) - parent_uid = parent.extras.col.target['unique_id'].value - if parent_uid: - parent_dataset_id = parent_uid - - # if resource format is CSV then convert it to text/csv - # Resource format has to be in 'csv' format for automatic datastore push. - for r in package["resources"]: - if r["format"].lower() == "csv": - r["format"] = "text/csv" - if r["format"].lower() == "json": - r["format"] = "application/json" - if r["format"].lower() == "pdf": - r["format"] = "application/pdf" - - try: - retlist = [ - ("@type", "dcat:Dataset"), # optional - - ("title", strip_if_string(package["title"])), # required - - # ("accessLevel", 'public'), # required - ("accessLevel", strip_if_string(extras.get('public_access_level'))), # required - - # ("accrualPeriodicity", "R/P1Y"), # optional - # ('accrualPeriodicity', 'accrual_periodicity'), - ('accrualPeriodicity', get_accrual_periodicity(extras.get('accrual_periodicity'))), # optional - - ("conformsTo", strip_if_string(extras.get('conforms_to'))), # optional - - # ('contactPoint', OrderedDict([ - # ("@type", "vcard:Contact"), - # ("fn", "Jane Doe"), - # ("hasEmail", "mailto:jane.doe@agency.gov") - # ])), # required - ('contactPoint', get_contact_point(extras, package)), # required - - ("dataQuality", strip_if_string(extras.get('data_quality'))), # required-if-applicable - - ("describedBy", strip_if_string(extras.get('data_dictionary'))), # optional - ("describedByType", strip_if_string(extras.get('data_dictionary_type'))), # optional - - ("description", strip_if_string(package["notes"])), # required - - # ("description", 'asdfasdf'), # required - - ("identifier", strip_if_string(extras.get('unique_id'))), # required - # ("identifier", 'asdfasdfasdf'), # required - - ("isPartOf", parent_dataset_id), # optional - ("issued", strip_if_string(extras.get('release_date'))), # optional - - # ("keyword", ['a', 'b']), # required - ("keyword", [t["display_name"] for t in package["tags"]]), # required - - ("landingPage", strip_if_string(extras.get('homepage_url'))), # optional - - ("license", strip_if_string(extras.get("license_new"))), # required-if-applicable - - ("modified", strip_if_string(extras.get("modified"))), # required - - ("primaryITInvestmentUII", strip_if_string(extras.get('primary_it_investment_uii'))), # optional - - # ('publisher', OrderedDict([ - # ("@type", "org:Organization"), - # ("name", "Widget Services") - # ])), # required - # ("publisher", get_publisher_tree(extras)), # required - ("publisher", get_publisher_tree_wrong_order(extras)), # required - - ("rights", strip_if_string(extras.get('access_level_comment'))), # required - - ("spatial", strip_if_string(package.get("spatial"))), # required-if-applicable - - ('systemOfRecords', strip_if_string(extras.get('system_of_records'))), # optional - - ("temporal", strip_if_string(extras.get('temporal'))), # required-if-applicable - - ("distribution", generate_distribution(package)), # required-if-applicable - - # ("distribution", - # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. 
format) - # [ - # OrderedDict([ - # ("downloadURL", r["url"]), - # ("mediaType", r["formatReadable"]), - # ]) - # for r in package["resources"] - # ]) - ] - - for pair in [ - ('bureauCode', 'bureau_code'), # required - ('language', 'language'), # optional - ('programCode', 'program_code'), # required - ('references', 'related_documents'), # optional - ('theme', 'category'), # optional - ]: - split_multiple_entries(retlist, extras, pair) - - except KeyError as e: - log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'", package.get('id'), - package.get('title'), e) - return - - # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added - # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level', - # 'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title', - # 'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity', - # 'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents', - # 'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags', - # 'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3', - # 'publisher_4', 'publisher_5'] - # - # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset - # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. there's much better ways, maybe - # # generate a list of keys to ignore by calling a specific function to get the extras - # retlist_keys = [x for x, y in retlist] - # extras_keys = set(extras.keys()) - set(extras_to_filter_out) - # - # for key in extras_keys: - # convertedKey = underscore_to_camelcase(key) - # if convertedKey not in retlist_keys: - # retlist.append((convertedKey, extras[key])) - - # Remove entries where value is None, "", or empty list [] - striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []] - striped_retlist_keys = [x for x, y in striped_retlist] - - - # If a required metadata field was removed, return empty string - # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword", - # "modified", "programCode", "publisher", "title"]: - # if required_field not in striped_retlist_keys: - # log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'", - # package.get('id'), package.get('title'), required_field) - # return - - # When saved from UI DataQuality value is stored as "on" instead of True. - # Check if value is "on" and replace it with True. 
- striped_retlist_dict = OrderedDict(striped_retlist) - if striped_retlist_dict.get('dataQuality') == "on" \ - or striped_retlist_dict.get('dataQuality') == "true" \ - or striped_retlist_dict.get('dataQuality') == "True": - striped_retlist_dict['dataQuality'] = True - elif striped_retlist_dict.get('dataQuality') == "false" \ - or striped_retlist_dict.get('dataQuality') == "False": - striped_retlist_dict['dataQuality'] = False - - from datajsonvalidator import do_validation - - errors = [] - try: - do_validation([dict(striped_retlist_dict)], errors) - except Exception as e: - errors.append(("Internal Error", ["Something bad happened: " + unicode(e)])) - if len(errors) > 0: - for error in errors: - log.warn(error) - return - - return striped_retlist_dict - - -# used by get_accrual_periodicity -accrual_periodicity_dict = { - 'completely irregular': 'irregular', - 'decennial': 'R/P10Y', - 'quadrennial': 'R/P4Y', - 'annual': 'R/P1Y', - 'bimonthly': 'R/P2M', # or R/P0.5M - 'semiweekly': 'R/P3.5D', - 'daily': 'R/P1D', - 'biweekly': 'R/P2W', # or R/P0.5W - 'semiannual': 'R/P6M', - 'biennial': 'R/P2Y', - 'triennial': 'R/P3Y', - 'three times a week': 'R/P0.33W', - 'three times a month': 'R/P0.33M', - 'continuously updated': 'R/PT1S', - 'monthly': 'R/P1M', - 'quarterly': 'R/P3M', - 'semimonthly': 'R/P0.5M', - 'three times a year': 'R/P4M', - 'weekly': 'R/P1W' -} - - -def get_accrual_periodicity(frequency): - return accrual_periodicity_dict.get(str(frequency).lower().strip(), frequency) - - -def generate_distribution(package): - arr = [] - for r in package["resources"]: - resource = [("@type", "dcat:Distribution")] - rkeys = r.keys() - if 'url' in rkeys: - res_url = strip_if_string(r.get('url')) - if res_url: - if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'): - resource += [("accessURL", res_url)] - else: - resource += [("downloadURL", res_url)] - if 'format' in rkeys: - res_format = strip_if_string(r.get('format')) - if res_format: - resource += [("mediaType", res_format)] - else: - log.warn("Missing mediaType for resource in package ['%s']", package.get('id')) - else: - log.warn("Missing downloadURL for resource in package ['%s']", package.get('id')) - - # if 'accessURL_new' in rkeys: - # res_access_url = strip_if_string(r.get('accessURL_new')) - # if res_access_url: - # resource += [("accessURL", res_access_url)] - - if 'formatReadable' in rkeys: - res_attr = strip_if_string(r.get('formatReadable')) - if res_attr: - resource += [("format", res_attr)] - - if 'name' in rkeys: - res_attr = strip_if_string(r.get('name')) - if res_attr: - resource += [("title", res_attr)] - - if 'notes' in rkeys: - res_attr = strip_if_string(r.get('notes')) - if res_attr: - resource += [("description", res_attr)] - - if 'conformsTo' in rkeys: - res_attr = strip_if_string(r.get('conformsTo')) - if res_attr: - resource += [("conformsTo", res_attr)] - - if 'describedBy' in rkeys: - res_attr = strip_if_string(r.get('describedBy')) - if res_attr: - resource += [("describedBy", res_attr)] - - if 'describedByType' in rkeys: - res_attr = strip_if_string(r.get('describedByType')) - if res_attr: - resource += [("describedByType", res_attr)] - - striped_resource = [(x, y) for x, y in resource if y is not None and y != "" and y != []] - - arr += [OrderedDict(striped_resource)] - - return arr - - -def get_contact_point(extras, package): - for required_field in ["contact_name", "contact_email"]: - if required_field not in extras.keys(): - raise KeyError(required_field) - - email = 
strip_if_string(extras['contact_email']) - if email is None or '@' not in email: - raise KeyError(required_field) - fn = strip_if_string(extras['contact_name']) - if fn is None: - raise KeyError(required_field) - - contact_point = OrderedDict([ - ('@type', 'vcard:Contact'), # optional - ('fn', fn), # required - ('hasEmail', 'mailto:' + email), # required +def get_facet_fields(): + # Return fields that we'd like to add to default CKAN faceting. This really has + # nothing to do with exporting data.json but it's probably a common consideration. + facets = OrderedDict() + facets[ + # using "author" produces weird results because the Solr schema indexes it as "text" rather than "string" + "Agency"] = "Publishers" + # search facets remove spaces from field names + facets["SubjectArea1"] = "Subjects" + return facets + + +def make_datajson_entry(package): + return OrderedDict([ + ("title", package["title"]), + ("description", package["notes"]), + ("keyword", [t["display_name"] for t in package["tags"]]), + ("modified", extra(package, "Date Updated")), + ("publisher", package["author"]), + ("bureauCode", extra(package, "Bureau Code").split(" ") if extra(package, "Bureau Code") else None), + ("programCode", extra(package, "Program Code").split(" ") if extra(package, "Program Code") else None), + ("contactPoint", extra(package, "Contact Name")), + ("mbox", extra(package, "Contact Email")), + ("identifier", package["id"]), + ("accessLevel", extra(package, "Access Level", default="public")), + ("accessLevelComment", extra(package, "Access Level Comment")), + ("dataDictionary", extra(package, "Data Dictionary")), + ("accessURL", get_primary_resource(package).get("url", None)), + ("webService", get_api_resource(package).get("url", None)), + ("format", extension_to_mime_type(get_primary_resource(package).get("format", None)) ), + ("license", extra(package, "License Agreement")), + ("spatial", extra(package, "Geographic Scope")), + ("temporal", build_temporal(package)), + ("issued", extra(package, "Date Released")), + ("accrualPeriodicity", extra(package, "Publish Frequency")), + ("language", extra(package, "Language")), + ("PrimaryITInvestmentUII", extra(package, "PrimaryITInvestmentUII")), + ("granularity", "/".join( + x for x in [extra(package, "Unit of Analysis"), extra(package, "Geographic Granularity")] if x != None)), + ("dataQuality", extra(package, "Data Quality Met", default="true") == "true"), + ("theme", [s for s in ( + extra(package, "Subject Area 1"), extra(package, "Subject Area 2"), extra(package, "Subject Area 3")) if + s != None]), + ("references", [s for s in [extra(package, "Technical Documentation")] if s != None]), + ("landingPage", package["url"]), + ("systemOfRecords", extra(package, "System Of Records")), + ("distribution", + [ + OrderedDict([ + ("identifier", r["id"]), # NOT in POD standard, but useful for conversion to JSON-LD + ("accessURL", r["url"]), + ("format", r.get("mimetype", extension_to_mime_type(r["format"]))), + ]) + for r in package["resources"] + if r["format"].lower() not in ("api", "query tool", "widget") + ]), ]) - return contact_point def extra(package, key, default=None): @@ -314,72 +79,23 @@ def extra(package, key, default=None): return default -def get_publisher_tree_wrong_order(extras): - publisher = strip_if_string(extras.get('publisher')) - if publisher is None: - raise KeyError('publisher') - - organization_list = list() - organization_list.append([ - ('@type', 'org:Organization'), # optional - ('name', publisher), # required - ]) - - for i in range(1, 
6): - key = 'publisher_' + str(i) - if key in extras and extras[key] and strip_if_string(extras[key]): - organization_list.append([ - ('@type', 'org:Organization'), # optional - ('name', strip_if_string(extras[key])), # required - ]) - - size = len(organization_list) - - # [OSCIT, GSA] - # organization_list.reverse() - # [GSA, OSCIT] - - tree = False - for i in range(0, size): - if tree: - organization_list[i] += [('subOrganizationOf', OrderedDict(tree))] - tree = organization_list[i] - - return OrderedDict(tree) - - -def underscore_to_camelcase(value): - """ - Convert underscored strings to camel case, e.g. one_two_three to oneTwoThree - """ - - def camelcase(): - yield unicode.lower - while True: - yield unicode.capitalize - - c = camelcase() - return "".join(c.next()(x) if x else '_' for x in value.split("_")) - - -def get_best_resource(package, acceptable_formats): +def get_best_resource(package, acceptable_formats, unacceptable_formats=None): resources = list(r for r in package["resources"] if r["format"].lower() in acceptable_formats) - if len(resources) == 0: return {} - resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower())) + if len(resources) == 0: + if unacceptable_formats: + # try at least any resource that's not unacceptable + resources = list(r for r in package["resources"] if r["format"].lower() not in unacceptable_formats) + if len(resources) == 0: + # there is no acceptable resource to show + return {} + else: + resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower())) return resources[0] -def strip_if_string(val): - if isinstance(val, (str, unicode)): - val = val.strip() - if '' == val: - val = None - return val - - def get_primary_resource(package): # Return info about a "primary" resource. Select a good one. - return get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf")) + return get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf"), ("api", "query tool", "widget")) def get_api_resource(package): @@ -387,9 +103,422 @@ def get_api_resource(package): return get_best_resource(package, ("api", "query tool")) -def split_multiple_entries(retlist, extras, names): - found_element = string.strip(extras.get(names[1], "")) - if found_element: - retlist.append( - (names[0], [string.strip(x) for x in string.split(found_element, ',')]) - ) +def build_temporal(package): + # Build one dataset entry of the data.json file. 
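+    # More precisely: returns the dataset's ISO 8601 temporal range, e.g. "FY2012/FY2013", or None if both ends are unknown.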
+    temporal = ""
+    if extra(package, "Coverage Period Fiscal Year Start"):
+        temporal = "FY" + extra(package, "Coverage Period Fiscal Year Start").replace(" ", "T").replace("T00:00:00", "")
+    else:
+        temporal = extra(package, "Coverage Period Start", "Unknown").replace(" ", "T").replace("T00:00:00", "")
+    temporal += "/"
+    if extra(package, "Coverage Period Fiscal Year End"):
+        temporal += "FY" + extra(package, "Coverage Period Fiscal Year End").replace(" ", "T").replace("T00:00:00", "")
+    else:
+        temporal += extra(package, "Coverage Period End", "Unknown").replace(" ", "T").replace("T00:00:00", "")
+    if temporal == "Unknown/Unknown": return None
+    return temporal
+
+
+def extension_to_mime_type(file_ext):
+    if file_ext is None: return None
+    ext = {
+        "csv": "text/csv",
+        "xls": "application/vnd.ms-excel",
+        "xml": "application/xml",
+        "rdf": "application/rdf+xml",
+        "json": "application/json",
+        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "text": "text/plain",
+        "feed": "application/rss+xml",
+    }
+    return ext.get(file_ext.lower(), "application/unknown")
+
+
+class JsonExportBuilder:
+    @staticmethod
+    def make_datajson_export_catalog(datasets):
+        catalog = OrderedDict([
+            ('conformsTo', 'https://project-open-data.cio.gov/v1.1/schema'),  # required
+            ('describedBy', 'https://project-open-data.cio.gov/v1.1/schema/catalog.json'),  # optional
+            ('@context', 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld'),  # optional
+            ('@type', 'dcat:Catalog'),  # optional
+            ('dataset', datasets),  # required
+        ])
+        return catalog
+
+    @staticmethod
+    def make_datajson_export_entry(package):
+        # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
+        extras = dict([(x['key'], x['value']) for x in package['extras']])
+
+        parent_dataset_id = extras.get('parent_dataset')
+        if parent_dataset_id:
+            parent = model.Package.get(parent_dataset_id)
+            parent_uid = parent.extras.col.target['unique_id'].value
+            if parent_uid:
+                parent_dataset_id = parent_uid
+
+        # if resource format is CSV then convert it to text/csv
+        # Resource format has to be in 'csv' format for automatic datastore push.
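+        # (e.g. "csv" -> "text/csv"; formats other than csv/json/pdf pass through unchanged)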
+ for r in package["resources"]: + if r["format"].lower() == "csv": + r["format"] = "text/csv" + if r["format"].lower() == "json": + r["format"] = "application/json" + if r["format"].lower() == "pdf": + r["format"] = "application/pdf" + + try: + retlist = [ + ("@type", "dcat:Dataset"), # optional + + ("title", JsonExportBuilder.strip_if_string(package["title"])), # required + + # ("accessLevel", 'public'), # required + ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))), # required + + # ("accrualPeriodicity", "R/P1Y"), # optional + # ('accrualPeriodicity', 'accrual_periodicity'), + ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))), + # optional + + ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))), # optional + + # ('contactPoint', OrderedDict([ + # ("@type", "vcard:Contact"), + # ("fn", "Jane Doe"), + # ("hasEmail", "mailto:jane.doe@agency.gov") + # ])), # required + ('contactPoint', JsonExportBuilder.get_contact_point(extras, package)), # required + + ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))), + # required-if-applicable + + ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))), # optional + ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))), # optional + + ("description", JsonExportBuilder.strip_if_string(package["notes"])), # required + + # ("description", 'asdfasdf'), # required + + ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))), # required + # ("identifier", 'asdfasdfasdf'), # required + + ("isPartOf", parent_dataset_id), # optional + ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))), # optional + + # ("keyword", ['a', 'b']), # required + ("keyword", [t["display_name"] for t in package["tags"]]), # required + + ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))), # optional + + ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))), # required-if-applicable + + ("modified", JsonExportBuilder.strip_if_string(extras.get("modified"))), # required + + ("primaryITInvestmentUII", JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))), + # optional + + # ('publisher', OrderedDict([ + # ("@type", "org:Organization"), + # ("name", "Widget Services") + # ])), # required + # ("publisher", get_publisher_tree(extras)), # required + ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)), # required + + ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))), # required + + ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))), # required-if-applicable + + ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))), # optional + + ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))), # required-if-applicable + + ("distribution", JsonExportBuilder.generate_distribution(package)), # required-if-applicable + + # ("distribution", + # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. 
format) + # [ + # OrderedDict([ + # ("downloadURL", r["url"]), + # ("mediaType", r["formatReadable"]), + # ]) + # for r in package["resources"] + # ]) + ] + + for pair in [ + ('bureauCode', 'bureau_code'), # required + ('language', 'language'), # optional + ('programCode', 'program_code'), # required + ('references', 'related_documents'), # optional + ('theme', 'category'), # optional + ]: + JsonExportBuilder.split_multiple_entries(retlist, extras, pair) + + except KeyError as e: + log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'", package.get('id'), + package.get('title'), e) + return + + # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added + # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level', + # 'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title', + # 'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity', + # 'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents', + # 'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags', + # 'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3', + # 'publisher_4', 'publisher_5'] + # + # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset + # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. there's much better ways, maybe + # # generate a list of keys to ignore by calling a specific function to get the extras + # retlist_keys = [x for x, y in retlist] + # extras_keys = set(extras.keys()) - set(extras_to_filter_out) + # + # for key in extras_keys: + # convertedKey = underscore_to_camelcase(key) + # if convertedKey not in retlist_keys: + # retlist.append((convertedKey, extras[key])) + + # Remove entries where value is None, "", or empty list [] + striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []] + striped_retlist_keys = [x for x, y in striped_retlist] + + + # If a required metadata field was removed, return empty string + # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword", + # "modified", "programCode", "publisher", "title"]: + # if required_field not in striped_retlist_keys: + # log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'", + # package.get('id'), package.get('title'), required_field) + # return + + # When saved from UI DataQuality value is stored as "on" instead of True. + # Check if value is "on" and replace it with True. 
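+        # e.g. a stored dataQuality value of "on", "true", or "True" exports
+        # as the JSON boolean true, and "false"/"False" as false.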
+ striped_retlist_dict = OrderedDict(striped_retlist) + if striped_retlist_dict.get('dataQuality') == "on" \ + or striped_retlist_dict.get('dataQuality') == "true" \ + or striped_retlist_dict.get('dataQuality') == "True": + striped_retlist_dict['dataQuality'] = True + elif striped_retlist_dict.get('dataQuality') == "false" \ + or striped_retlist_dict.get('dataQuality') == "False": + striped_retlist_dict['dataQuality'] = False + + from datajsonvalidator import do_validation + + errors = [] + try: + do_validation([dict(striped_retlist_dict)], errors) + except Exception as e: + errors.append(("Internal Error", ["Something bad happened: " + unicode(e)])) + if len(errors) > 0: + for error in errors: + log.warn(error) + return + + return striped_retlist_dict + + + # used by get_accrual_periodicity + accrual_periodicity_dict = { + 'completely irregular': 'irregular', + 'decennial': 'R/P10Y', + 'quadrennial': 'R/P4Y', + 'annual': 'R/P1Y', + 'bimonthly': 'R/P2M', # or R/P0.5M + 'semiweekly': 'R/P3.5D', + 'daily': 'R/P1D', + 'biweekly': 'R/P2W', # or R/P0.5W + 'semiannual': 'R/P6M', + 'biennial': 'R/P2Y', + 'triennial': 'R/P3Y', + 'three times a week': 'R/P0.33W', + 'three times a month': 'R/P0.33M', + 'continuously updated': 'R/PT1S', + 'monthly': 'R/P1M', + 'quarterly': 'R/P3M', + 'semimonthly': 'R/P0.5M', + 'three times a year': 'R/P4M', + 'weekly': 'R/P1W' + } + + @staticmethod + def get_accrual_periodicity(frequency): + return JsonExportBuilder.accrual_periodicity_dict.get(str(frequency).lower().strip(), frequency) + + @staticmethod + def generate_distribution(package): + arr = [] + for r in package["resources"]: + resource = [("@type", "dcat:Distribution")] + rkeys = r.keys() + if 'url' in rkeys: + res_url = JsonExportBuilder.strip_if_string(r.get('url')) + if res_url: + if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'): + resource += [("accessURL", res_url)] + else: + resource += [("downloadURL", res_url)] + if 'format' in rkeys: + res_format = JsonExportBuilder.strip_if_string(r.get('format')) + if res_format: + resource += [("mediaType", res_format)] + else: + log.warn("Missing mediaType for resource in package ['%s']", package.get('id')) + else: + log.warn("Missing downloadURL for resource in package ['%s']", package.get('id')) + + # if 'accessURL_new' in rkeys: + # res_access_url = JsonExportBuilder.strip_if_string(r.get('accessURL_new')) + # if res_access_url: + # resource += [("accessURL", res_access_url)] + + if 'formatReadable' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('formatReadable')) + if res_attr: + resource += [("format", res_attr)] + + if 'name' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('name')) + if res_attr: + resource += [("title", res_attr)] + + if 'notes' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('notes')) + if res_attr: + resource += [("description", res_attr)] + + if 'conformsTo' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('conformsTo')) + if res_attr: + resource += [("conformsTo", res_attr)] + + if 'describedBy' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('describedBy')) + if res_attr: + resource += [("describedBy", res_attr)] + + if 'describedByType' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('describedByType')) + if res_attr: + resource += [("describedByType", res_attr)] + + striped_resource = [(x, y) for x, y in resource if y is not None and y != "" and y != []] + + arr += [OrderedDict(striped_resource)] + + return arr + + 
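+    # For illustration (values borrowed from the commented example above):
+    # extras {"contact_name": "Jane Doe", "contact_email": "jane.doe@agency.gov"}
+    # yield OrderedDict([("@type", "vcard:Contact"), ("fn", "Jane Doe"),
+    # ("hasEmail", "mailto:jane.doe@agency.gov")]).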
+    @staticmethod
+    def get_contact_point(extras, package):
+        for required_field in ["contact_name", "contact_email"]:
+            if required_field not in extras.keys():
+                raise KeyError(required_field)
+
+        # A missing or malformed value is raised as a KeyError so that
+        # make_datajson_export_entry logs it and skips the package.
+        email = JsonExportBuilder.strip_if_string(extras['contact_email'])
+        if email is None or '@' not in email:
+            raise KeyError('contact_email')
+
+        fn = JsonExportBuilder.strip_if_string(extras['contact_name'])
+        if fn is None:
+            raise KeyError('contact_name')
+
+        contact_point = OrderedDict([
+            ('@type', 'vcard:Contact'),  # optional
+            ('fn', fn),  # required
+            ('hasEmail', 'mailto:' + email),  # required
+        ])
+        return contact_point
+
+    @staticmethod
+    def extra(package, key, default=None):
+        # Retrieves the value of an extras field.
+        for extra in package["extras"]:
+            if extra["key"] == key:
+                return extra["value"]
+        return default
+
+    @staticmethod
+    def get_publisher_tree_wrong_order(extras):
+        publisher = JsonExportBuilder.strip_if_string(extras.get('publisher'))
+        if publisher is None:
+            raise KeyError('publisher')
+
+        organization_list = list()
+        organization_list.append([
+            ('@type', 'org:Organization'),  # optional
+            ('name', publisher),  # required
+        ])
+
+        for i in range(1, 6):
+            key = 'publisher_' + str(i)
+            if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]):
+                organization_list.append([
+                    ('@type', 'org:Organization'),  # optional
+                    ('name', JsonExportBuilder.strip_if_string(extras[key])),  # required
+                ])
+
+        size = len(organization_list)
+
+        # [OSCIT, GSA]
+        # organization_list.reverse()
+        # [GSA, OSCIT]
+
+        # Fold the flat list into a nested subOrganizationOf tree.
+        tree = False
+        for i in range(0, size):
+            if tree:
+                organization_list[i] += [('subOrganizationOf', OrderedDict(tree))]
+            tree = organization_list[i]
+
+        return OrderedDict(tree)
+
+    @staticmethod
+    def underscore_to_camelcase(value):
+        """
+        Convert underscored strings to camel case, e.g. one_two_three to oneTwoThree
+        """
+
+        def camelcase():
+            yield unicode.lower
+            while True:
+                yield unicode.capitalize
+
+        c = camelcase()
+        return "".join(c.next()(x) if x else '_' for x in value.split("_"))
+
+    @staticmethod
+    def get_best_resource(package, acceptable_formats):
+        resources = list(r for r in package["resources"] if r["format"].lower() in acceptable_formats)
+        if len(resources) == 0: return {}
+        resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower()))
+        return resources[0]
+
+    @staticmethod
+    def strip_if_string(val):
+        if isinstance(val, (str, unicode)):
+            val = val.strip()
+            if '' == val:
+                val = None
+        return val
+
+    @staticmethod
+    def get_primary_resource(package):
+        # Return info about a "primary" resource. Select a good one.
+        return JsonExportBuilder.get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf"))
+
+    @staticmethod
+    def get_api_resource(package):
+        # Return info about an API resource.
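+        # "query tool" resources are treated as API endpoints here; when both
+        # kinds are present, the tuple order makes get_best_resource prefer "api".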
+        return JsonExportBuilder.get_best_resource(package, ("api", "query tool"))
+
+    @staticmethod
+    def split_multiple_entries(retlist, extras, names):
+        found_element = extras.get(names[1], "").strip()
+        if found_element:
+            retlist.append(
+                (names[0], [x.strip() for x in found_element.split(',')])
+            )
diff --git a/ckanext/datajson/build_datajsonld.py b/ckanext/datajson/build_datajsonld.py
index fb88f6dc..8a223912 100644
--- a/ckanext/datajson/build_datajsonld.py
+++ b/ckanext/datajson/build_datajsonld.py
@@ -4,10 +4,10 @@ from sqlalchemy.util import OrderedDict
 def dataset_to_jsonld(dataset):
-    from plugin import JsonExportPlugin
+    from plugin import DataJsonPlugin
 
     ret = OrderedDict([
-        ("@id", JsonExportPlugin.site_url + "/dataset/" + dataset["identifier"]),
+        ("@id", DataJsonPlugin.site_url + "/dataset/" + dataset["identifier"]),
         ("@type", "dcat:Dataset"),
     ])
 
@@ -20,9 +20,9 @@ def dataset_to_jsonld(dataset):
     return ret
 
 def distribution_to_jsonld(distribution):
-    from plugin import JsonExportPlugin
+    from plugin import DataJsonPlugin
     ret = OrderedDict([
-        ("@id", JsonExportPlugin.site_url + "/resource/" + distribution["identifier"]),
+        ("@id", DataJsonPlugin.site_url + "/resource/" + distribution["identifier"]),
         ("@type", "dcat:Distribution"),
     ])
     apply_jsonld_metadata_mapping(distribution, ret)
@@ -33,14 +33,18 @@ def distribution_to_jsonld(distribution):
     "description": "dcterms:description",
     "keyword": "dcat:keyword",
     "modified": "dcterms:modified",
-    "publisher": "dcat:publisher",
-    "person": "foaf:Person",
+    "publisher": "dcterms:publisher",
+    "contactPoint": "dcat:contactPoint",
     "mbox": "foaf:mbox",
     "identifier": "dcterms:identifier",
+    "accessLevel": "pod:accessLevel",
+    "bureauCode": "pod:bureauCode",
+    "programCode": "pod:programCode",
+    "accessLevelComment": "pod:accessLevelComment",
     "dataDictionary": "dcat:dataDictionary",
     "accessURL": "dcat:accessURL",
-    "webService": "dcat:webService",
+    "webService": "pod:webService",
     "format": "dcterms:format", # must be a dcterms:MediaTypeOrExtent
     "license": "dcterms:license",
     "spatial": "dcterms:spatial", # must be a dcterms:Location entity
@@ -49,19 +53,16 @@ def distribution_to_jsonld(distribution):
     "issued": "dcterms:issued",
     "accrualPeriodicity": "dcterms:accrualPeriodicity", # must be a dcterms:Frequency
     "language": "dcat:language", # must be an IRI
-    "granularity": "dcat:granularity",
-    "dataQuality": "xsd:boolean",
+    "dataQuality": "pod:dataQuality",
     "theme": "dcat:theme",
     "references": "dcterms:references",
-    "size": "dcat:size",
     "landingPage": "dcat:landingPage",
-    "feed": "dcat:feed",
+    "systemOfRecords": "pod:systemOfRecords",
 }
 
 jsonld_metadata_datatypes = {
     "modified": "http://www.w3.org/2001/XMLSchema#dateTime",
     "issued": "http://www.w3.org/2001/XMLSchema#dateTime",
-    "size": "http://www.w3.org/2001/XMLSchema#decimal",
 }
 
 def apply_jsonld_metadata_mapping(data, newdict):
@@ -72,10 +73,6 @@ def apply_jsonld_metadata_mapping(data, newdict):
         # skip fields with no mapping to RDF
         if k not in jsonld_metadata_mapping: continue
 
-        # specially handle 'keyword' which in JSON is packed in a comma-separated field
-        if k == "keyword":
-            v = v.split(",")
-
         # specially handle literal fields with datatypes
         if k in jsonld_metadata_datatypes:
             # Convert ISO datetime format to xsd:dateTime format.
diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 6997079c..1fd95b3f 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -18,7 +18,9 @@ except ImportError: from sqlalchemy.util import OrderedDict -from build_datajson import make_datajson_export_entry, make_datajson_export_catalog +from build_datajson import JsonExportBuilder + +from build_datajson import make_datajson_entry, get_facet_fields # from build_enterprisedatajson import make_enterprisedatajson_entry from build_datajsonld import dataset_to_jsonld @@ -194,7 +196,7 @@ def make_json(self): extras = dict([(x['key'], x['value']) for x in pkg['extras']]) try: if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_export_entry(pkg) + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if datajson_entry: output.append(datajson_entry) else: @@ -223,7 +225,7 @@ def make_edi(self, owner_org): output = [] for pkg in packages: # if pkg['owner_org'] == owner_org: - datajson_entry = make_datajson_export_entry(pkg) + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: @@ -258,7 +260,7 @@ def make_pdl(self, owner_org): extras = dict([(x['key'], x['value']) for x in pkg['extras']]) try: if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_export_entry(pkg) + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: @@ -333,7 +335,9 @@ def write_zip(self, data, error=None, zip_name='data'): # Write the data file if data: - zf.writestr('data.json', json.dumps(make_datajson_export_catalog(data), ensure_ascii=False).encode('utf8')) + zf.writestr('data.json', + json.dumps(JsonExportBuilder.make_datajson_export_catalog(data), ensure_ascii=False).encode( + 'utf8')) # Write the error log if error: diff --git a/ckanext/datajson/pod_schema/federal-v1.1/dataset.json b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json index 06fb984c..b9037fb8 100644 --- a/ckanext/datajson/pod_schema/federal-v1.1/dataset.json +++ b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json @@ -77,7 +77,7 @@ "uniqueItems": true }, "contactPoint": { - "$ref": "vcard.json" + "$ref": "#/definitions/vcard" }, "describedBy": { "title": "Data Dictionary", @@ -143,7 +143,7 @@ { "type": "array", "items": { - "$ref": "distribution.json", + "$ref": "#/definitions/distribution", "minItems": 1, "uniqueItems": true } @@ -267,7 +267,7 @@ "uniqueItems": true }, "publisher": { - "$ref": "organization.json" + "$ref": "#/definitions/organization" }, "references": { "title": "Related Documents", @@ -397,7 +397,7 @@ "hasEmail": { "title": "Email", "description": "Email address for the contact", - "pattern": "^mailto:([\\w.-]+@[\\w.-]+\\.[\\w.-]+)?$", + "pattern": "^mailto:[\\w.-]+@[\\w.-]+\\.[\\w.-]+?$", "type": "string" } } @@ -566,7 +566,7 @@ }, "subOrganizationOf": { "title": "Parent Organization", - "$ref": "organization.json" + "$ref": "#" } } } diff --git a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json index b0a7f846..3495512b 100644 --- a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json @@ -139,7 +139,7 @@ { "type": "array", "items": { - "$ref": 
"distribution.json", + "$ref": "#/definitions/distribution", "minItems": 1, "uniqueItems": true } @@ -263,7 +263,7 @@ "uniqueItems": true }, "publisher": { - "$ref": "organization.json" + "$ref": "#/definitions/organization" }, "references": { "title": "Related Documents", @@ -392,7 +392,7 @@ "hasEmail": { "title": "Email", "description": "Email address for the contact", - "pattern": "^mailto:([\\w.-]+@[\\w.-]+\\.[\\w.-]+)?$", + "pattern": "^mailto:[\\w.-]+@[\\w.-]+\\.[\\w.-]+?$", "type": "string" } } @@ -561,7 +561,7 @@ }, "subOrganizationOf": { "title": "Parent Organization", - "$ref": "organization.json" + "$ref": "#" } } } From 6a44b6896e8cb73a61f13dbed667b46d183bcffb Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 13 Mar 2015 12:17:14 -0400 Subject: [PATCH 07/22] Catalog changes integrated --- ckanext/datajson/build_datajson.py | 14 +- ckanext/datajson/harvester_base.py | 586 ++++++++++++++++-- .../datajson/harvester_cmsdatanavigator.py | 29 +- ckanext/datajson/harvester_datajson.py | 51 +- ckanext/datajson/parse_datajson.py | 225 ++++--- ckanext/datajson/plugin.py | 148 +++++ .../datajson/templates/html_rendition.html | 43 ++ 7 files changed, 942 insertions(+), 154 deletions(-) create mode 100644 ckanext/datajson/templates/html_rendition.html diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index d4e472eb..27c93d8b 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -16,9 +16,9 @@ def get_facet_fields(): # Return fields that we'd like to add to default CKAN faceting. This really has # nothing to do with exporting data.json but it's probably a common consideration. facets = OrderedDict() - facets[ - # using "author" produces weird results because the Solr schema indexes it as "text" rather than "string" - "Agency"] = "Publishers" + + # using "author" produces weird results because the Solr schema indexes it as "text" rather than "string" + facets["Agency"] = "Publishers" # search facets remove spaces from field names facets["SubjectArea1"] = "Subjects" return facets @@ -50,12 +50,13 @@ def make_datajson_entry(package): ("language", extra(package, "Language")), ("PrimaryITInvestmentUII", extra(package, "PrimaryITInvestmentUII")), ("granularity", "/".join( - x for x in [extra(package, "Unit of Analysis"), extra(package, "Geographic Granularity")] if x != None)), + x for x in [extra(package, "Unit of Analysis"), extra(package, "Geographic Granularity")] if + x is not None)), ("dataQuality", extra(package, "Data Quality Met", default="true") == "true"), ("theme", [s for s in ( extra(package, "Subject Area 1"), extra(package, "Subject Area 2"), extra(package, "Subject Area 3")) if - s != None]), - ("references", [s for s in [extra(package, "Technical Documentation")] if s != None]), + s is not None]), + ("references", [s for s in [extra(package, "Technical Documentation")] if s is not None]), ("landingPage", package["url"]), ("systemOfRecords", extra(package, "System Of Records")), ("distribution", @@ -105,7 +106,6 @@ def get_api_resource(package): def build_temporal(package): # Build one dataset entry of the data.json file. 
-    temporal = ""
     if extra(package, "Coverage Period Fiscal Year Start"):
         temporal = "FY" + extra(package, "Coverage Period Fiscal Year Start").replace(" ", "T").replace("T00:00:00", "")
     else:
diff --git a/ckanext/datajson/harvester_base.py b/ckanext/datajson/harvester_base.py
index 89d4ffd6..d1cf4de0 100644
--- a/ckanext/datajson/harvester_base.py
+++ b/ckanext/datajson/harvester_base.py
@@ -1,23 +1,40 @@
 from ckan.lib.base import c
 from ckan import model
+from ckan import plugins as p
 from ckan.model import Session, Package
 from ckan.logic import ValidationError, NotFound, get_action
 from ckan.lib.munge import munge_title_to_name
 from ckan.lib.search.index import PackageSearchIndex
+from ckan.lib.navl.dictization_functions import Invalid
+from ckan.lib.navl.validators import ignore_empty
 
 from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
-    HarvestObjectError
+    HarvestObjectError, HarvestObjectExtra
 from ckanext.harvest.harvesters.base import HarvesterBase
 
-import uuid, datetime, hashlib, urllib2, json, yaml
+import uuid, datetime, hashlib, urllib2, json, yaml, os
+
+from jsonschema.validators import Draft4Validator
+from jsonschema import FormatChecker
 
 import logging
 log = logging.getLogger("harvester")
 
+VALIDATION_SCHEMA = [
+    ('', 'Project Open Data (Federal)'),
+    ('non-federal', 'Project Open Data (Non-Federal)'),
+    ]
+
+def validate_schema(schema):
+    if schema not in [s[0] for s in VALIDATION_SCHEMA]:
+        raise Invalid('Unknown validation schema: {0}'.format(schema))
+    return schema
+
 class DatasetHarvesterBase(HarvesterBase):
     '''
     A Harvester for datasets.
     '''
+    _user_name = None
 
     # SUBCLASSES MUST IMPLEMENT
     #HARVESTER_VERSION = "1.0"
@@ -34,13 +51,49 @@ def validate_config(self, config):
         config_obj = yaml.load(config)
         return config
 
+    def load_config(self, harvest_source):
+        # Load the harvest source's configuration data. We expect it to be a YAML
+        # string. Unfortunately I went ahead of CKAN on this. The stock CKAN harvester
+        # only allows JSON in the configuration box. My fork is necessary for this
+        # to work: https://github.com/joshdata/ckanext-harvest
+
+        ret = {
+            "filters": { },  # map data.json field name to list of values one of which must be present
+            "defaults": { },  # map field name to value to supply as default if none exists, handled by the actual importer module, so the field names may be arbitrary
+        }
+
+        source_config = yaml.load(harvest_source.config)
+
+        try:
+            ret["filters"].update(source_config["filters"])
+        except TypeError:
+            pass
+        except KeyError:
+            pass
+
+        try:
+            ret["defaults"].update(source_config["defaults"])
+        except TypeError:
+            pass
+        except KeyError:
+            pass
+
+        return ret
+
+    def _get_user_name(self):
+        if not self._user_name:
+            user = p.toolkit.get_action('get_site_user')({'model': model, 'ignore_auth': True}, {})
+            self._user_name = user['name']
+
+        return self._user_name
+
     def context(self):
         # Reusing the dict across calls to action methods can be dangerous, so
         # create a new dict every time we need it.
         # Setting validate to False is critical for getting the harvester plugin
         # to set extra fields on the package during indexing (see ckanext/harvest/plugin.py
         # line 99, https://github.com/okfn/ckanext-harvest/blob/master/ckanext/harvest/plugin.py#L99).
- return { "user": "harvest", "ignore_auth": True, "validate": False } + return { "user": self._get_user_name(), "ignore_auth": True } # SUBCLASSES MUST IMPLEMENT def load_remote_catalog(self, harvest_job): @@ -49,6 +102,11 @@ def load_remote_catalog(self, harvest_job): # with a locally unique identifier string and a 'title' field. raise Exception("Not implemented") + def extra_schema(self): + return { + 'validator_schema': [ignore_empty, unicode, validate_schema], + } + def gather_stage(self, harvest_job): # The gather stage scans a remote resource (like a /data.json file) for # a list of datasets to import. @@ -56,32 +114,158 @@ def gather_stage(self, harvest_job): log.debug('In %s gather_stage (%s)' % (repr(self), harvest_job.source.url)) # Start gathering. - source = self.load_remote_catalog(harvest_job) - if len(source) == 0: return [] + try: + source_datasets, catalog_values = self.load_remote_catalog(harvest_job) + except ValueError as e: + self._save_gather_error("Error loading json content: %s." % (e), harvest_job) + return [] + + if len(source_datasets) == 0: return [] + + DATAJSON_SCHEMA = { + "https://project-open-data.cio.gov/v1.1/schema": '1.1', + } + + # schema version is default 1.0, or a valid one (1.1, ...) + schema_version = '1.0' + parent_identifiers = set() + child_identifiers = set() + catalog_extras = {} + if isinstance(catalog_values, dict): + schema_value = catalog_values.get('conformsTo', '') + if schema_value not in DATAJSON_SCHEMA.keys(): + self._save_gather_error('Error reading json schema value.' \ + ' The given value is %s.' % ('empty' if schema_value == '' + else schema_value), harvest_job) + return [] + schema_version = DATAJSON_SCHEMA.get(schema_value, '1.0') + + for dataset in source_datasets: + parent_identifier = dataset.get('isPartOf') + if parent_identifier: + parent_identifiers.add(parent_identifier) + child_identifiers.add(dataset.get('identifier')) + + # get a list of needed catalog values and put into hobj + catalog_fields = ['@context', '@id', 'conformsTo', 'describedBy'] + catalog_extras = dict(('catalog_'+k, v) + for (k, v) in catalog_values.iteritems() + if k in catalog_fields) # Loop through the packages we've already imported from this source # and go into their extra fields to get their source_identifier, # which corresponds to the remote catalog's 'identifier' field. # Make a mapping so we know how to update existing records. + # Added: mark all existing parent datasets. existing_datasets = { } + existing_parents = { } for hobj in model.Session.query(HarvestObject).filter_by(source=harvest_job.source, current=True): try: pkg = get_action('package_show')(self.context(), { "id": hobj.package_id }) except: # reference is broken continue - sid = self.find_extra(pkg, "source_identifier") + sid = self.find_extra(pkg, "identifier") + is_parent = self.find_extra(pkg, "collection_metadata") if sid: existing_datasets[sid] = pkg + if is_parent and pkg.get("state") == "active": + existing_parents[sid] = pkg + + # which parents has been demoted to child level? + existing_parents_demoted = set( + identifier for identifier in existing_parents.keys() \ + if identifier not in parent_identifiers) + + # if there is any new parents, we will have to harvest parents + # first, mark the status in harvest_source config, which + # triggers a children harvest_job after parents job is finished. + source = harvest_job.source + source_config = json.loads(source.config or '{}') + # run status: None, or parents_run, or children_run? 
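+        # For illustration: a fresh source with collections first runs as
+        # 'parents_run' (only parent records import), then the follow-up job
+        # runs as 'children_run' so child records can link to their parents'
+        # package ids; the flag is cleared once no new parents remain.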
+ run_status = source_config.get('datajson_collection') + if parent_identifiers: + for parent in parent_identifiers & child_identifiers: + self._save_gather_error("Collection identifier '%s' \ + cannot be isPartOf another collection." \ + % parent, harvest_job) + + new_parents = set(identifier for identifier in parent_identifiers \ + if identifier not in existing_parents.keys()) + if new_parents: + if not run_status: + # fresh start + run_status = 'parents_run' + source_config['datajson_collection'] = run_status + source.config = json.dumps(source_config) + source.save() + elif run_status == 'children_run': + # it means new parents are tried and failed. + # but skip some which have previously reported with + # parent_identifiers & child_identifiers + for parent in new_parents - \ + (parent_identifiers & child_identifiers): + self._save_gather_error("Collection identifier '%s' \ + not found. Records which are part of this \ + collection will not be harvested." \ + % parent, harvest_job) + else: + # run_status was parents_run, and did not finish. + # something wrong but not sure what happened. + # let's leave it as it is, let it run one more time. + pass + else: + # all parents are already in place. run it as usual. + run_status = None + elif run_status: + # need to clear run_status + run_status = None + source_config['datajson_collection'] = run_status + source.config = json.dumps(source_config) + source.save() # Create HarvestObjects for any records in the remote catalog. object_ids = [] seen_datasets = set() + unique_datasets = set() - for dataset in source: + filters = self.load_config(harvest_job.source)["filters"] + + for dataset in source_datasets: # Create a new HarvestObject for this dataset and save the # dataset metdata inside it for later. + + # Check the config's filters to see if we should import this dataset. + # For each filter, check that the value specified in the data.json file + # is among the permitted values in the filter specification. + matched_filters = True + for k, v in filters.items(): + if dataset.get(k) not in v: + matched_filters = False + if not matched_filters: + continue + + if parent_identifiers and new_parents \ + and dataset['identifier'] not in parent_identifiers \ + and dataset.get('isPartOf') in new_parents: + if run_status == 'parents_run': + # skip those whose parents still need to run. + continue + else: + # which is 'children_run'. + # error out since parents got issues. + self._save_gather_error( + "Record with identifier '%s': isPartOf '%s' points to \ + an erroneous record." % (dataset['identifier'], + dataset.get('isPartOf')), harvest_job) + continue + + # Some source contains duplicate identifiers. skip all except the first one + if dataset['identifier'] in unique_datasets: + self._save_gather_error("Duplicate entry ignored for identifier: '%s'." % (dataset['identifier']), harvest_job) + continue + unique_datasets.add(dataset['identifier']) # Get the package_id of this resource if we've already imported # it into our system. Otherwise, assign a brand new GUID to the @@ -96,7 +280,8 @@ def gather_stage(self, harvest_job): # in the package so we can avoid updating datasets that # don't look like they've changed. 
if pkg.get("state") == "active" \ - and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, harvest_job.source): + and dataset['identifier'] not in existing_parents_demoted \ + and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, harvest_job.source, catalog_extras, schema_version): continue else: pkg_id = uuid.uuid4().hex @@ -104,9 +289,22 @@ def gather_stage(self, harvest_job): # Create a new HarvestObject and store in it the GUID of the # existing dataset (if it exists here already) and the dataset's # metadata from the remote catalog file. + extras = [HarvestObjectExtra( + key='schema_version', value=schema_version)] + if dataset['identifier'] in parent_identifiers: + extras.append(HarvestObjectExtra( + key='is_collection', value=True)) + elif dataset.get('isPartOf'): + parent_pkg_id = existing_parents[dataset.get('isPartOf')]['id'] + extras.append(HarvestObjectExtra( + key='collection_pkg_id', value=parent_pkg_id)) + for k, v in catalog_extras.iteritems(): + extras.append(HarvestObjectExtra(key=k, value=v)) + obj = HarvestObject( guid=pkg_id, job=harvest_job, + extras=extras, content=json.dumps(dataset, sort_keys=True)) # use sort_keys to preserve field order so hashes of this string are constant from run to run obj.save() object_ids.append(obj.id) @@ -116,9 +314,14 @@ def gather_stage(self, harvest_job): if upstreamid in seen_datasets: continue # was just updated if pkg.get("state") == "deleted": continue # already deleted pkg["state"] = "deleted" - pkg["name"] = self.make_package_name(pkg["title"], pkg["id"], True) # try to prevent name clash by giving it a "deleted-" name log.warn('deleting package %s (%s) because it is no longer in %s' % (pkg["name"], pkg["id"], harvest_job.source.url)) get_action('package_update')(self.context(), pkg) + obj = HarvestObject( + guid=pkg_id, + job=harvest_job, + ) + obj.save() + object_ids.append(obj.id) return object_ids @@ -128,29 +331,217 @@ def fetch_stage(self, harvest_object): return True # SUBCLASSES MUST IMPLEMENT - def set_dataset_info(self, pkg, dataset, dataset_defaults): + def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version): # Sets package metadata on 'pkg' using the remote catalog's metadata # in 'dataset' and default values as configured in 'dataset_defaults'. raise Exception("Not implemented.") + # validate dataset against POD schema + # use a local copy. + def _validate_dataset(self, validator_schema, schema_version, dataset): + if validator_schema == 'non-federal': + if schema_version == '1.1': + file_path = 'pod_schema/non-federal-v1.1/dataset-non-federal.json' + else: + file_path = 'pod_schema/non-federal/single_entry.json' + else: + if schema_version == '1.1': + file_path = 'pod_schema/federal-v1.1/dataset.json' + else: + file_path = 'pod_schema/single_entry.json' + + with open(os.path.join( + os.path.dirname(__file__), file_path)) as json_file: + schema = json.load(json_file) + + msg = ";" + errors = Draft4Validator(schema, format_checker=FormatChecker()).iter_errors(dataset) + count = 0 + for error in errors: + count += 1 + msg = msg + " ### ERROR #" + str(count) + ": " + self._validate_readable_msg(error) + "; " + msg = msg.strip("; ") + if msg: + id = "Identifier: " + (dataset.get("identifier") if dataset.get("identifier") else "Unknown") + title = "Title: " + (dataset.get("title") if dataset.get("title") else "Unknown") + msg = id + "; " + title + "; " + str(count) + " Error(s) Found. " + msg + "." 
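+        # The assembled message reads like: "Identifier: abc-123; Title: Some
+        # Dataset; 2 Error(s) Found. ### ERROR #1: ...; ### ERROR #2: ..."
+        # (identifier and title here are hypothetical).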
+        return msg
+
+    # Make a jsonschema ValidationError readable.
+    def _validate_readable_msg(self, e):
+        msg = e.message.replace("u'", "'")
+        elem = ""
+        try:
+            if e.schema_path[0] == 'properties':
+                elem = e.schema_path[1]
+                elem = "'" + elem + "':"
+        except:
+            pass
+
+        return elem + msg
+
     def import_stage(self, harvest_object):
         # The import stage actually creates the dataset.
         log.debug('In %s import_stage' % repr(self))
 
-        # Get default values.
-        source_config = yaml.load(harvest_object.source.config)
-        dataset_defaults = None
-        try:
-            dataset_defaults = source_config["defaults"]
-        except TypeError:
-            pass
-        except KeyError:
-            pass
-        if not dataset_defaults: dataset_defaults = { }
-
-        # Get the metadata that we stored in the HarvestObject's content field.
+        if harvest_object.content is None:
+            return True
+
         dataset = json.loads(harvest_object.content)
+        schema_version = '1.0'  # default to '1.0'
+        is_collection = False
+        parent_pkg_id = ''
+        catalog_extras = {}
+        for extra in harvest_object.extras:
+            if extra.key == 'schema_version':
+                schema_version = extra.value
+            if extra.key == 'is_collection' and extra.value:
+                is_collection = True
+            if extra.key == 'collection_pkg_id' and extra.value:
+                parent_pkg_id = extra.value
+            if extra.key.startswith('catalog_'):
+                catalog_extras[extra.key] = extra.value
+
+        # If this dataset is part of a collection, check that the parent
+        # dataset exists. We don't support deeper hierarchies, so the check
+        # does not apply when is_collection is set.
+        if parent_pkg_id and not is_collection:
+            parent_pkg = None
+            try:
+                parent_pkg = get_action('package_show')(self.context(),
+                    { "id": parent_pkg_id })
+            except:
+                pass
+            if not parent_pkg:
+                parent_check_message = "isPartOf identifier '%s' not found." \
+                    % dataset.get('isPartOf')
+                self._save_object_error(parent_check_message, harvest_object,
+                    'Import')
+                return None
+
+        # Get default values.
+        dataset_defaults = self.load_config(harvest_object.source)["defaults"]
+
+        source_config = json.loads(harvest_object.source.config or '{}')
+        validator_schema = source_config.get('validator_schema')
+        if schema_version == '1.0' and validator_schema != 'non-federal':
+            lowercase_conversion = True
+        else:
+            lowercase_conversion = False
+
+        MAPPING = {
+            "title": "title",
+            "description": "notes",
+            "keyword": "tags",
+            "modified": "extras__modified",  # !
revision_timestamp + "publisher": "extras__publisher", # !owner_org + "contactPoint": {"fn":"maintainer", "hasEmail":"maintainer_email"}, + "identifier": "extras__identifier", # !id + "accessLevel": "extras__accessLevel", + + "bureauCode": "extras__bureauCode", + "programCode": "extras__programCode", + "rights": "extras__rights", + "license": "extras__license", # !license_id + "spatial": "extras__spatial", # Geometry not valid GeoJSON, not indexing + "temporal": "extras__temporal", + + "theme": "extras__theme", + "dataDictionary": "extras__dataDictionary", # !data_dict + "dataQuality": "extras__dataQuality", + "accrualPeriodicity":"extras__accrualPeriodicity", + "landingPage": "extras__landingPage", + "language": "extras__language", + "primaryITInvestmentUII": "extras__primaryITInvestmentUII", # !PrimaryITInvestmentUII + "references": "extras__references", + "issued": "extras__issued", + "systemOfRecords": "extras__systemOfRecords", + + "distribution": None, + } + + SKIP = ["accessURL", "webService", "format", "distribution"] # will go into pkg["resources"] + # also skip the processed_how key, it was added to indicate how we processed the dataset. + SKIP.append("processed_how"); + + SKIP_V1_1 = ["@type", "isPartOf", "distribution"] + SKIP_V1_1.append("processed_how"); + + if lowercase_conversion: + + mapping_processed = {} + for k,v in MAPPING.items(): + mapping_processed[k.lower()] = v + + skip_processed = [k.lower() for k in SKIP] + + dataset_processed = {'processed_how': ['lowercase']} + for k,v in dataset.items(): + if k.lower() in mapping_processed.keys(): + dataset_processed[k.lower()] = v + else: + dataset_processed[k] = v + + if 'distribution' in dataset and dataset['distribution'] is not None: + dataset_processed['distribution'] = [] + for d in dataset['distribution']: + d_lower = {} + for k,v in d.items(): + if k.lower() in mapping_processed.keys(): + d_lower[k.lower()] = v + else: + d_lower[k] = v + dataset_processed['distribution'].append(d_lower) + else: + dataset_processed = dataset + mapping_processed = MAPPING + skip_processed = SKIP + + if schema_version == '1.1': + mapping_processed = MAPPING_V1_1 + skip_processed = SKIP_V1_1 + + validate_message = self._validate_dataset(validator_schema, + schema_version, dataset_processed) + if validate_message: + self._save_object_error(validate_message, harvest_object, 'Import') + return None # We need to get the owner organization (if any) from the harvest # source dataset @@ -158,40 +549,111 @@ def import_stage(self, harvest_object): source_dataset = model.Package.get(harvest_object.source.id) if source_dataset.owner_org: owner_org = source_dataset.owner_org - + + + source_config = json.loads(harvest_object.source.config or '{}') + group_name = source_config.get('default_groups', '') + # Assemble basic information about the dataset. 
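+        # The source_hash extra recorded below is what gather_stage compares
+        # against on later runs to skip datasets that have not changed.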
+ pkg = { - "name": self.make_package_name(dataset["title"], harvest_object.guid, False), "state": "active", # in case was previously deleted "owner_org": owner_org, - "extras": [{ - "key": "source_url", - "value": harvest_object.source.url, + "groups": [{"name": group_name}], + "resources": [], + "extras": [ + { + "key": "resource-type", + "value": "Dataset", }, { - "key": "source_title", - "value": harvest_object.source.title, + "key": "source_hash", + "value": self.make_upstream_content_hash(dataset, harvest_object.source, catalog_extras, schema_version), }, { - "key": "source_identifier", - "value": dataset["identifier"], + "key": "source_datajson_identifier", + "value": True, }, { - "key": "source_hash", - "value": self.make_upstream_content_hash(dataset, harvest_object.source), + "key": "harvest_source_id", + "value": harvest_object.harvest_source_id, }, { - "key": "harvest_harvester_version", - "value": self.HARVESTER_VERSION, + "key": "harvest_object_id", + "value": harvest_object.id, }, { - "key": "harvest_last_updated", - "value": datetime.datetime.utcnow().isoformat(), - }] + "key": "harvest_source_title", + "value": harvest_object.source.title, + }, + { + "key": "source_schema_version", + "value": schema_version, + }, + ] } - + + extras = pkg["extras"] + unmapped = [] + + for key, value in dataset_processed.iteritems(): + if key in skip_processed: + continue + new_key = mapping_processed.get(key) + if not new_key: + unmapped.append(key) + continue + + # after schema 1.0+, we need to deal with multiple new_keys + new_keys = [] + values = [] + if isinstance(new_key, dict): # when schema is not 1.0 + _new_key_keys = new_key.keys() + new_keys = new_key.values() + values = [] + for _key in _new_key_keys: + values.append(value.get(_key)) + else: + new_keys.append(new_key) + values.append(value) + + if not any(item for item in values): + continue + + mini_dataset = dict(zip(new_keys, values)) + for mini_key, mini_value in mini_dataset.iteritems(): + if not mini_value: + continue + if mini_key.startswith('extras__'): + extras.append({"key": mini_key[8:], "value": mini_value}) + else: + pkg[mini_key] = mini_value + + # pick a fix number of unmapped entries and put into extra + if unmapped: + unmapped.sort() + del unmapped[100:] + for key in unmapped: + value = dataset_processed.get(key, "") + if value is not None: extras.append({"key": key, "value": value}) + + # if theme is geospatial/Geospatial, we tag it in metadata_type. + themes = self.find_extra(pkg, "theme") + if themes and ('geospatial' in [x.lower() for x in themes]): + extras.append({'key':'metadata_type', 'value':'geospatial'}) + + if is_collection: + extras.append({'key':'collection_metadata', 'value':'true'}) + elif parent_pkg_id: + extras.append( + {'key':'collection_package_id', 'value':parent_pkg_id} + ) + + for k, v in catalog_extras.iteritems(): + extras.append({'key':k, 'value':v}) + # Set specific information about the dataset. - self.set_dataset_info(pkg, dataset, dataset_defaults) + self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version) # Try to update an existing package with the ID set in harvest_object.guid. If that GUID # corresponds with an existing package, get its current metadata. 
@@ -209,7 +671,7 @@ def import_stage(self, harvest_object):
             for existing_res in existing_pkg.get("resources", []):
                 if res["url"] == existing_res["url"]:
                     res["id"] = existing_res["id"]
-
+            pkg['groups'] = existing_pkg['groups']
             existing_pkg.update(pkg) # preserve other fields that we're not setting, but clobber extras
             pkg = existing_pkg
 
@@ -217,6 +679,7 @@ def import_stage(self, harvest_object):
             pkg = get_action('package_update')(self.context(), pkg)
         else:
             # It doesn't exist yet. Create a new one.
+            pkg['name'] = self.make_package_name(dataset_processed["title"], harvest_object.guid)
             try:
                 pkg = get_action('package_create')(self.context(), pkg)
                 log.warn('created package %s (%s) from %s' % (pkg["name"], pkg["id"], harvest_object.source.url))
@@ -243,9 +706,16 @@ def import_stage(self, harvest_object):
 
         return True
 
-    def make_upstream_content_hash(self, datasetdict, harvest_source):
-        return hashlib.sha1(json.dumps(datasetdict, sort_keys=True)
-            + "|" + harvest_source.config + "|" + self.HARVESTER_VERSION).hexdigest()
+    def make_upstream_content_hash(self, datasetdict, harvest_source,
+                                   catalog_extras, schema_version='1.0'):
+        if schema_version == '1.0':
+            return hashlib.sha1(json.dumps(datasetdict, sort_keys=True)
+                                + "|" + harvest_source.config + "|"
+                                + self.HARVESTER_VERSION).hexdigest()
+        else:
+            return hashlib.sha1(json.dumps(datasetdict, sort_keys=True)
+                                + "|" + json.dumps(catalog_extras,
+                                                   sort_keys=True)).hexdigest()
 
     def find_extra(self, pkg, key):
         for extra in pkg["extras"]:
@@ -253,7 +723,7 @@ def import_stage(self, harvest_object):
             return extra["value"]
         return None
 
-    def make_package_name(self, title, exclude_existing_package, for_deletion):
+    def make_package_name(self, title, exclude_existing_package):
         '''
         Creates a URL friendly name from a title
 
@@ -261,13 +731,29 @@ def import_stage(self, harvest_object):
         '''
         name = munge_title_to_name(title).replace('_', '-')
-        if for_deletion: name = "deleted-" + name
         while '--' in name:
             name = name.replace('--', '-')
         name = name[0:90] # max length is 100
+
+        # Is this slug already in use (and if we're updating a package, is it in
+        # use by a different package?).
         pkg_obj = Session.query(Package).filter(Package.name == name).filter(Package.id != exclude_existing_package).first()
-        if pkg_obj:
-            return name + "-" + str(uuid.uuid4())[:5]
-        else:
+        if not pkg_obj:
+            # The name is available, so use it. Note that if we're updating an
+            # existing package we will be updating this package's URL, so incoming
+            # links may break.
            return name
-
+
+        if exclude_existing_package:
+            # The name is not available, and we're updating a package. Chances
+            # are the package's name already had some random string attached
+            # to it last time. Prevent spurious updates to the package's URL
+            # (choosing new random text) by just reusing the existing package's
+            # name.
+            pkg_obj = Session.query(Package).filter(Package.id == exclude_existing_package).first()
+            if pkg_obj: # the package may not exist yet because we may be passed the desired package GUID before a new package is instantiated
+                return pkg_obj.name
+
+        # Append some random text to the URL. Hope that with five characters
+        # there will be no collision.
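+        # e.g. "fiscal-year-2015-data" -> "fiscal-year-2015-data-3f9a1"
+        # (suffix hypothetical; it is the first five chars of a uuid4 string).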
+ return name + "-" + str(uuid.uuid4())[:5] diff --git a/ckanext/datajson/harvester_cmsdatanavigator.py b/ckanext/datajson/harvester_cmsdatanavigator.py index 8ae0bef6..f364683d 100644 --- a/ckanext/datajson/harvester_cmsdatanavigator.py +++ b/ckanext/datajson/harvester_cmsdatanavigator.py @@ -7,7 +7,7 @@ class CmsDataNavigatorHarvester(DatasetHarvesterBase): A Harvester for the CMS Data Navigator catalog. ''' - HARVESTER_VERSION = "0.9aj" # increment to force an update even if nothing has changed + HARVESTER_VERSION = "0.9al" # increment to force an update even if nothing has changed def info(self): return { @@ -27,23 +27,26 @@ def set_dataset_info(self, package, dataset, dataset_defaults): extra(package, "Agency", "Department of Health & Human Services") package["author"] = "Centers for Medicare & Medicaid Services" extra(package, "author_id", "http://healthdata.gov/id/agency/cms") + extra(package, "Bureau Code", "009:38") package["title"] = dataset["Name"].strip() package["notes"] = dataset.get("Description") package["url"] = dataset.get("Address") - extra(package, "Date Released", parsedate(dataset["HealthData"].get("DateReleased"))) - extra(package, "Date Updated", parsedate(dataset["HealthData"].get("DateUpdated"))) - extra(package, "Agency Program URL", dataset["HealthData"].get("AgencyProgramURL")) + + dataset_hd = dataset["HealthData"] + extra(package, "Date Released", parsedate(dataset_hd.get("DateReleased"))) + extra(package, "Date Updated", parsedate(dataset_hd.get("DateUpdated"))) + extra(package, "Agency Program URL", dataset_hd.get("AgencyProgramURL")) extra(package, "Subject Area 1", "Medicare") - extra(package, "Unit of Analysis", dataset["HealthData"].get("UnitOfAnalysis")) - extra(package, "Data Dictionary", dataset["HealthData"].get("DataDictionaryURL")) - extra(package, "Coverage Period", dataset["HealthData"].get("Coverage Period")) - extra(package, "Collection Frequency", dataset["HealthData"].get("Collection Frequency")) - extra(package, "Geographic Scope", dataset["HealthData"].get("GeographicScope")) - #extra(package, "Contact Person", dataset["HealthData"].get("ContactName")) # not in HHS schema - #extra(package, "Contact Email", dataset["HealthData"].get("ContactEmail")) # not in HHS schema - extra(package, "License Agreement", dataset["HealthData"].get("DataLicenseAgreementURL")) - + extra(package, "Unit of Analysis", dataset_hd.get("UnitOfAnalysis")) + extra(package, "Data Dictionary", dataset_hd.get("DataDictionaryURL")) + extra(package, "Coverage Period", dataset_hd.get("Coverage Period")) + extra(package, "Collection Frequency", dataset_hd.get("Collection Frequency")) + extra(package, "Geographic Scope", dataset_hd.get("GeographicScope")) + extra(package, "Contact Name", dataset_hd.get("GenericContactName", None) or dataset_hd.get("ContactName")) # 'X or Y' syntax returns Y if X is either None or the empty string + extra(package, "Contact Email", dataset_hd.get("GenericContactEmail", None) or dataset_hd.get("ContactEmail")) + extra(package, "License Agreement", dataset_hd.get("DataLicenseAgreementURL")) + from ckan.lib.munge import munge_title_to_name package["tags"] = [ { "name": munge_title_to_name(t["Name"]) } for t in dataset.get("Keywords", [])] diff --git a/ckanext/datajson/harvester_datajson.py b/ckanext/datajson/harvester_datajson.py index c72b3c90..67891c01 100644 --- a/ckanext/datajson/harvester_datajson.py +++ b/ckanext/datajson/harvester_datajson.py @@ -1,4 +1,6 @@ from ckanext.datajson.harvester_base import DatasetHarvesterBase +from 
parse_datajson import parse_datajson_entry + import urllib2, json @@ -7,7 +9,7 @@ class DataJsonHarvester(DatasetHarvesterBase): A Harvester for /data.json files. ''' - HARVESTER_VERSION = "0.9aj" # increment to force an update even if nothing has changed + HARVESTER_VERSION = "0.9al" # increment to force an update even if nothing has changed def info(self): return { @@ -17,10 +19,47 @@ def info(self): } def load_remote_catalog(self, harvest_job): - return json.load(urllib2.urlopen(harvest_job.source.url)) + req = urllib2.Request(harvest_job.source.url) + # todo: into config and across harvester + req.add_header('User-agent', 'Data.gov/2.0') + try: + datasets = json.load(urllib2.urlopen(req)) + except UnicodeDecodeError: + # try different encode + try: + datasets = json.load(urllib2.urlopen(req), 'cp1252') + except: + datasets = json.load(urllib2.urlopen(req), 'iso-8859-1') + except: + # remove BOM + datasets = json.loads(lstrip_bom(urllib2.urlopen(req).read())) + + # The first dataset should be for the data.json file itself. Check that + # it is, and if so rewrite the dataset's title because Socrata exports + # these items all with the same generic name that is confusing when + # harvesting a bunch from different sources. It should have an accessURL + # but Socrata fills the URL of these in under webService. + if isinstance(datasets, list) and len(datasets) > 0 and (datasets[0].get("accessURL") == harvest_job.source.url + or datasets[0].get("webService") == harvest_job.source.url) and \ + datasets[0].get("title") == "Project Open Data, /data.json file": + datasets[0]["title"] = "%s Project Open Data data.json File" % harvest_job.source.title + + catalog_values = None + if isinstance(datasets, dict): + # this is a catalog, not dataset array as in schema 1.0. + catalog_values = datasets.copy() + datasets = catalog_values.pop("dataset", []) + + return (datasets, catalog_values) - def set_dataset_info(self, pkg, dataset, dataset_defaults): - from parse_datajson import parse_datajson_entry - parse_datajson_entry(dataset, pkg, dataset_defaults) - + def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version): + parse_datajson_entry(dataset, pkg, dataset_defaults, schema_version) +# helper function to remove BOM +def lstrip_bom(str_): + from codecs import BOM_UTF8 + bom = BOM_UTF8 + if str_.startswith(bom): + return str_[len(bom):] + else: + return str_ diff --git a/ckanext/datajson/parse_datajson.py b/ckanext/datajson/parse_datajson.py index e624096c..493cd6de 100644 --- a/ckanext/datajson/parse_datajson.py +++ b/ckanext/datajson/parse_datajson.py @@ -1,81 +1,150 @@ +from ckan.lib.munge import munge_title_to_name + import re -def parse_datajson_entry(datajson, package, defaults): - package["title"] = datajson.get("title", defaults.get("Title")) - package["notes"] = datajson.get("description", defaults.get("Notes")) - package["tags"] = [ { "name": t } for t in - datajson.get("keyword", defaults.get("Tags", "")).split(",") if t.strip() != ""] - package["groups"] = [ { "name": g } for g in - defaults.get("Groups", [])] # the complexity of permissions makes this useless, CKAN seems to ignore - package["organization"] = datajson.get("organization", defaults.get("Organization")) - extra(package, "Group Name", defaults.get("Group Name")) # i.e. dataset grouping string - extra(package, "Date Updated", datajson.get("modified")) - extra(package, "Agency", defaults.get("Agency")) # i.e. federal department - package["publisher"] = datajson.get("publisher", defaults.get("Author")) # i.e. 
agency within HHS - extra(package, "author_id", defaults.get("author_id")) # i.e. URI for agency - extra(package, "Agency Program URL", defaults.get("Agency Program URL")) # i.e. URL for agency program - extra(package, "Contact Person", datajson.get("person")) # not in HHS schema - extra(package, "Contact Email", datajson.get("mbox")) # not in HHS schema - # "identifier" is handled by the harvester - extra(package, "Access Level", datajson.get("accessLevel")) # not in HHS schema - extra(package, "Data Dictionary", datajson.get("dataDictionary", defaults.get("Data Dictionary"))) - # accessURL is redundant with resources - # webService is redundant with resources - extra(package, "Format", datajson.get("format")) # not in HHS schema - extra(package, "License Agreement", datajson.get("license")) - #extra(package, "License Agreement Required", ...) - extra(package, "Geographic Scope", datajson.get("spatial")) - extra(package, "Temporal", datajson.get("temporal")) # HHS uses Coverage Period (FY) Start/End - extra(package, "Date Released", datajson.get("issued")) - #extra(package, "Collection Frequency", ...) - extra(package, "Publish Frequency", datajson.get("accrualPeriodicity")) # not in HHS schema - extra(package, "Language", datajson.get("language")) # not in HHS schema - extra(package, "Granularity", datajson.get("granularity")) # not in HHS schema - extra(package, "Data Quality Met", datajson.get("dataQuality")) # not in HHS schema - #extra(package, "Unit of Analysis", ...) - #extra(package, "Collection Instrument", ...) - extra(package, "Subject Area 1", datajson.get("theme", defaults.get("Subject Area 1"))) - extra(package, "Subject Area 2", defaults.get("Subject Area 2")) - extra(package, "Subject Area 2", defaults.get("Subject Area 3")) - extra(package, "Technical Documentation", datajson.get("references")) - extra(package, "Size", datajson.get("size")) # not in HHS schema - package["url"] = datajson.get("landingPage", datajson.get("webService", datajson.get("accessURL"))) - extra(package, "Feed", datajson.get("feed")) # not in HHS schema - extra(package, "System Of Records", datajson.get("systemOfRecords")) # not in HHS schema - package["resources"] = [ ] - for d in datajson.get("distribution", []): - for k in ("accessURL", "webService"): - if d.get(k, "").strip() != "": - r = { - "url": d[k], - "format": normalize_format(d.get("format", "Query Tool" if k == "webService" else "Unknown")), - } - extra(r, "Language", d.get("language")) - extra(r, "Size", d.get("size")) - - # work-around for Socrata-style formats array - try: - r["format"] = normalize_format(d["formats"][0]["label"]) - except: - pass - - r["name"] = r["format"] - - package["resources"].append(r) - +def parse_datajson_entry(datajson, package, defaults, schema_version): + # four fields need extra handling, which are + # 1.tag, 2.license, 3.maintainer_email, 4.publisher_hierarchy, + # 5.resources + + # 1. package["tags"] + package["tags"] = [ { "name": munge_title_to_name(t) } for t in + package.get("tags", "") if t.strip() != ""] + + # 2. 
package["license"]
+    licenses = {
+        'Creative Commons Attribution': 'cc-by',
+        'Creative Commons Attribution Share-Alike': 'cc-by-sa',
+        'Creative Commons CCZero': 'cc-zero',
+        'Creative Commons Non-Commercial (Any)': 'cc-nc',
+        'GNU Free Documentation License': 'gfdl',
+        'License Not Specified': 'notspecified',
+        'Open Data Commons Attribution License': 'odc-by',
+        'Open Data Commons Open Database License (ODbL)': 'odc-odbl',
+        'Open Data Commons Public Domain Dedication and License (PDDL)': 'odc-pddl',
+        'Other (Attribution)': 'other-at',
+        'Other (Non-Commercial)': 'other-nc',
+        'Other (Not Open)': 'other-closed',
+        'Other (Open)': 'other-open',
+        'Other (Public Domain)': 'other-pd',
+        'UK Open Government Licence (OGL)': 'uk-ogl',
+    }
+
+    if not datajson.get("license", ""):
+        package["license_id"] = licenses.get("License Not Specified", "")
+    elif licenses.get(datajson.get("license", ""), ""):
+        package["license_id"] = licenses.get(datajson.get("license", ""), "")
+
+    # 3. package["maintainer_email"]
+    if package.get("maintainer_email"):
+        package["maintainer_email"] = \
+            package.get("maintainer_email").replace("mailto:", "", 1)
+
+    # 4. extras-publisher and extras-publisher_hierarchy
+    if schema_version == '1.1':
+        publisher = find_extra(package, "publisher", {})
+        publisher_name = publisher.get("name", "")
+        set_extra(package, "publisher", publisher_name)
+        parent_publisher = publisher.get("subOrganizationOf", {})
+        publisher_hierarchy = []
+        while parent_publisher:
+            parent_name = parent_publisher.get("name", "")
+            parent_publisher = parent_publisher.get("subOrganizationOf", {})
+            publisher_hierarchy.append(parent_name)
+        if publisher_hierarchy:
+            publisher_hierarchy.reverse()
+            publisher_hierarchy.append(publisher_name)
+            publisher_hierarchy = " > ".join(publisher_hierarchy)
+            set_extra(package, "publisher_hierarchy", publisher_hierarchy)
+
+    # 5. package["resources"]
+    # If distribution is empty, assemble it from the root-level accessURL and
+    # format; but first guard against an ill-formatted distribution value.
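+    # e.g. a bare {"accessURL": ..., "format": ...} dict is wrapped in a
+    # one-element list; any other non-list value is treated as empty.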
+    distribution = datajson.get("distribution", [])
+    if isinstance(distribution, dict): distribution = [distribution]
+    if not isinstance(distribution, list): distribution = []
+
+    downloadurl_key = "downloadURL"
+    accessurl_key = "accessURL"
+    webservice_key = "webService"
+    if datajson.get("processed_how", []) and "lowercase" in datajson.get("processed_how", []):
+        accessurl_key = accessurl_key.lower()
+        webservice_key = webservice_key.lower()
+
+    if not distribution:
+        for url in (accessurl_key, webservice_key):
+            if datajson.get(url, "") and datajson.get(url, "").strip():
+                d = {
+                    url: datajson.get(url, ""),
+                    "format": datajson.get("format", ""),
+                    "mimetype": datajson.get("format", ""),
+                }
+                distribution.append(d)
+
+    datajson["distribution"] = distribution
+
+    for d in datajson.get("distribution", []):
+        downloadurl_value = d.get(downloadurl_key, "").strip()
+        accessurl_value = d.get(accessurl_key, "").strip()
+        webservice_value = d.get(webservice_key, "").strip()
+
+        which_value = (accessurl_value or webservice_value) if schema_version == '1.0' else (downloadurl_value or accessurl_value)
+
+        if which_value:
+            r = {}
+            r['url'] = which_value
+            r['format'] = d.get("format", "") if schema_version == '1.0' else d.get("format", d.get("mediaType", ""))
+            r['mimetype'] = d.get("format", "") if schema_version == '1.0' else d.get("mediaType", "")
+            r['description'] = d.get('description', '')
+            r['name'] = d.get('title', '')
+
+            # after schema 1.1+, we have some extra fields for resources
+            resource_extras = ['conformsTo', 'describedBy', 'describedByType']
+            for resource_extra_key in resource_extras:
+                resource_extra_value = d.get(resource_extra_key)
+                if resource_extra_value:
+                    r[resource_extra_key] = resource_extra_value
+
+            # after schema 1.1+, include accessURL if it is left over
+            if downloadurl_value and accessurl_value:
+                r['accessURL'] = accessurl_value
+
+            package["resources"].append(r)
+
 def extra(package, key, value):
-    if not value: return
-    package.setdefault("extras", []).append({ "key": key, "value": value })
-
-def normalize_format(format):
-    # Format should be a file extension. But sometimes Socrata outputs a MIME type.
-    format = format.lower()
-    m = re.match(r"((application|text)/(\S+))(; charset=.*)?", format)
-    if m:
-        if m.group(1) == "text/plain": return "Text"
-        if m.group(1) == "application/zip": return "ZIP"
-        if m.group(1) == "application/vnd.ms-excel": return "XLS"
-        if m.group(1) == "application/x-msaccess": return "Access"
-        return "Other"
-    if format == "text": return "Text"
-    return format.upper() # hope it's one of our formats by converting to uppercase
+    if not value: return
+    package.setdefault("extras", []).append({ "key": key, "value": value })
+
+def find_extra(pkg, key, default):
+    for extra in pkg["extras"]:
+        if extra["key"] == key:
+            ret = extra["value"]
+            break
+    else:
+        ret = default
+
+    return ret
+
+def set_extra(pkg, key, value):
+    for extra in pkg["extras"]:
+        if extra["key"] == key:
+            extra["value"] = value
+            break
+    else:
+        pkg["extras"].append({"key":key, "value":value})
+
+def normalize_format(format, raise_on_unknown=False):
+    if format is None: return
+    # Format should be a file extension. But sometimes Socrata outputs a MIME type.
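+    # Illustrative mappings of the logic below (hypothetical inputs, with the
+    # default raise_on_unknown=False):
+    #   "text/plain"      -> "Text"
+    #   "application/zip" -> "ZIP"
+    #   "application/pdf" -> "Other"  (MIME-shaped but not specially cased)
+    #   "csv"             -> "CSV"    (bare extension is upper-cased)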
+    format = format.lower()
+    m = re.match(r"((application|text)/(\S+))(; charset=.*)?", format)
+    if m:
+        if m.group(1) == "text/plain": return "Text"
+        if m.group(1) == "application/zip": return "ZIP"
+        if m.group(1) == "application/vnd.ms-excel": return "XLS"
+        if m.group(1) == "application/x-msaccess": return "Access"
+        if raise_on_unknown: raise ValueError() # caught & ignored by caller
+        return "Other"
+    if format == "text": return "Text"
+    if raise_on_unknown and "?" in format: raise ValueError() # weird value we should try to filter out; exception is caught & ignored by caller
+    return format.upper() # hope it's one of our formats by converting to uppercase
diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py
index 1fd95b3f..b86a8ebf 100644
--- a/ckanext/datajson/plugin.py
+++ b/ckanext/datajson/plugin.py
@@ -29,6 +29,154 @@ class DataJsonPlugin(p.SingletonPlugin):
     p.implements(p.interfaces.IConfigurer)
     p.implements(p.interfaces.IRoutes, inherit=True)
+    p.implements(p.interfaces.IFacets)
+
+    # IConfigurer
+
+    def update_config(self, config):
+        # Must use IConfigurer rather than IConfigurable because only IConfigurer
+        # is called before after_map, in which we need the configuration directives
+        # to know how to set the paths.
+        DataJsonPlugin.route_path = config.get("ckanext.datajson.path", "/data.json")
+        DataJsonPlugin.route_ld_path = config.get("ckanext.datajsonld.path",
+            re.sub(r"\.json$", ".jsonld", DataJsonPlugin.route_path))
+        DataJsonPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url"))
+        DataJsonPlugin.ld_title = config.get("ckan.site_title", "Catalog")
+        DataJsonPlugin.site_url = config.get("ckan.site_url")
+
+        # Adds our local templates directory. It's smart. It knows it's
+        # relative to the path of *this* file. Wow.
+        p.toolkit.add_template_directory(config, "templates")
+
+    # IRoutes
+
+    def before_map(self, m):
+        return m
+
+    def after_map(self, m):
+        # /data.json and /data.jsonld (or other path as configured by user)
+        m.connect('datajson', DataJsonPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController',
+                  action='generate_json')
+        m.connect('datajsonld', DataJsonPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController',
+                  action='generate_jsonld')
+
+        # /pod/validate
+        m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:DataJsonController',
+                  action='validator')
+
+        # /pod/data-catalog
+        m.connect('datajsonhtml', "/pod/data-catalog", controller='ckanext.datajson.plugin:DataJsonController',
+                  action='show_html_rendition')
+
+        return m
+
+    # IFacets
+
+    def dataset_facets(self, facets, package_type):
+        # Add any facets specified in build_datajson.get_facet_fields() to the top
+        # of the facet list, and then put the CKAN default facets below that.
+        f = OrderedDict()
+        f.update(get_facet_fields())
+        f.update(facets)
+        return f
+
+    def group_facets(self, facets_dict, group_type, package_type):
+        return facets_dict
+
+    def organization_facets(self, facets_dict, organization_type, package_type):
+        return facets_dict
+
+
+class DataJsonController(BaseController):
+    def generate_output(self, format):
+        # set content type (charset required or pylons throws an error)
+        response.content_type = 'application/json; charset=UTF-8'
+
+        # allow caching of response (e.g. by Apache)
+        del response.headers["Cache-Control"]
+        del response.headers["Pragma"]
+
+        # output
+        data = self.make_json()
+
+        if format == 'json-ld':
+            # Convert this to JSON-LD.
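+            # Rough shape of the catalog envelope assembled below (sketch):
+            #   { "@context": { ...prefix map... },
+            #     "@id": <catalog id>, "@type": "dcat:Catalog",
+            #     "dcterms:title": ..., "dcat:dataset": [ ...datasets... ] }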
+            data = OrderedDict([
+                ("@context", OrderedDict([
+                    ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
+                    ("dcterms", "http://purl.org/dc/terms/"),
+                    ("dcat", "http://www.w3.org/ns/dcat#"),
+                    ("foaf", "http://xmlns.com/foaf/0.1/"),
+                    ("pod", "http://project-open-data.github.io/schema/2013-09-20_1.0#"),
+                ])
+                ),
+                ("@id", DataJsonPlugin.ld_id),
+                ("@type", "dcat:Catalog"),
+                ("dcterms:title", DataJsonPlugin.ld_title),
+                ("rdfs:label", DataJsonPlugin.ld_title),
+                ("foaf:homepage", DataJsonPlugin.site_url),
+                ("dcat:dataset", [dataset_to_jsonld(d) for d in data]),
+            ])
+
+        return p.toolkit.literal(json.dumps(data, indent=2))
+
+    def make_json(self):
+        # Build the data.json file.
+        packages = p.toolkit.get_action("current_package_list_with_resources")(None, {})
+        return [make_datajson_entry(pkg) for pkg in packages if pkg["type"] == "dataset"]
+
+    def generate_json(self):
+        return self.generate_output('json')
+
+    def generate_jsonld(self):
+        return self.generate_output('json-ld')
+
+    def validator(self):
+        # Validates that a URL is a good data.json file.
+        if request.method == "POST" and "url" in request.POST and request.POST["url"].strip() != "":
+            c.source_url = request.POST["url"]
+            c.errors = []
+
+            import urllib, json
+            from datajsonvalidator import do_validation
+
+            body = None
+            try:
+                body = json.load(urllib.urlopen(c.source_url))
+            except IOError as e:
+                c.errors.append(("Error Loading File", ["The address could not be loaded: " + unicode(e)]))
+            except ValueError as e:
+                c.errors.append(("Invalid JSON", ["The file does not meet basic JSON syntax requirements: " + unicode(
+                    e) + ". Try using JSONLint.com."]))
+            except Exception as e:
+                c.errors.append((
+                    "Internal Error", ["Something bad happened while trying to load and parse the file: " + unicode(e)]))
+
+            if body:
+                try:
+                    do_validation(body, c.source_url, c.errors)
+                except Exception as e:
+                    c.errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
+                if len(c.errors) == 0:
+                    c.errors.append(("No Errors", ["Great job!"]))
+
+        return render('datajsonvalidator.html')
+
+    def show_html_rendition(self):
+        # Shows an HTML rendition of the data.json file. Requests the file live
+        # from http://localhost/data.json.
+
+        import urllib, json
+
+        try:
+            c.catalog_data = json.load(urllib.urlopen("http://localhost/data.json"))
+        except:
+            c.catalog_data = []
+
+        c.catalog_data.sort(key=lambda x: x.get("modified"), reverse=True)
+
+        return render('html_rendition.html')
+
 class JsonExportPlugin(p.SingletonPlugin):
diff --git a/ckanext/datajson/templates/html_rendition.html b/ckanext/datajson/templates/html_rendition.html
new file mode 100644
index 00000000..96ff1ee7
--- /dev/null
+++ b/ckanext/datajson/templates/html_rendition.html
@@ -0,0 +1,43 @@
+{% extends "page.html" %}
+
+{% block subtitle %}Data Catalog (HTML Table Rendition){% endblock %}
+
+{% block breadcrumb_content %}
+{% endblock %}
+
+{% block primary %}
+
+<div class="module">
+  <div class="module-content">
+
+    <h1>Data Catalog</h1>
+
+    <p>Welcome to the {{g.site_title}}. There are several ways you may view & download the data catalog:</p>
+
+    <table class="table">
+      {% for item in c.catalog_data %}
+      <tr>
+        <td>
+          <h3>{{item.title}}</h3>
+          <p>{{item.description}}</p>
+          <p>
+            {% if item.accessURL %}<a href="{{item.accessURL}}">{{item.accessURL}}</a>
+            {% endif %}
+            Last Modified: {% if item.modified %}{{item.modified}}{% else %}unknown{% endif %}
+          </p>
+        </td>
+      </tr>
+      {% endfor %}
+    </table>
+
+  </div>
+</div>
+{% endblock %} + +{% block secondary %}{% endblock %} From 8a915a4124be1c698b93fcd300f5f6d5b927b997 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 13 Mar 2015 13:06:33 -0400 Subject: [PATCH 08/22] Schema updated to latest POD changes / master branch --- ckanext/datajson/parse_datajson.py | 2 +- .../pod_schema/federal-v1.1/dataset.json | 163 +++- .../non-federal-v1.1/dataset-non-federal.json | 12 +- .../pod_schema/non-federal/single_entry.json | 844 ++++++++--------- ckanext/datajson/pod_schema/single_entry.json | 855 +++++++++--------- 5 files changed, 1032 insertions(+), 844 deletions(-) diff --git a/ckanext/datajson/parse_datajson.py b/ckanext/datajson/parse_datajson.py index 493cd6de..63c6f5f3 100644 --- a/ckanext/datajson/parse_datajson.py +++ b/ckanext/datajson/parse_datajson.py @@ -147,4 +147,4 @@ def normalize_format(format, raise_on_unknown=False): return "Other" if format == "text": return "Text" if raise_on_unknown and "?" in format: raise ValueError() # weird value we should try to filter out; exception is caught & ignored by caller - return format.upper() # hope it's one of our formats by converting to upprecase + return format.upper() # hope it's one of our formats by converting to upprecase \ No newline at end of file diff --git a/ckanext/datajson/pod_schema/federal-v1.1/dataset.json b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json index b9037fb8..21b09dbe 100644 --- a/ckanext/datajson/pod_schema/federal-v1.1/dataset.json +++ b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json @@ -62,6 +62,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -89,6 +93,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -102,6 +110,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -115,6 +127,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -127,6 +143,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -143,13 +163,25 @@ { "type": "array", "items": { - "$ref": "#/definitions/distribution", - "minItems": 1, - "uniqueItems": true + "anyOf": [ + { + "$ref": "#/definitions/distribution", + "minItems": 1, + "uniqueItems": true + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] } }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -169,18 +201,30 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "keyword": { "title": "Tags", "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "landingPage": { "title": "Homepage URL", @@ -192,6 +236,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -208,6 +256,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -221,6 +273,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ 
-239,6 +295,10 @@ { "type": "string", "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -252,6 +312,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -276,14 +340,26 @@ { "type": "array", "items": { - "type": "string", - "format": "uri" + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "minItems": 1, "uniqueItems": true }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -331,6 +407,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -341,6 +421,9 @@ { "type": "string", "minLength": 1 + }, + { + "type": "null" } ] }, @@ -359,13 +442,17 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "title": { "title": "Title", "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string", + "type": "string", "minLength": 1 } }, @@ -387,18 +474,26 @@ "enum": [ "vcard:Contact" ] - }, + }, "fn": { "title": "Contact Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "hasEmail": { "title": "Email", "description": "Email address for the contact", - "pattern": "^mailto:[\\w.-]+@[\\w.-]+\\.[\\w.-]+?$", - "type": "string" + "anyOf": [ + { + "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$", + "type": "string" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] } } }, @@ -432,8 +527,16 @@ "downloadURL": { "title": "Download URL", "description": "URL providing direct access to a downloadable file of a dataset", - "type": "string", - "format": "uri" + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "mediaType": { "title": "Media Type", @@ -445,8 +548,12 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } - ] + ] }, "format": { "title": "Format", @@ -471,6 +578,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -510,6 +621,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -523,6 +638,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -536,6 +655,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] } @@ -562,7 +685,7 @@ "title": "Publisher Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "subOrganizationOf": { "title": "Parent Organization", diff --git 
a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json index 3495512b..b131a63b 100644 --- a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json @@ -361,7 +361,7 @@ "title": { "title": "Title", "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string", + "type": "string", "minLength": 1 } }, @@ -382,17 +382,17 @@ "enum": [ "vcard:Contact" ] - }, + }, "fn": { "title": "Contact Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "hasEmail": { "title": "Email", "description": "Email address for the contact", - "pattern": "^mailto:[\\w.-]+@[\\w.-]+\\.[\\w.-]+?$", + "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$", "type": "string" } } @@ -441,7 +441,7 @@ { "type": "null" } - ] + ] }, "format": { "title": "Format", @@ -557,7 +557,7 @@ "title": "Publisher Name", "description": "A full formatted name, eg Firstname Lastname", "type": "string", - "minLength": 1 + "minLength": 1 }, "subOrganizationOf": { "title": "Parent Organization", diff --git a/ckanext/datajson/pod_schema/non-federal/single_entry.json b/ckanext/datajson/pod_schema/non-federal/single_entry.json index 4ab4b311..ddc53fd4 100644 --- a/ckanext/datajson/pod_schema/non-federal/single_entry.json +++ b/ckanext/datajson/pod_schema/non-federal/single_entry.json @@ -1,415 +1,445 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", - "title": "Common Core Metadata Schema", - "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", - "type": "object", - "required": ["title", "description", "license", "publisher", "contactPoint", "identifier", "accessLevel"], - "properties": { - "accessLevel": { - "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", - "title": "Public Access Level", - "enum": ["public", "restricted public", "non-public"] - }, - "accessLevelComment": { - "title":"Access Level Comment", - "description":"An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. 
Text, 255 characters.", - "anyOf": [ - { - "type": "string", - "minLength": 1, - "maxLength":255 - }, - { - "type": "null" - } - ] - }, - "accrualPeriodicity": { - "title":"Frequency", - "description":"Frequency with which dataset is published.", - "anyOf": [ - { - "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", - "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", - "Three times a year", "Weekly", "Completely irregular"] - }, - { - "type": "null" - } - ] - }, - "bureauCode": { - "title":"Bureau Code", - "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{2}" - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "contactPoint": { - "title":"Contact Name", - "description":"Contact person’s name for the asset.", - "type": "string" - }, - "dataDictionary": { - "title":"Data Dictionary", - "description":"URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] - }, - "dataQuality": { - "title":"Data Quality", - "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ] - }, - "description": { - "title" : "Description", - "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", - "type": "string" - }, - "distribution": { - "title":"Distribution", - "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "required": ["accessURL", "format"], - "properties": { - "accessURL": { - "title":"Download URL", - "description":"URL providing direct access to the downloadable distribution of a dataset.", - "type": "string", - "format": "uri" - }, - "format": { - "title":"Format", - "description":"The file format or API type of the distribution.", - "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", - "type": "string" - } - } - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "identifier": { - "title":"Unique Identifier", - "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "title", + "description", + "license", + "publisher", + "contactPoint", + "identifier", + "accessLevel" + ], + "properties": { + "accessLevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. 
Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "accessLevelComment": { + "title": "Access Level Comment", + "description": "An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accrualPeriodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "Annual", + "Bimonthly", + "Semiweekly", + "Daily", + "Biweekly", + "Semiannual", + "Biennial", + "Triennial", + "Three times a week", + "Three times a month", + "Continuously updated", + "Monthly", + "Quarterly", + "Semimonthly", + "Three times a year", + "Weekly", + "Completely irregular" + ] + }, + { + "type": "null" + } + ] + }, + "bureauCode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "contactPoint": { + "title": "Contact Name", + "description": "Contact person’s name for the asset.", + "type": "string" + }, + "dataDictionary": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataQuality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title": "Distribution", + "description": "Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": [ + "accessURL", + "format" + ], + "properties": { + "accessURL": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": 
"^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + }, + "landingPage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title": "License", + "description": "The license dataset or API is published with. 
See Open Licenses for more information.", + "type": "string", + "minLength": 1 + }, + "mbox": { + "title": "Contact Email", + "description": "Contact person’s email address.", + "anyOf": [ + { + "type": "string", + "format": "email" + }, + { + "type": "null" + }, + { + "type": "string" + } + ] + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "PrimaryITInvestmentUII": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programCode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. 
Use the format of 015:001", + "anyOf": [ + { + "type": "array", + "items": { "type": "string", - "pattern": "[\\w]+" - }, - "issued": { - "title":"Release Date", - "description":"Date of formal issuance.", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "null" - } - ] - }, - "keyword": { - "title": "Tags", - "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 - }, - { - "type": "null" - } - ] - - }, - "landingPage": { - "title":"Homepage URL", - "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] - }, - "language": { - 
"title":"Language", - "description":"The language of the dataset.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" - } - }, - { - "type": "null" - } - ] - }, - "license": { - "title":"License", - "description":"The license dataset or API is published with. See Open Licenses for more information.", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "publisher": { + "title": "Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemOfRecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { "type": "string", "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webService": { + "title": "Endpoint", + "description": "Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" }, - "mbox": { - "title":"Contact Email", - "description":"Contact person’s email address.", - "anyOf": [ - { - "type": "string", - "format": "email" - }, - { - "type": "null" - }, - { - "type": "string" - } - ] - }, - "modified": { - "title": "Last Update", - "description": "Most recent date on which the dataset was changed, updated or modified.", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - } - ] - }, - "PrimaryITInvestmentUII": { - "title":"Primary IT Investment UII", - "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", - "anyOf": [ - { - "type": "string", - "pattern": "[0-9]{3}-[0-9]{9}" - }, - { - "type": "null" - } - ] - }, - "programCode": { - "title":"Program Code", - "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{3}" - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "publisher": { - "title":"Publisher", - "description": "The publishing entity.", - "type": "string" - }, - "references": { - "title":"Related Documents", - "description":"Related documents such as technical information about a dataset, developer documentation, etc.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "format": "uri" - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "spatial": { - "title":"Spatial", - "description":"The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "null" - } - ] - }, - "systemOfRecords": { - "title":"System of Records", - "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "null" - } - ] - }, - "temporal": { - "title":"Temporal", - "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "null" - } - ] - }, - "theme": { - "title":"Category", - "description":"Main thematic category of the dataset.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" 
- } - ] - }, - "title": { - "title": "Title", - "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string" - }, - "webService": { - "title":"Endpoint", - "description":"Endpoint of web service to access dataset.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] + { + "type": "null" } + ] } + } } diff --git a/ckanext/datajson/pod_schema/single_entry.json b/ckanext/datajson/pod_schema/single_entry.json index 52dcda77..825203ad 100644 --- a/ckanext/datajson/pod_schema/single_entry.json +++ b/ckanext/datajson/pod_schema/single_entry.json @@ -1,416 +1,451 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", - "title": "Common Core Metadata Schema", - "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", - "type": "object", - "required": ["bureaucode", "programcode", "title", "description", "keyword", "modified", "publisher", "contactpoint", "mbox", "identifier", "accesslevel"], - "properties": { - "accesslevel": { - "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", - "title": "Public Access Level", - "enum": ["public", "restricted public", "non-public"] - }, - "accesslevelcomment": { - "title":"Access Level Comment", - "description":"An explanation for the selected \"accesslevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", - "anyOf": [ - { - "type": "string", - "minLength": 1, - "maxLength":255 - }, - { - "type": "null" - } - ] - }, - "accessurl": { - "title":"Download URL", - "description":"URL providing direct access to the downloadable distribution of a dataset.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] - }, - "accrualperiodicity": { - "title":"Frequency", - "description":"Frequency with which dataset is published.", - "anyOf": [ - { - "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", - "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", - "Three times a year", "Weekly", "Completely irregular"] - }, - { - "type": "null" - } - ] - }, - "bureaucode": { - "title":"Bureau Code", - "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{2}" - }, - "minItems": 1, - "uniqueItems": true - }, - "contactpoint": { - "title":"Contact Name", - "description":"Contact person’s name for the asset.", - "type": "string" - }, - "datadictionary": { - "title":"Data Dictionary", - "description":"URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] - }, - "dataquality": { - "title":"Data Quality", - "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ] - }, - "description": { - "title" : "Description", - "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", - "type": "string" - }, - "distribution": { - "title":"Distribution", - "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "required": ["accessurl", "format"], - "properties": { - "accessurl": { - "title":"Download URL", - "description":"URL providing direct access to the downloadable distribution of a dataset.", - "type": "string", - "format": "uri" - }, - "format": { - "title":"Format", - "description":"The file format or API type of the distribution.", - "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", - "type": "string" - } - } - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "format": { - "title":"Format", - "description":"The file format or API type of the distribution.", - "anyOf": [ - { - "type": "string", - "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" - }, - { - "type": "null" - } - ] - }, - "identifier": { - "title":"Unique Identifier", - "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "bureaucode", + "programcode", + "title", + "description", + "keyword", + "modified", + "publisher", + "contactpoint", + "mbox", + "identifier", + "accesslevel" + ], + "properties": { + "accesslevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "accesslevelcomment": { + "title": "Access Level Comment", + "description": "An explanation for the selected \"accesslevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. 
Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accessurl": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "accrualperiodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "Annual", + "Bimonthly", + "Semiweekly", + "Daily", + "Biweekly", + "Semiannual", + "Biennial", + "Triennial", + "Three times a week", + "Three times a month", + "Continuously updated", + "Monthly", + "Quarterly", + "Semimonthly", + "Three times a year", + "Weekly", + "Completely irregular" + ] + }, + { + "type": "null" + } + ] + }, + "bureaucode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + "contactpoint": { + "title": "Contact Name", + "description": "Contact person’s name for the asset.", + "type": "string" + }, + "datadictionary": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataquality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title": "Distribution", + "description": "Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": [ + "accessurl", + "format" + ], + "properties": { + "accessurl": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "anyOf": [ + { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + "landingpage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": 
"^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title": "License", + "description": "The license dataset or API is published with. See Open Licenses for more information.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "mbox": { + "title": "Contact Email", + "description": "Contact person’s email address.", + "type": "string", + "format": "email" + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "primaryitinvestmentuii": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programcode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + "publisher": { + "title": "Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { "type": "string", - "pattern": "[\\w]+" - }, - "issued": { - "title":"Release Date", - "description":"Date of formal issuance.", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": 
"^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "null" - } - ] - }, - "keyword": { - "title": "Tags", - "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 - }, - "landingpage": { - "title":"Homepage URL", - "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] - }, - "language": { - "title":"Language", - "description":"The language of the dataset.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" - } - }, - { - "type": "null" - } - ] - }, - "license": { - "title":"License", - "description":"The license dataset or API is published with. See Open Licenses for more information.", - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "null" - } - ] - }, - "mbox": { - "title":"Contact Email", - "description":"Contact person’s email address.", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemofrecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { "type": "string", - "format": "email" - }, - "modified": { - "title": "Last Update", - "description": 
"Most recent date on which the dataset was changed, updated or modified.", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - } - ] - }, - "primaryitinvestmentuii": { - "title":"Primary IT Investment UII", - "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", - "anyOf": [ - { - "type": "string", - "pattern": "[0-9]{3}-[0-9]{9}" - }, - { - "type": "null" - } - ] - }, - "programcode": { - "title":"Program Code", - "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. 
Use the format of 015:001", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{3}" - }, - "minItems": 1, - "uniqueItems": true - }, - "publisher": { - "title":"Publisher", - "description": "The publishing entity.", - "type": "string" - }, - "references": { - "title":"Related Documents", - "description":"Related documents such as technical information about a dataset, developer documentation, etc.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "format": "uri" - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "spatial": { - "title":"Spatial", - "description":"The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "null" - } - ] - }, - "systemofrecords": { - "title":"System of Records", - "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "null" - } - ] - }, - "temporal": { - "title":"Temporal", - "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", - "anyOf": [ - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "string", - "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - { - "type": "string", - "pattern": 
"^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" - }, - { - "type": "null" - } - ] - }, - "theme": { - "title":"Category", - "description":"Main thematic category of the dataset.", - "anyOf": [ - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1, - "uniqueItems": true - }, - { - "type": "null" - } - ] - }, - "title": { - "title": "Title", - "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string" - }, - "webservice": { - "title":"Endpoint", - "description":"Endpoint of web service to access dataset.", - "anyOf": [ - { - "type": "string", - "format": "uri" - }, - { - "type": "null" - } - ] + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webservice": { + "title": "Endpoint", + "description": "Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" } + ] } + } } From 6d46c8ec856f64cd69c25a25466183c82f9dc875 Mon Sep 17 00:00:00 2001 From: ykhadilkar Date: Thu, 19 Mar 2015 17:04:31 -0400 Subject: [PATCH 09/22] Github # 152 - Enable flag to set a dataset as unpublished draft --- ckanext/datajson/templates/organization/read.html | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ckanext/datajson/templates/organization/read.html b/ckanext/datajson/templates/organization/read.html index fbcaca29..1c5d73ba 100644 --- a/ckanext/datajson/templates/organization/read.html +++ b/ckanext/datajson/templates/organization/read.html @@ -5,10 +5,12 @@ {% link_for _('Add Dataset'), controller='package', action='new', group=c.group_dict.id, class_='btn btn-primary', icon='plus-sign-alt' %} + {% endif %}
+
{% endblock %} From 882d1b86eb2c82cb244f5742298f8d39cd15c755 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Thu, 19 Mar 2015 17:16:04 -0400 Subject: [PATCH 10/22] Do not export Drafts on PDL & EDI --- ckanext/datajson/plugin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index b86a8ebf..7d60826e 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -372,7 +372,8 @@ def make_edi(self, owner_org): output = [] for pkg in packages: - # if pkg['owner_org'] == owner_org: + if pkg['publishing_status'] == 'Draft': + continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) @@ -405,6 +406,8 @@ def make_pdl(self, owner_org): output = [] # Create data.json only using public datasets, datasets marked non-public are not exposed for pkg in packages: + if pkg['publishing_status'] == 'Draft': + continue extras = dict([(x['key'], x['value']) for x in pkg['extras']]) try: if not (re.match(r'[Nn]on-public', extras['public_access_level'])): From 1f28edde3f9cc2fe5349ef1e462e951261f0afcf Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Thu, 19 Mar 2015 17:49:10 -0400 Subject: [PATCH 11/22] Export Draft datasets only button --- ckanext/datajson/plugin.py | 77 +++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 7d60826e..6ad09fc4 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -225,6 +225,11 @@ def after_map(self, m): m.connect('enterprise_data_inventory', '/organization/{org}/edi.json', controller='ckanext.datajson.plugin:JsonExportController', action='generate_edi') + # TODO DWC update action + # /data/{org}/edi.json + m.connect('enterprise_data_inventory', '/organization/{org}/draft.json', + controller='ckanext.datajson.plugin:JsonExportController', action='generate_draft') + # /pod/validate # m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:JsonExportController', action='validator') @@ -334,6 +339,22 @@ def generate_edi(self): return self.make_edi(match.group(1)) return "Invalid organization id" + def generate_draft(self): + # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + match = re.match(r"/organization/([-a-z0-9]+)/draft.json", request.path) + + # If user is not editor or admin of the organization then don't allow edi download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): + if match: + # set content type (charset required or pylons throws an error) + response.content_type = 'application/json; charset=UTF-8' + + # allow caching of response (e.g. by Apache) + del response.headers["Cache-Control"] + del response.headers["Pragma"] + return self.make_draft(match.group(1)) + return "Invalid organization id" + def make_json(self): # Build the data.json file. @@ -358,6 +379,40 @@ def make_json(self): return output + def make_draft(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. 
+ packages = self.get_packages(owner_org) + + output = [] + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' in extras.keys() and extras['publishing_status'] != 'Draft': + continue + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, zip_name='edi') + + def make_edi(self, owner_org): # Error handler for creating error log stream = StringIO.StringIO() @@ -372,7 +427,8 @@ def make_edi(self, owner_org): output = [] for pkg in packages: - if pkg['publishing_status'] == 'Draft': + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' in extras.keys() and extras['publishing_status'] == 'Draft': continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if datajson_entry and self.is_valid(datajson_entry): @@ -406,17 +462,18 @@ def make_pdl(self, owner_org): output = [] # Create data.json only using public datasets, datasets marked non-public are not exposed for pkg in packages: - if pkg['publishing_status'] == 'Draft': - continue extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' in extras.keys() and extras['publishing_status'] == 'Draft': + continue try: - if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) - if datajson_entry and self.is_valid(datajson_entry): - output.append(datajson_entry) - else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), - pkg.get('title', None)) + if re.match(r'[Nn]on-public', extras['public_access_level']): + continue + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None)) except KeyError: logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", From 37c0fffd3d39a8ceb283ce93c6b2b7af0c24e87a Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Thu, 19 Mar 2015 23:32:58 -0400 Subject: [PATCH 12/22] Errors.json --- ckanext/datajson/build_datajson.py | 28 +++++++++++++++++---- ckanext/datajson/datajsonvalidator.py | 2 +- ckanext/datajson/plugin.py | 36 +++++++++++++++++++++------ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index 27c93d8b..8c765229 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -257,9 +257,18 @@ def make_datajson_export_entry(package): JsonExportBuilder.split_multiple_entries(retlist, extras, pair) except KeyError as e: - log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'", package.get('id'), - package.get('title'), e) - return + log.warn("Missing Required Field for package with id=[%s], title=['%s']: '%s'" % ( + package.get('id'), package.get('title'), e)) + + errors = ['Missing Required Field', ["%s" % e]] + errors_dict = OrderedDict([ + ('id', package.get('id')), + ('name', package.get('name')), + 
('title', package.get('title')), + ('errors', errors), + ]) + + return errors_dict # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level', @@ -315,7 +324,15 @@ def make_datajson_export_entry(package): if len(errors) > 0: for error in errors: log.warn(error) - return + + errors_dict = OrderedDict([ + ('id', package.get('id')), + ('name', package.get('name')), + ('title', package.get('title')), + ('errors', errors), + ]) + + return errors_dict return striped_retlist_dict @@ -443,7 +460,8 @@ def extra(package, key, default=None): def get_publisher_tree_wrong_order(extras): publisher = JsonExportBuilder.strip_if_string(extras.get('publisher')) if publisher is None: - raise KeyError('publisher') + return None + # raise KeyError('publisher') organization_list = list() organization_list.append([ diff --git a/ckanext/datajson/datajsonvalidator.py b/ckanext/datajson/datajsonvalidator.py index b1102a0b..cd61d6a1 100644 --- a/ckanext/datajson/datajsonvalidator.py +++ b/ckanext/datajson/datajsonvalidator.py @@ -397,7 +397,7 @@ def check_required_field(obj, field_name, data_type, dataset_name, errs): add_error(errs, 10, "Missing Required Fields", "The '%s' field is missing." % field_name, dataset_name) return False elif obj[field_name] is None: - add_error(errs, 10, "Missing Required Fields", "The '%s' field is set to null." % field_name, dataset_name) + add_error(errs, 10, "Missing Required Fields", "The '%s' field is empty." % field_name, dataset_name) return False elif not isinstance(obj[field_name], data_type): add_error(errs, 5, "Invalid Required Field Value", diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 6ad09fc4..d04cfe5d 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -109,7 +109,7 @@ def generate_output(self, format): ("foaf", "http://xmlns.com/foaf/0.1/"), ("pod", "http://project-open-data.github.io/schema/2013-09-20_1.0#"), ]) - ), + ), ("@id", DataJsonPlugin.ld_id), ("@type", "dcat:Catalog"), ("dcterms:title", DataJsonPlugin.ld_title), @@ -150,7 +150,8 @@ def validator(self): e) + ". Try using JSONLint.com."])) except Exception as e: c.errors.append(( - "Internal Error", ["Something bad happened while trying to load and parse the file: " + unicode(e)])) + "Internal Error", + ["Something bad happened while trying to load and parse the file: " + unicode(e)])) if body: try: @@ -178,7 +179,6 @@ def show_html_rendition(self): return render('html_rendition.html') - class JsonExportPlugin(p.SingletonPlugin): p.implements(p.interfaces.IConfigurer) p.implements(p.interfaces.IRoutes, inherit=True) @@ -258,7 +258,7 @@ def generate_output(self, format): ("dcat", "http://www.w3.org/ns/dcat#"), ("foaf", "http://xmlns.com/foaf/0.1/"), ]) - ), + ), ("@id", JsonExportPlugin.ld_id), ("@type", "dcat:Catalog"), ("dcterms:title", JsonExportPlugin.ld_title), @@ -391,12 +391,17 @@ def make_draft(self, owner_org): # Build the data.json file. 
packages = self.get_packages(owner_org) + errors_json = [] + output = [] for pkg in packages: extras = dict([(x['key'], x['value']) for x in pkg['extras']]) if 'publishing_status' in extras.keys() and extras['publishing_status'] != 'Draft': continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: @@ -410,7 +415,7 @@ def make_draft(self, owner_org): stream.close() # return json.dumps(output) - return self.write_zip(output, error, zip_name='edi') + return self.write_zip(output, error, errors_json, zip_name='edi') def make_edi(self, owner_org): @@ -426,11 +431,15 @@ def make_edi(self, owner_org): packages = self.get_packages(owner_org) output = [] + errors_json = [] for pkg in packages: extras = dict([(x['key'], x['value']) for x in pkg['extras']]) if 'publishing_status' in extras.keys() and extras['publishing_status'] == 'Draft': continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: @@ -444,7 +453,7 @@ def make_edi(self, owner_org): stream.close() # return json.dumps(output) - return self.write_zip(output, error, zip_name='edi') + return self.write_zip(output, error, errors_json, zip_name='edi') def make_pdl(self, owner_org): @@ -460,6 +469,7 @@ def make_pdl(self, owner_org): packages = self.get_packages(owner_org) output = [] + errors_json = [] # Create data.json only using public datasets, datasets marked non-public are not exposed for pkg in packages: extras = dict([(x['key'], x['value']) for x in pkg['extras']]) @@ -469,6 +479,9 @@ def make_pdl(self, owner_org): if re.match(r'[Nn]on-public', extras['public_access_level']): continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: @@ -488,7 +501,7 @@ def make_pdl(self, owner_org): stream.close() # return json.dumps(output) - return self.write_zip(output, error, zip_name='pdl') + return self.write_zip(output, error, errors_json, zip_name='pdl') def get_packages(self, owner_org): @@ -530,7 +543,7 @@ def is_valid(self, instance): return True - def write_zip(self, data, error=None, zip_name='data'): + def write_zip(self, data, error=None, errors_json=None, zip_name='data'): """ Data: a python object to write to the data.json Error: unicode string representing the content of the error log. 
@@ -546,6 +559,13 @@ def write_zip(self, data, error=None, zip_name='data'): zf.writestr('data.json', json.dumps(JsonExportBuilder.make_datajson_export_catalog(data), ensure_ascii=False).encode( 'utf8')) + # Write empty.json if nothing to return + else: + zf.writestr('empty.json', '') + + # Errors in json format + if errors_json: + zf.writestr('errors.json', json.dumps(errors_json).encode('utf8')) # Write the error log if error: From 7851f56a713fde309c8fc24e89e011727735aeca Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 10:21:56 -0400 Subject: [PATCH 13/22] Draft export fix --- ckanext/datajson/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index d04cfe5d..2fa5ae49 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -396,7 +396,7 @@ def make_draft(self, owner_org): output = [] for pkg in packages: extras = dict([(x['key'], x['value']) for x in pkg['extras']]) - if 'publishing_status' in extras.keys() and extras['publishing_status'] != 'Draft': + if 'publishing_status' not in extras.keys() or extras['publishing_status'] != 'Draft': continue datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) if 'errors' in datajson_entry.keys(): From d7a8627ffecf1468259facb6cb139a843b1b5d98 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 10:48:04 -0400 Subject: [PATCH 14/22] Draft filename changed --- ckanext/datajson/plugin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 2fa5ae49..4acdac63 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -415,7 +415,7 @@ def make_draft(self, owner_org): stream.close() # return json.dumps(output) - return self.write_zip(output, error, errors_json, zip_name='edi') + return self.write_zip(output, error, errors_json, zip_name='draft') def make_edi(self, owner_org): @@ -554,9 +554,13 @@ def write_zip(self, data, error=None, errors_json=None, zip_name='data'): o = StringIO.StringIO() zf = zipfile.ZipFile(o, mode='w') + data_file_name = 'data.json' + if 'draft' == zip_name: + data_file_name = 'draft_data.json' + # Write the data file if data: - zf.writestr('data.json', + zf.writestr(data_file_name, json.dumps(JsonExportBuilder.make_datajson_export_catalog(data), ensure_ascii=False).encode( 'utf8')) # Write empty.json if nothing to return From d9093cd0c45380834f6e50f070ff2716ccb16b62 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 10:58:47 -0400 Subject: [PATCH 15/22] Export metadata_modified if extras[modified] not available --- ckanext/datajson/build_datajson.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index 8c765229..705a7e15 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -214,7 +214,9 @@ def make_datajson_export_entry(package): ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))), # required-if-applicable - ("modified", JsonExportBuilder.strip_if_string(extras.get("modified"))), # required + ("modified", + JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))), + # required ("primaryITInvestmentUII", JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))), # optional From d6519e81424f4023d467df3930eeae16a3e9f93d Mon Sep 17 00:00:00 2001 From: Alex Perfilov 
Date: Fri, 20 Mar 2015 12:56:49 -0400 Subject: [PATCH 16/22] Json error log update --- ckanext/datajson/plugin.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 4acdac63..112dabf7 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -237,6 +237,8 @@ def after_map(self, m): class JsonExportController(BaseController): + _errors_json = [] + def generate_output(self, format): # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -355,7 +357,6 @@ def generate_draft(self): return self.make_draft(match.group(1)) return "Invalid organization id" - def make_json(self): # Build the data.json file. packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) @@ -375,10 +376,17 @@ def make_json(self): logger.warn("Dataset id=[%s], title=[%s] missing required 'public_access_level' field", pkg.get('id', None), pkg.get('title', None)) + + errors = ['Missing Required Field', ['public_access_level']] + self._errors_json.append(OrderedDict([ + ('id', pkg.get('id')), + ('name', pkg.get('name')), + ('title', pkg.get('title')), + ('errors', errors), + ])) pass return output - def make_draft(self, owner_org): # Error handler for creating error log stream = StringIO.StringIO() @@ -417,7 +425,6 @@ def make_draft(self, owner_org): # return json.dumps(output) return self.write_zip(output, error, errors_json, zip_name='draft') - def make_edi(self, owner_org): # Error handler for creating error log stream = StringIO.StringIO() @@ -455,7 +462,6 @@ def make_edi(self, owner_org): # return json.dumps(output) return self.write_zip(output, error, errors_json, zip_name='edi') - def make_pdl(self, owner_org): # Error handler for creating error log stream = StringIO.StringIO() @@ -491,6 +497,13 @@ def make_pdl(self, owner_org): except KeyError: logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", pkg.get('id', None), pkg.get('title', None)) + errors = ['Missing Required Field', ['public_access_level']] + self._errors_json.append(OrderedDict([ + ('id', pkg.get('id')), + ('name', pkg.get('name')), + ('title', pkg.get('title')), + ('errors', errors), + ])) pass # Get the error log @@ -503,7 +516,6 @@ def make_pdl(self, owner_org): # return json.dumps(output) return self.write_zip(output, error, errors_json, zip_name='pdl') - def get_packages(self, owner_org): # Build the data.json file. packages = self.get_all_group_packages(group_id=owner_org) @@ -520,7 +532,6 @@ def get_packages(self, owner_org): return packages - def get_all_group_packages(self, group_id): """ Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages. @@ -531,7 +542,6 @@ def get_all_group_packages(self, group_id): return result - def is_valid(self, instance): """ Validates a data.json entry against the project open data's JSON schema. 
Log a warning message on validation error @@ -542,7 +552,6 @@ def is_valid(self, instance): return False return True - def write_zip(self, data, error=None, errors_json=None, zip_name='data'): """ Data: a python object to write to the data.json @@ -567,6 +576,12 @@ def write_zip(self, data, error=None, errors_json=None, zip_name='data'): else: zf.writestr('empty.json', '') + if self._errors_json: + if errors_json: + errors_json += self._errors_json + else: + errors_json = self._errors_json + # Errors in json format if errors_json: zf.writestr('errors.json', json.dumps(errors_json).encode('utf8')) From 37537e0b15b9c9e87526a3035ab499294e8fded8 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 13:30:56 -0400 Subject: [PATCH 17/22] Organization name in errors.json --- ckanext/datajson/build_datajson.py | 10 ++++++++++ ckanext/datajson/plugin.py | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index 705a7e15..0c4acca3 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -133,8 +133,10 @@ def extension_to_mime_type(file_ext): } return ext.get(file_ext.lower(), "application/unknown") +currentPackageOrg = None class JsonExportBuilder: + @staticmethod def make_datajson_export_catalog(datasets): catalog = OrderedDict([ @@ -148,6 +150,8 @@ def make_datajson_export_entry(package): @staticmethod def make_datajson_export_entry(package): + global currentPackageOrg + currentPackageOrg = None # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict extras = dict([(x['key'], x['value']) for x in package['extras']]) @@ -267,6 +271,7 @@ def make_datajson_export_entry(package): ('id', package.get('id')), ('name', package.get('name')), ('title', package.get('title')), + ('organization', currentPackageOrg), ('errors', errors), ]) @@ -331,6 +336,7 @@ def make_datajson_export_entry(package): ('id', package.get('id')), ('name', package.get('name')), ('title', package.get('title')), + ('organization', currentPackageOrg), ('errors', errors), ]) @@ -460,11 +466,14 @@ def extra(package, key, default=None): @staticmethod def get_publisher_tree_wrong_order(extras): + global currentPackageOrg publisher = JsonExportBuilder.strip_if_string(extras.get('publisher')) if publisher is None: return None # raise KeyError('publisher') + currentPackageOrg = publisher + organization_list = list() organization_list.append([ ('@type', 'org:Organization'), # optional @@ -478,6 +487,7 @@ def get_publisher_tree_wrong_order(extras): ('@type', 'org:Organization'), # optional ('name', JsonExportBuilder.strip_if_string(extras[key])), # required ]) + currentPackageOrg = extras[key] size = len(organization_list) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 112dabf7..f6641677 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -378,10 +378,22 @@ def make_json(self): pkg.get('title', None)) errors = ['Missing Required Field', ['public_access_level']] + + currentPackageOrg = None + + if 'publisher' in extras and extras['publisher']: + currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) + + for i in range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + self._errors_json.append(OrderedDict([ ('id', pkg.get('id')), ('name', 
pkg.get('name')), ('title', pkg.get('title')), + ('organization', currentPackageOrg), ('errors', errors), ])) pass @@ -498,10 +510,20 @@ def make_pdl(self, owner_org): logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", pkg.get('id', None), pkg.get('title', None)) errors = ['Missing Required Field', ['public_access_level']] + + currentPackageOrg = None + if 'publisher' in extras and extras['publisher']: + currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) + for i in range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + self._errors_json.append(OrderedDict([ ('id', pkg.get('id')), ('name', pkg.get('name')), ('title', pkg.get('title')), + ('organization', currentPackageOrg), ('errors', errors), ])) pass From 252ff335976dcda3d004cfdb2f99d0d5adbc465e Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 13:47:20 -0400 Subject: [PATCH 18/22] Add organization title to errorlog.txt too --- ckanext/datajson/build_datajson.py | 4 ++-- ckanext/datajson/plugin.py | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index 0c4acca3..8a78ada0 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -263,8 +263,8 @@ def make_datajson_export_entry(package): JsonExportBuilder.split_multiple_entries(retlist, extras, pair) except KeyError as e: - log.warn("Missing Required Field for package with id=[%s], title=['%s']: '%s'" % ( - package.get('id'), package.get('title'), e)) + log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % ( + package.get('id'), package.get('title'), currentPackageOrg, e)) errors = ['Missing Required Field', ["%s" % e]] errors_dict = OrderedDict([ diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index f6641677..00fa4f2a 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -373,12 +373,6 @@ def make_json(self): logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) except KeyError: - logger.warn("Dataset id=[%s], title=[%s] missing required 'public_access_level' field", - pkg.get('id', None), - pkg.get('title', None)) - - errors = ['Missing Required Field', ['public_access_level']] - currentPackageOrg = None if 'publisher' in extras and extras['publisher']: @@ -389,6 +383,14 @@ def make_json(self): if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + logger.warn( + "Dataset id=[%s], title=[%s], organization=[%s] missing required 'public_access_level' field", + pkg.get('id', None), + pkg.get('title', None), + currentPackageOrg) + + errors = ['Missing Required Field', ['public_access_level']] + self._errors_json.append(OrderedDict([ ('id', pkg.get('id')), ('name', pkg.get('name')), @@ -425,7 +427,17 @@ def make_draft(self, owner_org): if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) + currentPackageOrg = None + + if 'publisher' in extras and extras['publisher']: + currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) + + for i in 
range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), currentPackageOrg) # Get the error log eh.flush() From 5b871326829f9fb8b70a26ad6222aabdf2eb65b9 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 13:55:23 -0400 Subject: [PATCH 19/22] Adding Organization to errorlog.txt, Step 2 --- ckanext/datajson/plugin.py | 71 ++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 00fa4f2a..75d3ad4c 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -370,24 +370,17 @@ def make_json(self): if datajson_entry: output.append(datajson_entry) else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), - pkg.get('title', None)) + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) except KeyError: - currentPackageOrg = None - - if 'publisher' in extras and extras['publisher']: - currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) - - for i in range(1, 6): - key = 'publisher_' + str(i) - if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): - currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + publisher = self.detect_publisher(extras) logger.warn( "Dataset id=[%s], title=[%s], organization=[%s] missing required 'public_access_level' field", pkg.get('id', None), pkg.get('title', None), - currentPackageOrg) + publisher) errors = ['Missing Required Field', ['public_access_level']] @@ -395,7 +388,7 @@ def make_json(self): ('id', pkg.get('id')), ('name', pkg.get('name')), ('title', pkg.get('title')), - ('organization', currentPackageOrg), + ('organization', publisher), ('errors', errors), ])) pass @@ -427,17 +420,9 @@ def make_draft(self, owner_org): if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: - currentPackageOrg = None - - if 'publisher' in extras and extras['publisher']: - currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) - - for i in range(1, 6): - key = 'publisher_' + str(i) - if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): - currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + publisher = self.detect_publisher(extras) logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), - pkg.get('title', None), currentPackageOrg) + pkg.get('title', None), publisher) # Get the error log eh.flush() @@ -449,6 +434,19 @@ def make_draft(self, owner_org): # return json.dumps(output) return self.write_zip(output, error, errors_json, zip_name='draft') + @staticmethod + def detect_publisher(extras): + publisher = None + + if 'publisher' in extras and extras['publisher']: + publisher = JsonExportBuilder.strip_if_string(extras['publisher']) + + for i in range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + publisher = JsonExportBuilder.strip_if_string(extras[key]) + return publisher + def make_edi(self, owner_org): # Error handler for creating error log stream = 
StringIO.StringIO() @@ -474,7 +472,9 @@ def make_edi(self, owner_org): if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) # Get the error log eh.flush() @@ -515,27 +515,24 @@ def make_pdl(self, owner_org): if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), - pkg.get('title', None)) + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) except KeyError: - logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", - pkg.get('id', None), pkg.get('title', None)) - errors = ['Missing Required Field', ['public_access_level']] + publisher = self.detect_publisher(extras) - currentPackageOrg = None - if 'publisher' in extras and extras['publisher']: - currentPackageOrg = JsonExportBuilder.strip_if_string(extras['publisher']) - for i in range(1, 6): - key = 'publisher_' + str(i) - if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): - currentPackageOrg = JsonExportBuilder.strip_if_string(extras[key]) + logger.warn( + "Dataset id=[%s], title=['%s'], organization=['%s'] missing required 'public_access_level' field", + pkg.get('id', None), pkg.get('title', None), publisher) + + errors = ['Missing Required Field', ['public_access_level']] self._errors_json.append(OrderedDict([ ('id', pkg.get('id')), ('name', pkg.get('name')), ('title', pkg.get('title')), - ('organization', currentPackageOrg), + ('organization', publisher), ('errors', errors), ])) pass From a03925cdca2ad045b96f581a3bf47c7c2ce15ac3 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 17:09:05 -0400 Subject: [PATCH 20/22] Sub-orgs export fix --- ckanext/datajson/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 75d3ad4c..871b615a 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -557,7 +557,7 @@ def get_packages(self, owner_org): sub_agencies = sub_agency.extras.col.target['sub-agencies'].value sub_agencies_list = sub_agencies.split(",") for sub in sub_agencies_list: - sub_packages = self, self.get_all_group_packages(group_id=sub) + sub_packages = self.get_all_group_packages(group_id=sub) for sub_package in sub_packages: packages.append(sub_package) From 1ce18139056b1570d675ae83ddc709a0b437313e Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Fri, 20 Mar 2015 17:51:33 -0400 Subject: [PATCH 21/22] Refactored for better format --- ckanext/datajson/build_datajson.py | 67 +++++++++--------------------- ckanext/datajson/plugin.py | 59 ++++++++++++++------------ 2 files changed, 52 insertions(+), 74 deletions(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index 8a78ada0..c65aa6ee 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -41,7 +41,7 @@ def make_datajson_entry(package): ("dataDictionary", extra(package, "Data Dictionary")), ("accessURL", get_primary_resource(package).get("url", None)), ("webService", 
get_api_resource(package).get("url", None)), - ("format", extension_to_mime_type(get_primary_resource(package).get("format", None)) ), + ("format", extension_to_mime_type(get_primary_resource(package).get("format", None))), ("license", extra(package, "License Agreement")), ("spatial", extra(package, "Geographic Scope")), ("temporal", build_temporal(package)), @@ -54,8 +54,9 @@ def make_datajson_entry(package): x is not None)), ("dataQuality", extra(package, "Data Quality Met", default="true") == "true"), ("theme", [s for s in ( - extra(package, "Subject Area 1"), extra(package, "Subject Area 2"), extra(package, "Subject Area 3")) if - s is not None]), + extra(package, "Subject Area 1"), extra(package, "Subject Area 2"), extra(package, "Subject Area 3") + ) if s is not None]), + ("references", [s for s in [extra(package, "Technical Documentation")] if s is not None]), ("landingPage", package["url"]), ("systemOfRecords", extra(package, "System Of Records")), @@ -74,9 +75,9 @@ def make_datajson_entry(package): def extra(package, key, default=None): # Retrieves the value of an extras field. - for extra in package["extras"]: - if extra["key"] == key: - return extra["value"] + for xtra in package["extras"]: + if xtra["key"] == key: + return xtra["value"] return default @@ -133,10 +134,16 @@ def extension_to_mime_type(file_ext): } return ext.get(file_ext.lower(), "application/unknown") + currentPackageOrg = None + class JsonExportBuilder: + def __init__(self): + global currentPackageOrg + currentPackageOrg = None + @staticmethod def make_datajson_export_catalog(datasets): catalog = OrderedDict([ @@ -193,7 +200,7 @@ def make_datajson_export_entry(package): # ("fn", "Jane Doe"), # ("hasEmail", "mailto:jane.doe@agency.gov") # ])), # required - ('contactPoint', JsonExportBuilder.get_contact_point(extras, package)), # required + ('contactPoint', JsonExportBuilder.get_contact_point(extras)), # required ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))), # required-if-applicable @@ -277,38 +284,8 @@ def make_datajson_export_entry(package): return errors_dict - # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added - # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level', - # 'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title', - # 'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity', - # 'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents', - # 'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags', - # 'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3', - # 'publisher_4', 'publisher_5'] - # - # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset - # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. 
there's much better ways, maybe - # # generate a list of keys to ignore by calling a specific function to get the extras - # retlist_keys = [x for x, y in retlist] - # extras_keys = set(extras.keys()) - set(extras_to_filter_out) - # - # for key in extras_keys: - # convertedKey = underscore_to_camelcase(key) - # if convertedKey not in retlist_keys: - # retlist.append((convertedKey, extras[key])) - # Remove entries where value is None, "", or empty list [] striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []] - striped_retlist_keys = [x for x, y in striped_retlist] - - - # If a required metadata field was removed, return empty string - # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword", - # "modified", "programCode", "publisher", "title"]: - # if required_field not in striped_retlist_keys: - # log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'", - # package.get('id'), package.get('title'), required_field) - # return # When saved from UI DataQuality value is stored as "on" instead of True. # Check if value is "on" and replace it with True. @@ -344,7 +321,6 @@ def make_datajson_export_entry(package): return striped_retlist_dict - # used by get_accrual_periodicity accrual_periodicity_dict = { 'completely irregular': 'irregular', @@ -436,18 +412,18 @@ def generate_distribution(package): return arr @staticmethod - def get_contact_point(extras, package): + def get_contact_point(extras): for required_field in ["contact_name", "contact_email"]: if required_field not in extras.keys(): raise KeyError(required_field) email = JsonExportBuilder.strip_if_string(extras['contact_email']) if email is None or '@' not in email: - raise KeyError(required_field) + raise KeyError('contact_email') fn = JsonExportBuilder.strip_if_string(extras['contact_name']) if fn is None: - raise KeyError(required_field) + raise KeyError('contact_name') contact_point = OrderedDict([ ('@type', 'vcard:Contact'), # optional @@ -459,9 +435,9 @@ def get_contact_point(extras, package): @staticmethod def extra(package, key, default=None): # Retrieves the value of an extras field. - for extra in package["extras"]: - if extra["key"] == key: - return extra["value"] + for xtra in package["extras"]: + if xtra["key"] == key: + return xtra["value"] return default @staticmethod @@ -532,19 +508,16 @@ def strip_if_string(val): val = None return val - @staticmethod def get_primary_resource(package): # Return info about a "primary" resource. Select a good one. return JsonExportBuilder.get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf")) - @staticmethod def get_api_resource(package): # Return info about an API resource. 
return JsonExportBuilder.get_best_resource(package, ("api", "query tool")) - @staticmethod def split_multiple_entries(retlist, extras, names): found_element = string.strip(extras.get(names[1], "")) diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 871b615a..7320d3d4 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -88,7 +88,7 @@ def organization_facets(self, facets_dict, organization_type, package_type): class DataJsonController(BaseController): - def generate_output(self, format): + def generate_output(self, fmt): # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -99,7 +99,7 @@ def generate_output(self, format): # output data = self.make_json() - if format == 'json-ld': + if fmt == 'json-ld': # Convert this to JSON-LD. data = OrderedDict([ ("@context", OrderedDict([ @@ -108,8 +108,7 @@ def generate_output(self, format): ("dcat", "http://www.w3.org/ns/dcat#"), ("foaf", "http://xmlns.com/foaf/0.1/"), ("pod", "http://project-open-data.github.io/schema/2013-09-20_1.0#"), - ]) - ), + ])), ("@id", DataJsonPlugin.ld_id), ("@type", "dcat:Catalog"), ("dcterms:title", DataJsonPlugin.ld_title), @@ -137,7 +136,8 @@ def validator(self): c.source_url = request.POST["url"] c.errors = [] - import urllib, json + import urllib + import json from datajsonvalidator import do_validation body = None @@ -167,11 +167,12 @@ def show_html_rendition(self): # Shows an HTML rendition of the data.json file. Requests the file live # from http://localhost/data.json. - import urllib, json + import urllib + import json try: c.catalog_data = json.load(urllib.urlopen("http://localhost/data.json")) - except: + except Exception as e: c.catalog_data = [] c.catalog_data.sort(key=lambda x: x.get("modified"), reverse=True) @@ -212,8 +213,11 @@ def after_map(self, m): controller='ckanext.datajson.plugin:JsonExportController', action='generate_json') # TODO commenting out enterprise data inventory for right now - # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, controller='ckanext.datajson.plugin:JsonExportController', action='generate_enterprise') - # m.connect('datajsonld', JsonExportPlugin.route_ld_path, controller='ckanext.datajson.plugin:JsonExportController', action='generate_jsonld') + # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, + # controller='ckanext.datajson.plugin:JsonExportController', action='generate_enterprise') + + # m.connect('datajsonld', JsonExportPlugin.route_ld_path, + # controller='ckanext.datajson.plugin:JsonExportController', action='generate_jsonld') # TODO DWC update action # /data/{org}/data.json @@ -231,7 +235,8 @@ def after_map(self, m): controller='ckanext.datajson.plugin:JsonExportController', action='generate_draft') # /pod/validate - # m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:JsonExportController', action='validator') + # m.connect('datajsonvalidator', "/pod/validate", + # controller='ckanext.datajson.plugin:JsonExportController', action='validator') return m @@ -239,7 +244,7 @@ def after_map(self, m): class JsonExportController(BaseController): _errors_json = [] - def generate_output(self, format): + def generate_output(self, fmt): # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -251,7 +256,7 @@ def generate_output(self, format): # output data = self.make_json() - if format == 'json-ld': + if fmt == 'json-ld': # 
Convert this to JSON-LD. data = OrderedDict([ ("@context", OrderedDict([ @@ -259,8 +264,7 @@ def generate_output(self, format): ("dcterms", "http://purl.org/dc/terms/"), ("dcat", "http://www.w3.org/ns/dcat#"), ("foaf", "http://xmlns.com/foaf/0.1/"), - ]) - ), + ])), ("@id", JsonExportPlugin.ld_id), ("@type", "dcat:Catalog"), ("dcterms:title", JsonExportPlugin.ld_title), @@ -283,7 +287,8 @@ def validator(self): c.source_url = request.POST["url"] c.errors = [] - import urllib, json + import urllib + import json from datajsonvalidator import do_validation body = None @@ -310,7 +315,8 @@ def validator(self): return render('datajsonvalidator.html') def generate_pdl(self): - # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/data.json", request.path) # If user is not editor or admin of the organization then don't allow pdl download @@ -326,7 +332,8 @@ def generate_pdl(self): return "Invalid organization id" def generate_edi(self): - # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/edi.json", request.path) # If user is not editor or admin of the organization then don't allow edi download @@ -342,7 +349,8 @@ def generate_edi(self): return "Invalid organization id" def generate_draft(self): - # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/draft.json", request.path) # If user is not editor or admin of the organization then don't allow edi download @@ -552,8 +560,8 @@ def get_packages(self, owner_org): packages = self.get_all_group_packages(group_id=owner_org) # get packages for sub-agencies. sub_agency = model.Group.get(owner_org) - if 'sub-agencies' in sub_agency.extras.col.target and \ - sub_agency.extras.col.target['sub-agencies'].state == 'active': + if 'sub-agencies' in sub_agency.extras.col.target \ + and sub_agency.extras.col.target['sub-agencies'].state == 'active': sub_agencies = sub_agency.extras.col.target['sub-agencies'].value sub_agencies_list = sub_agencies.split(",") for sub in sub_agencies_list: @@ -575,7 +583,8 @@ def get_all_group_packages(self, group_id): def is_valid(self, instance): """ - Validates a data.json entry against the project open data's JSON schema. Log a warning message on validation error + Validates a data.json entry against the project open data's JSON schema. 
+ Log a warning message on validation error """ error = best_match(validator.iter_errors(instance)) if error: @@ -638,12 +647,8 @@ def get_validator(): from jsonschema import Draft4Validator, FormatChecker schema_path = os.path.join(os.path.dirname(__file__), 'pod_schema', 'federal-v1.1', 'dataset.json') - with open(schema_path, 'r') as file: - schema = json.loads(file.read()) + with open(schema_path, 'r') as schema: + schema = json.loads(schema.read()) return Draft4Validator(schema, format_checker=FormatChecker()) - logger.warn('Unable to create validator') - return None - - validator = get_validator() \ No newline at end of file From 3658c5dc205c626db3cf5e85bda27148578ad1a8 Mon Sep 17 00:00:00 2001 From: Alex Perfilov Date: Tue, 24 Mar 2015 10:08:34 -0400 Subject: [PATCH 22/22] allow [[REDACTED*] values to export --- ckanext/datajson/build_datajson.py | 19 +++-- ckanext/datajson/datajsonvalidator.py | 117 ++++++++++++++++---------- 2 files changed, 85 insertions(+), 51 deletions(-) diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py index c65aa6ee..f8a704cf 100644 --- a/ckanext/datajson/build_datajson.py +++ b/ckanext/datajson/build_datajson.py @@ -139,7 +139,6 @@ def extension_to_mime_type(file_ext): class JsonExportBuilder: - def __init__(self): global currentPackageOrg currentPackageOrg = None @@ -357,6 +356,8 @@ def generate_distribution(package): if 'url' in rkeys: res_url = JsonExportBuilder.strip_if_string(r.get('url')) if res_url: + res_url = res_url.replace('http://[[REDACTED', '[[REDACTED') + res_url = res_url.replace('http://http', 'http') if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'): resource += [("accessURL", res_url)] else: @@ -417,18 +418,24 @@ def get_contact_point(extras): if required_field not in extras.keys(): raise KeyError(required_field) - email = JsonExportBuilder.strip_if_string(extras['contact_email']) - if email is None or '@' not in email: - raise KeyError('contact_email') - fn = JsonExportBuilder.strip_if_string(extras['contact_name']) if fn is None: raise KeyError('contact_name') + email = JsonExportBuilder.strip_if_string(extras['contact_email']) + if email is None: + raise KeyError('contact_email') + + if '[[REDACTED' not in email: + if '@' not in email: + raise KeyError('contact_email') + else: + email = 'mailto:' + email + contact_point = OrderedDict([ ('@type', 'vcard:Contact'), # optional ('fn', fn), # required - ('hasEmail', 'mailto:' + email), # required + ('hasEmail', email), # required ]) return contact_point diff --git a/ckanext/datajson/datajsonvalidator.py b/ckanext/datajson/datajsonvalidator.py index cd61d6a1..28739f9e 100644 --- a/ckanext/datajson/datajsonvalidator.py +++ b/ckanext/datajson/datajsonvalidator.py @@ -76,13 +76,19 @@ r'(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$' ) +REDACTED_REGEX = re.compile( + r'^(\[\[REDACTED).*?(\]\])$' +) + # load the OMB bureau codes on first load of this module -import urllib, csv +import urllib +import csv omb_burueau_codes = set() for row in csv.DictReader(urllib.urlopen("https://project-open-data.cio.gov/data/omb_bureau_codes.csv")): omb_burueau_codes.add(row["Agency Code"] + ":" + row["Bureau Code"]) + # main function for validation def do_validation(doc, errors_array): errs = {} @@ -136,14 +142,15 @@ def do_validation(doc, errors_array): # contactPoint - hasEmail # required if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs): - import lepl.apps.rfc3696 + if not 
is_redacted(cp.get('hasEmail')): + import lepl.apps.rfc3696 - email_validator = lepl.apps.rfc3696.Email() - email = cp["hasEmail"].replace('mailto:', '') - if not email_validator(email): - add_error(errs, 5, "Invalid Required Field Value", - "The email address \"%s\" is not a valid email address." % email, - dataset_name) + email_validator = lepl.apps.rfc3696.Email() + email = cp["hasEmail"].replace('mailto:', '') + if not email_validator(email): + add_error(errs, 5, "Invalid Required Field Value", + "The email address \"%s\" is not a valid email address." % email, + dataset_name) # description # required check_required_string_field(item, "description", 1, dataset_name, errs) @@ -158,8 +165,9 @@ def do_validation(doc, errors_array): # keyword # required if isinstance(item.get("keyword"), (str, unicode)): - add_error(errs, 5, "Update Your File!", - "The keyword field used to be a string but now it must be an array.", dataset_name) + if not is_redacted(item.get("keyword")): + add_error(errs, 5, "Update Your File!", + "The keyword field used to be a string but now it must be an array.", dataset_name) elif check_required_field(item, "keyword", list, dataset_name, errs): for kw in item["keyword"]: if not isinstance(kw, (str, unicode)): @@ -171,7 +179,8 @@ def do_validation(doc, errors_array): # modified # required if check_required_string_field(item, "modified", 1, dataset_name, errs): - if not MODIFIED_REGEX_1.match(item['modified']) \ + if not is_redacted(item['modified']) \ + and not MODIFIED_REGEX_1.match(item['modified']) \ and not MODIFIED_REGEX_2.match(item['modified']) \ and not MODIFIED_REGEX_3.match(item['modified']): add_error(errs, 5, "Invalid Required Field Value", @@ -195,8 +204,8 @@ def do_validation(doc, errors_array): # Required-If-Applicable # dataQuality # Required-If-Applicable - if item.get("dataQuality") is None: - pass # not required + if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")): + pass # not required or REDACTED elif not isinstance(item["dataQuality"], bool): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'dataQuality' must be true or false, " @@ -207,35 +216,42 @@ def do_validation(doc, errors_array): if item.get("distribution") is None: pass # not required elif not isinstance(item["distribution"], list): - add_error(errs, 50, "Invalid Field Value (Optional Fields)", - "The field 'distribution' must be an array, if present.", dataset_name) + if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")): + pass + else: + add_error(errs, 50, "Invalid Field Value (Optional Fields)", + "The field 'distribution' must be an array, if present.", dataset_name) else: for j, dt in enumerate(item["distribution"]): + if isinstance(dt, (str, unicode)): + if is_redacted(dt): + continue distribution_name = dataset_name + (" distribution %d" % (j + 1)) # distribution - downloadURL # Required-If-Applicable - check_url_field(False, dt, "downloadURL", distribution_name, errs) + check_url_field(False, dt, "downloadURL", distribution_name, errs, True) # distribution - mediaType # Required-If-Applicable if 'downloadURL' in dt: if check_required_string_field(dt, "mediaType", 1, distribution_name, errs): - if not IANA_MIME_REGEX.match(dt["mediaType"]): + if not IANA_MIME_REGEX.match(dt["mediaType"]) \ + and not is_redacted(dt["mediaType"]): add_error(errs, 5, "Invalid Field Value", "The distribution mediaType \"%s\" is invalid. " "It must be in IANA MIME format." 
% dt["mediaType"], distribution_name) # distribution - accessURL # optional - check_url_field(False, dt, "accessURL", distribution_name, errs) + check_url_field(False, dt, "accessURL", distribution_name, errs, True) # distribution - conformsTo # optional - check_url_field(False, dt, "conformsTo", distribution_name, errs) + check_url_field(False, dt, "conformsTo", distribution_name, errs, True) # distribution - describedBy # optional - check_url_field(False, dt, "describedBy", distribution_name, errs) + check_url_field(False, dt, "describedBy", distribution_name, errs, True) # distribution - describedByType # optional - if dt.get("describedByType") is None: - pass # not required + if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")): + pass # not required or REDACTED elif not IANA_MIME_REGEX.match(dt["describedByType"]): add_error(errs, 5, "Invalid Field Value", "The describedByType \"%s\" is invalid. " @@ -255,7 +271,7 @@ def do_validation(doc, errors_array): check_required_string_field(dt, "title", 1, distribution_name, errs) # license # Required-If-Applicable - check_url_field(False, item, "license", dataset_name, errs) + check_url_field(False, item, "license", dataset_name, errs, True) # rights # Required-If-Applicable # TODO move to warnings @@ -269,8 +285,8 @@ def do_validation(doc, errors_array): "The field 'spatial' must be a string value if specified.", dataset_name) # temporal # Required-If-Applicable - if item.get("temporal") is None: - pass # not required + if item.get("temporal") is None or is_redacted(item.get("temporal")): + pass # not required or REDACTED elif not isinstance(item["temporal"], (str, unicode)): add_error(errs, 10, "Invalid Field Value (Optional Fields)", "The field 'temporal' must be a string value if specified.", dataset_name) @@ -286,19 +302,20 @@ def do_validation(doc, errors_array): # Expanded Fields # accrualPeriodicity # optional - if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES: + if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \ + and not is_redacted(item.get("accrualPeriodicity")): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'accrualPeriodicity' had an invalid value.", dataset_name) # conformsTo # optional - check_url_field(False, item, "conformsTo", dataset_name, errs) + check_url_field(False, item, "conformsTo", dataset_name, errs, True) # describedBy # optional - check_url_field(False, item, "describedBy", dataset_name, errs) + check_url_field(False, item, "describedBy", dataset_name, errs, True) # describedByType # optional - if item.get("describedByType") is None: - pass # not required + if item.get("describedByType") is None or is_redacted(item.get("describedByType")): + pass # not required or REDACTED elif not IANA_MIME_REGEX.match(item["describedByType"]): add_error(errs, 5, "Invalid Field Value", "The describedByType \"%s\" is invalid. 
" @@ -310,29 +327,29 @@ def do_validation(doc, errors_array): check_required_string_field(item, "isPartOf", 1, dataset_name, errs) # issued # optional - if item.get("issued") is not None: + if item.get("issued") is not None and not is_redacted(item.get("issued")): if not ISSUED_REGEX.match(item['issued']): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'issued' is not in a valid format.", dataset_name) # landingPage # optional - check_url_field(False, item, "landingPage", dataset_name, errs) + check_url_field(False, item, "landingPage", dataset_name, errs, True) # language # optional - if item.get("language") is None: - pass # not required + if item.get("language") is None or is_redacted(item.get("language")): + pass # not required or REDACTED elif not isinstance(item["language"], list): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'language' must be an array, if present.", dataset_name) else: for s in item["language"]: - if not LANGUAGE_REGEX.match(s): + if not LANGUAGE_REGEX.match(s) and not is_redacted(s): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'language' had an invalid language: \"%s\"" % s, dataset_name) # PrimaryITInvestmentUII # optional - if item.get("PrimaryITInvestmentUII") is None: - pass # not required + if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")): + pass # not required or REDACTED elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'PrimaryITInvestmentUII' must be a string " @@ -340,13 +357,16 @@ def do_validation(doc, errors_array): # references # optional if item.get("references") is None: - pass # not required + pass # not required or REDACTED elif not isinstance(item["references"], list): - add_error(errs, 50, "Invalid Field Value (Optional Fields)", - "The field 'references' must be an array, if present.", dataset_name) + if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")): + pass + else: + add_error(errs, 50, "Invalid Field Value (Optional Fields)", + "The field 'references' must be an array, if present.", dataset_name) else: for s in item["references"]: - if not URL_REGEX.match(s): + if not URL_REGEX.match(s) and not is_redacted(s): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'references' had an invalid URL: \"%s\"" % s, dataset_name) @@ -354,8 +374,8 @@ def do_validation(doc, errors_array): check_url_field(False, item, "systemOfRecords", dataset_name, errs) # theme #optional - if item.get("theme") is None: - pass # not required + if item.get("theme") is None or is_redacted(item.get("theme")): + pass # not required or REDACTED elif not isinstance(item["theme"], list): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'theme' must be an array.", dataset_name) @@ -374,7 +394,7 @@ def do_validation(doc, errors_array): err_type[1], # heading [err_item + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "") for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x)) - ])) + ])) def add_error(errs, severity, heading, description, context=None): @@ -426,11 +446,18 @@ def check_required_string_field(obj, field_name, min_length, dataset_name, errs) return True -def check_url_field(required, obj, field_name, dataset_name, errs): +def is_redacted(field): + if isinstance(field, (str, unicode)) and 
REDACTED_REGEX.match(field): + return True + return False + + +def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False): # checks that a required or optional field, if specified, looks like a URL if not required and (field_name not in obj or obj[field_name] is None): return True # not required, so OK if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs): return False # just checking data type + if allow_redacted and is_redacted(obj[field_name]): return True if not URL_REGEX.match(obj[field_name]): add_error(errs, 5, "Invalid Required Field Value", "The '%s' field has an invalid URL: \"%s\"." % (field_name, obj[field_name]), dataset_name)
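
The publisher lookup that PATCH 18 copies into several error paths, and that PATCH 19 then consolidates into `detect_publisher`, encodes a precedence rule: start from the `publisher` extra, then let `publisher_1` through `publisher_5` overwrite in turn, so the highest-numbered non-empty entry (the most specific sub-organization) wins. A minimal sketch of that rule, with a simplified stand-in for `JsonExportBuilder.strip_if_string`:

```python
def strip_if_string(val):
    # Simplified stand-in: strip strings, collapse empty results to None.
    if isinstance(val, str):
        val = val.strip()
    return val or None


def detect_publisher(extras):
    publisher = None
    if extras.get('publisher'):
        publisher = strip_if_string(extras['publisher'])
    # Later keys overwrite earlier ones, so publisher_5 beats publisher_1.
    for i in range(1, 6):
        key = 'publisher_' + str(i)
        if extras.get(key) and strip_if_string(extras[key]):
            publisher = strip_if_string(extras[key])
    return publisher


assert detect_publisher({}) is None
assert detect_publisher({'publisher': 'GSA'}) == 'GSA'
assert detect_publisher({'publisher': 'GSA', 'publisher_1': ' OCSIT '}) == 'OCSIT'
```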
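
PATCH 20's one-line "Sub-orgs export fix" deserves spelling out: in Python, `sub_packages = self, self.get_all_group_packages(group_id=sub)` builds a two-element tuple whose first element is the controller itself, so the loop over `sub_packages` iterated over `(controller, [packages...])` rather than over the package list. A tiny repro of the difference:

```python
class Controller(object):
    def get_all_group_packages(self, group_id):
        return ['pkg-a', 'pkg-b']


c = Controller()

# Pre-patch: the stray "self," makes this a tuple, not a list of packages.
broken = c, c.get_all_group_packages(group_id='sub')
assert broken == (c, ['pkg-a', 'pkg-b'])

# Post-patch: just the package list.
fixed = c.get_all_group_packages(group_id='sub')
assert fixed == ['pkg-a', 'pkg-b']
```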
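
PATCH 21 moves `currentPackageOrg` to module level in build_datajson.py and has `JsonExportBuilder.__init__` reset it with a `global` statement. One editorial caveat, not from the patch itself: a module-level name rebound from a method is shared by every instance, so constructing any new builder clears the value for all of them:

```python
current_package_org = None  # module-level state, as in build_datajson.py


class Builder(object):
    def __init__(self):
        # Rebinds the module-level name, not an instance attribute.
        global current_package_org
        current_package_org = None


def remember(org):
    global current_package_org
    current_package_org = org


remember('GSA')
Builder()  # any new instance wipes the shared value
assert current_package_org is None
```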
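
The `get_contact_point` hunk in PATCH 21 fixes more than the dropped `package` parameter: the old `raise KeyError(required_field)` reused the loop variable left over from the presence check, which is always `'contact_email'` once the loop finishes, so a missing contact *name* was reported as a bad email. A sketch of the corrected behavior as of PATCH 21 (before PATCH 22 adds redaction handling), again with the simplified `strip_if_string`:

```python
from collections import OrderedDict


def strip_if_string(val):
    if isinstance(val, str):
        val = val.strip()
    return val or None


def get_contact_point(extras):
    for required_field in ('contact_name', 'contact_email'):
        if required_field not in extras:
            raise KeyError(required_field)

    fn = strip_if_string(extras['contact_name'])
    if fn is None:
        raise KeyError('contact_name')  # was: KeyError(required_field)

    email = strip_if_string(extras['contact_email'])
    if email is None or '@' not in email:
        raise KeyError('contact_email')

    return OrderedDict([
        ('@type', 'vcard:Contact'),
        ('fn', fn),
        ('hasEmail', 'mailto:' + email),
    ])


cp = get_contact_point({'contact_name': 'Jane Doe',
                        'contact_email': 'jane.doe@agency.gov'})
assert cp['hasEmail'] == 'mailto:jane.doe@agency.gov'
```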
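
PATCH 21 also renames the controller argument from `format` to `fmt` in both `generate_output` methods. The point of the rename: `format` is a Python builtin, and a parameter with that name shadows it for the whole function body, which confuses linters and anyone who later needs the builtin there:

```python
def render(format):
    # Inside this body, format is the string argument, not the builtin;
    # calling format(1234.5, '.1f') here would raise TypeError.
    return format


def render_fixed(fmt):
    return format(1234.5, fmt)  # the builtin is available again


assert render('.1f') == '.1f'
assert render_fixed('.1f') == '1234.5'
```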
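
The `get_validator` cleanup in PATCH 21 drops two statements that sat unreachably after the `return` and loads the schema inside a `with` block, though reusing the name `schema` for both the file handle and the parsed document is a questionable trade. For context, a minimal sketch of the jsonschema pattern the plugin relies on (`Draft4Validator`, `FormatChecker`, `best_match`), with an inline schema standing in for `pod_schema/federal-v1.1/dataset.json`:

```python
from jsonschema import Draft4Validator, FormatChecker
from jsonschema.exceptions import best_match

schema = {
    'type': 'object',
    'required': ['title', 'modified'],
}
validator = Draft4Validator(schema, format_checker=FormatChecker())

# best_match picks the most relevant of possibly many validation errors,
# which is the single error is_valid() logs a warning for.
error = best_match(validator.iter_errors({'title': 'Demo dataset'}))
assert error is not None and 'modified' in error.message
```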
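
PATCH 22 threads one idiom through the validator: a value that is absent or fully redacted is skipped, and anything else must still satisfy the field's normal rule. `REDACTED_REGEX` anchors `[[REDACTED` at the start and `]]` at the end, so partially redacted strings do not qualify. A condensed sketch of the idiom (the media-type regex below is a simplified stand-in, not the validator's `IANA_MIME_REGEX`):

```python
import re

REDACTED_REGEX = re.compile(r'^(\[\[REDACTED).*?(\]\])$')  # as in the patch
MIME_REGEX = re.compile(r'^[-\w.+]+/[-\w.+]+$')            # simplified stand-in


def is_redacted(field):
    return isinstance(field, str) and bool(REDACTED_REGEX.match(field))


def check_media_type(value, errs):
    if value is None or is_redacted(value):
        return  # not required, or redacted: skip the format rule
    if not MIME_REGEX.match(value):
        errs.append('invalid media type: %r' % value)


errs = []
check_media_type(None, errs)              # absent: fine
check_media_type('[[REDACTED]]', errs)    # redacted: fine
check_media_type('text/csv', errs)        # valid: fine
check_media_type('spreadsheet', errs)     # neither: flagged
assert errs == ["invalid media type: 'spreadsheet'"]
```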
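
Finally, the redaction support in `check_url_field` is opt-in: the series passes `allow_redacted=True` for fields like `license`, `landingPage`, `conformsTo`, `describedBy`, and the distribution URLs, while `systemOfRecords` keeps the strict check. A sketch of the resulting decision order, with a crude URL pattern standing in for the validator's `URL_REGEX`:

```python
import re

URL_REGEX = re.compile(r'^https?://\S+$')  # crude stand-in
REDACTED_REGEX = re.compile(r'^(\[\[REDACTED).*?(\]\])$')


def is_redacted(field):
    return isinstance(field, str) and bool(REDACTED_REGEX.match(field))


def check_url_field(required, obj, field_name, errs, allow_redacted=False):
    value = obj.get(field_name)
    if not required and value is None:
        return True  # optional and absent
    if not isinstance(value, str):
        errs.append('%s: wrong type' % field_name)
        return False
    if allow_redacted and is_redacted(value):
        return True  # redaction accepted before the URL rule runs
    if not URL_REGEX.match(value):
        errs.append('%s: invalid URL %r' % (field_name, value))
        return False
    return True


errs = []
assert check_url_field(False, {'license': '[[REDACTED]]'}, 'license',
                       errs, allow_redacted=True)
assert not check_url_field(False, {'systemOfRecords': '[[REDACTED]]'},
                           'systemOfRecords', errs)
assert errs == ["systemOfRecords: invalid URL '[[REDACTED]]'"]
```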