From 491f9af1d1250b570f6c8ce75e712b2efe8c9879 Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Sun, 11 Aug 2024 11:55:00 +0100 Subject: [PATCH 1/7] FEAT: Commons IP schema validation - removed expansions of status enums; and - added a `ruleId` alias for validation `rule_id`. --- eark_validator/model/constants.py | 8 ++++---- eark_validator/model/validation_report.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/eark_validator/model/constants.py b/eark_validator/model/constants.py index c412eca..a1c70c5 100644 --- a/eark_validator/model/constants.py +++ b/eark_validator/model/constants.py @@ -34,9 +34,9 @@ SHOULD = 'SHOULD' MUST = 'MUST' UNKNOWN = 'Unknown' -INFORMATION = 'Information' -WARNING = 'Warning' +INFORMATION = 'Info' +WARNING = 'Warn' ERROR = 'Error' -NOTWELLFORMED = 'Not Well Formed' -WELLFORMED = 'Well Formed' +NOTWELLFORMED = 'NotWellFormed' +WELLFORMED = 'WellFormed' PACKAGE = 'Package' diff --git a/eark_validator/model/validation_report.py b/eark_validator/model/validation_report.py index 7ff4675..11e6096 100644 --- a/eark_validator/model/validation_report.py +++ b/eark_validator/model/validation_report.py @@ -33,7 +33,7 @@ from typing import List, Optional import uuid -from pydantic import BaseModel +from pydantic import BaseModel, Field from .package_details import InformationPackage from .specifications import Level @@ -84,14 +84,14 @@ class Location(BaseModel): description: str = '' class Result(BaseModel): - rule_id: str | None + rule_id: Optional[str] = Field(default=None, alias='ruleId') severity: Severity = Severity.UNKNOWN location: Location | None message: str | None @unique class StructureStatus(str, Enum): - """Enum covering information package validation statuses.""" + """Enum for information package structure status values.""" UNKNOWN = UNKNOWN # Package has basic parse / structure problems and can't be validated NOTWELLFORMED = NOTWELLFORMED From 3f8ad916148649229d28247e41c38dc7d7921583 Mon Sep 17 00:00:00 
2001 From: Carl Wilson Date: Tue, 27 Aug 2024 12:20:25 +0100 Subject: [PATCH 2/7] MAINT: Merge integration into current branch. --- eark_validator/mets.py | 450 +++++++++++----------- eark_validator/model/validation_report.py | 8 +- eark_validator/rules.py | 8 +- eark_validator/structure.py | 30 +- 4 files changed, 233 insertions(+), 263 deletions(-) diff --git a/eark_validator/mets.py b/eark_validator/mets.py index 56a24ff..6aaa870 100644 --- a/eark_validator/mets.py +++ b/eark_validator/mets.py @@ -1,228 +1,224 @@ -#!/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- -# -# E-ARK Validation -# Copyright (C) 2019 -# All rights reserved. -# -# Licensed to the E-ARK project under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The E-ARK project licenses -# this file to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -"""METS Schema validation.""" -import os -from pathlib import Path -from typing import Dict, List - -from lxml import etree - -from eark_validator.ipxml.schema import IP_SCHEMA -from eark_validator.ipxml.namespaces import Namespaces -from eark_validator.model.checksum import Checksum, ChecksumAlg -from eark_validator.model.metadata import FileEntry, MetsFile, MetsRoot -from eark_validator.model.validation_report import Location, Result -from eark_validator.utils import get_path -from eark_validator.const import NOT_FILE, NOT_VALID_FILE - -NAMESPACES : str = 'namespaces' -OBJID: str = 'objid' -LABEL: str = 'label' -TYPE: str = 'type' -PROFILE: str = 'profile' -OTHERTYPE: str = 'OTHERTYPE' - -START_ELE: str = 'start' -START_NS: str = 'start-ns' - -class MetsFiles(): - @staticmethod - def details_from_mets_root(namespaces: dict[str,str], root_element: etree.Element) -> MetsRoot: - return MetsRoot.model_validate({ - NAMESPACES: namespaces, - OBJID: root_element.get(OBJID.upper(), ''), - LABEL: root_element.get(LABEL.upper(), ''), - TYPE: root_element.get(TYPE.upper(), ''), - PROFILE: root_element.get(PROFILE.upper(), '') - }) - - @staticmethod - def from_file(mets_file: Path | str) -> MetsFile: - path: Path = get_path(mets_file, True) - if not path.is_file(): - raise ValueError(NOT_FILE.format(mets_file)) - ns: dict[str, str] = {} - entries: list[FileEntry] = [] - othertype = contentinformationtype = oaispackagetype = mets_root = '' - try: - parsed_mets = etree.iterparse(mets_file, events=[START_ELE, START_NS]) - for event, element in parsed_mets: - if event == START_NS: - prefix = element[0] - ns_uri = element[1] - ns[prefix] = ns_uri - if event == 'start': - if element.tag == Namespaces.METS.qualify('mets'): - mets_root: MetsRoot = MetsFiles.details_from_mets_root(ns, element) - othertype = element.get(Namespaces.CSIP.qualify(OTHERTYPE), '') - contentinformationtype = element.get( - Namespaces.CSIP.qualify('CONTENTINFORMATIONTYPE'), - '' - ) - elif element.tag == 
Namespaces.METS.qualify('metsHdr'): - oaispackagetype = element.get( - Namespaces.CSIP.qualify('OAISPACKAGETYPE'), '' - ) - elif element.tag in [ - Namespaces.METS.qualify('file'), - Namespaces.METS.qualify('mdRef') - ]: - entries.append(_parse_file_entry(element)) - except etree.XMLSyntaxError as ex: - raise ValueError(NOT_VALID_FILE.format(mets_file, 'XML')) from ex - return MetsFile.model_validate({ - 'root': mets_root, - 'oaispackagetype': oaispackagetype, - 'othertype': othertype, - 'contentinformationtype': contentinformationtype, - 'file_entries': entries - }) - -class MetsValidator(): - """Encapsulates METS schema validation.""" - def __init__(self, root: str): - self._validation_errors: List[Result] = [] - self._package_root: str = root - self._reps_mets: Dict[str , str] = {} - self._file_refs: List[FileEntry] = [] - - @property - def root(self) -> str: - return self._package_root - - @property - def validation_errors(self) -> List[Result]: - return self._validation_errors - - @property - def representations(self) -> List[str]: - return self._reps_mets.keys() - - @property - def representation_mets(self) -> List[str]: - return self._reps_mets.values() - - @property - def file_references(self) -> List[FileEntry]: - return self._file_refs - - @property - def is_valid(self) -> bool: - return len(self._validation_errors) == 0 - - def get_mets_path(self, rep_name: str) -> str: - return self._reps_mets[rep_name] - - def validate_mets(self, mets: str) -> bool: - ''' - Validates a Mets file. The Mets file is parsed with etree.iterparse(), - which allows event-driven parsing of large files. On certain events/conditions - actions are taken, like file validation or adding Mets files found inside - representations to a list so that they will be evaluated later on. - - @param mets: Path leading to a Mets file that will be evaluated. - @return: Boolean validation result. - ''' - # Handle relative package paths for representation METS files. 
- self._package_root, mets = _handle_rel_paths(self._package_root, mets) - try: - parsed_mets = etree.iterparse(mets, schema=IP_SCHEMA.get('csip')) - for _, element in parsed_mets: - self._process_element(element) - except etree.XMLSyntaxError as synt_err: - self._validation_errors.append( - Result.model_validate({ - 'rule_id': 'XML-1', - 'location': Location.model_validate({ - 'context': synt_err.filename, - 'test': str(synt_err.lineno), - 'description': str(synt_err.offset) - }), - 'message': f'File {mets} is not valid XML. {synt_err.msg}', - 'severity': 'Error' - }) - ) - return len(self._validation_errors) == 0 - - def _process_element(self, element: etree.Element) -> None: - # Define what to do with specific tags. - if element.tag == Namespaces.METS.qualify('div') and \ - element.attrib['LABEL'].lower().startswith('representations/'): - self._process_rep_div(element) - return - if element.tag in [ Namespaces.METS.qualify('file'), Namespaces.METS.qualify('mdRef') ]: - self._file_refs.append(_parse_file_entry(element)) - - def _process_rep_div(self, element: etree.Element) -> None: - rep = element.attrib['LABEL'].rsplit('/', 1)[1] - for child in element.getchildren(): - if child.tag == Namespaces.METS.qualify('mptr'): - self._reps_mets.update({ - rep: child.attrib[Namespaces.XLINK.qualify('href')] - }) - -def _parse_file_entry(element: etree.Element) -> FileEntry: - """Create a FileItem from an etree element.""" - return FileEntry.model_validate({ - 'path': _path_from_xml_element(element), - 'size': int(element.attrib['SIZE']), - 'checksum': _checksum_from_mets_element(element), - 'mimetype': element.attrib.get('MIMETYPE') or '' - }) - -def _path_from_xml_element(element: etree.Element) -> str: - loc_ele: etree.Element = element - if element.tag in [ Namespaces.METS.qualify('file'), 'file' ]: - tag: str = Namespaces.METS.qualify('FLocat') if hasattr(element, 'nsmap') else 'FLocat' - loc_ele = element.find(tag) - if element.tag in [ - 
Namespaces.METS.qualify('file'), - 'file', Namespaces.METS.qualify('mdRef'), - 'mdRef' - ]: - return _get_path_attrib(loc_ele) - raise ValueError(f'Element {element.tag} is not a METS:file or METS:mdRef element.') - -def _get_path_attrib(element: etree.Element) -> str: - """Get the path attribute from an etree element.""" - attrib_name = Namespaces.XLINK.qualify('href') if hasattr(element, 'nsmap') else 'href' - return element.attrib.get(attrib_name) or '' - -def _checksum_from_mets_element(element: etree.Element) -> Checksum: - """Create a Checksum from an etree element.""" - # Get the child flocat element and grab the href attribute. - return Checksum.model_validate({ - 'algorithm': ChecksumAlg.from_string(element.attrib['CHECKSUMTYPE']), - 'value': element.attrib['CHECKSUM']}, - strict=True) - -def _handle_rel_paths(rootpath: str, metspath: str) -> tuple[str, str]: - if metspath.startswith('file:///') or os.path.isabs(metspath): - return metspath.rsplit('/', 1)[0], metspath - if metspath.startswith('file://./'): - relpath = os.path.join(rootpath, metspath[9:]) - else: - relpath = os.path.join(rootpath, metspath) - return relpath.rsplit('/', 1)[0], relpath +# +# E-ARK Validation +# Copyright (C) 2019 +# All rights reserved. +# +# Licensed to the E-ARK project under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The E-ARK project licenses +# this file to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""METS Schema validation.""" +import os +from pathlib import Path +from typing import Dict, List + +from lxml import etree + +from eark_validator.ipxml.schema import IP_SCHEMA +from eark_validator.ipxml.namespaces import Namespaces +from eark_validator.model.checksum import Checksum, ChecksumAlg +from eark_validator.model.metadata import FileEntry, MetsFile, MetsRoot +from eark_validator.model.validation_report import Result +from eark_validator.utils import get_path +from eark_validator.const import NOT_FILE, NOT_VALID_FILE + +NAMESPACES : str = 'namespaces' +OBJID: str = 'objid' +LABEL: str = 'label' +TYPE: str = 'type' +PROFILE: str = 'profile' +OTHERTYPE: str = 'OTHERTYPE' + +START_ELE: str = 'start' +START_NS: str = 'start-ns' + +class MetsFiles(): + @staticmethod + def details_from_mets_root(namespaces: dict[str,str], root_element: etree.Element) -> MetsRoot: + return MetsRoot.model_validate({ + NAMESPACES: namespaces, + OBJID: root_element.get(OBJID.upper(), ''), + LABEL: root_element.get(LABEL.upper(), ''), + TYPE: root_element.get(TYPE.upper(), ''), + PROFILE: root_element.get(PROFILE.upper(), '') + }) + + @staticmethod + def from_file(mets_file: Path | str) -> MetsFile: + path: Path = get_path(mets_file, True) + if not path.is_file(): + raise ValueError(NOT_FILE.format(mets_file)) + ns: dict[str, str] = {} + entries: list[FileEntry] = [] + othertype = contentinformationtype = oaispackagetype = mets_root = '' + try: + parsed_mets = etree.iterparse(mets_file, events=[START_ELE, START_NS]) + for event, element in parsed_mets: + if event == START_NS: + prefix = element[0] + ns_uri = element[1] + ns[prefix] = ns_uri + if event == 'start': + if element.tag == Namespaces.METS.qualify('mets'): + mets_root: MetsRoot = MetsFiles.details_from_mets_root(ns, element) + othertype = element.get(Namespaces.CSIP.qualify(OTHERTYPE), '') + contentinformationtype = 
element.get( + Namespaces.CSIP.qualify('CONTENTINFORMATIONTYPE'), + '' + ) + elif element.tag == Namespaces.METS.qualify('metsHdr'): + oaispackagetype = element.get( + Namespaces.CSIP.qualify('OAISPACKAGETYPE'), '' + ) + elif element.tag in [ + Namespaces.METS.qualify('file'), + Namespaces.METS.qualify('mdRef') + ]: + entries.append(_parse_file_entry(element)) + except etree.XMLSyntaxError as ex: + raise ValueError(NOT_VALID_FILE.format(mets_file, 'XML')) from ex + return MetsFile.model_validate({ + 'root': mets_root, + 'oaispackagetype': oaispackagetype, + 'othertype': othertype, + 'contentinformationtype': contentinformationtype, + 'file_entries': entries + }) + +class MetsValidator(): + """Encapsulates METS schema validation.""" + def __init__(self, root: str): + self._validation_errors: List[Result] = [] + self._package_root: str = root + self._reps_mets: Dict[str , str] = {} + self._file_refs: List[FileEntry] = [] + + @property + def root(self) -> str: + return self._package_root + + @property + def validation_errors(self) -> List[Result]: + return self._validation_errors + + @property + def representations(self) -> List[str]: + return self._reps_mets.keys() + + @property + def representation_mets(self) -> List[str]: + return self._reps_mets.values() + + @property + def file_references(self) -> List[FileEntry]: + return self._file_refs + + @property + def is_valid(self) -> bool: + return len(self._validation_errors) == 0 + + def get_mets_path(self, rep_name: str) -> str: + return self._reps_mets[rep_name] + + def validate_mets(self, mets: str) -> bool: + ''' + Validates a Mets file. The Mets file is parsed with etree.iterparse(), + which allows event-driven parsing of large files. On certain events/conditions + actions are taken, like file validation or adding Mets files found inside + representations to a list so that they will be evaluated later on. + + @param mets: Path leading to a Mets file that will be evaluated. + @return: Boolean validation result. 
+ ''' + # Handle relative package paths for representation METS files. + self._package_root, mets = _handle_rel_paths(self._package_root, mets) + try: + parsed_mets = etree.iterparse(mets, schema=IP_SCHEMA.get('csip')) + for _, element in parsed_mets: + self._process_element(element) + except etree.XMLSyntaxError as synt_err: + self._validation_errors.append( + Result.model_validate({ + 'rule_id': 'XML-1', + 'location': synt_err.filename + str(synt_err.lineno) + str(synt_err.offset), + 'message': f'File {mets} is not valid XML. {synt_err.msg}', + 'severity': 'Error' + }) + ) + return len(self._validation_errors) == 0 + + def _process_element(self, element: etree.Element) -> None: + # Define what to do with specific tags. + if element.tag == Namespaces.METS.qualify('div') and \ + element.attrib['LABEL'].lower().startswith('representations/'): + self._process_rep_div(element) + return + if element.tag in [ Namespaces.METS.qualify('file'), Namespaces.METS.qualify('mdRef') ]: + self._file_refs.append(_parse_file_entry(element)) + + def _process_rep_div(self, element: etree.Element) -> None: + rep = element.attrib['LABEL'].rsplit('/', 1)[1] + for child in element.getchildren(): + if child.tag == Namespaces.METS.qualify('mptr'): + self._reps_mets.update({ + rep: child.attrib[Namespaces.XLINK.qualify('href')] + }) + +def _parse_file_entry(element: etree.Element) -> FileEntry: + """Create a FileItem from an etree element.""" + return FileEntry.model_validate({ + 'path': _path_from_xml_element(element), + 'size': int(element.attrib['SIZE']), + 'checksum': _checksum_from_mets_element(element), + 'mimetype': element.attrib.get('MIMETYPE') or '' + }) + +def _path_from_xml_element(element: etree.Element) -> str: + loc_ele: etree.Element = element + if element.tag in [ Namespaces.METS.qualify('file'), 'file' ]: + tag: str = Namespaces.METS.qualify('FLocat') if hasattr(element, 'nsmap') else 'FLocat' + loc_ele = element.find(tag) + if element.tag in [ + 
Namespaces.METS.qualify('file'), + 'file', Namespaces.METS.qualify('mdRef'), + 'mdRef' + ]: + return _get_path_attrib(loc_ele) + raise ValueError(f'Element {element.tag} is not a METS:file or METS:mdRef element.') + +def _get_path_attrib(element: etree.Element) -> str: + """Get the path attribute from an etree element.""" + attrib_name = Namespaces.XLINK.qualify('href') if hasattr(element, 'nsmap') else 'href' + return element.attrib.get(attrib_name) or '' + +def _checksum_from_mets_element(element: etree.Element) -> Checksum: + """Create a Checksum from an etree element.""" + # Get the child flocat element and grab the href attribute. + return Checksum.model_validate({ + 'algorithm': ChecksumAlg.from_string(element.attrib['CHECKSUMTYPE']), + 'value': element.attrib['CHECKSUM']}, + strict=True) + +def _handle_rel_paths(rootpath: str, metspath: str) -> tuple[str, str]: + if metspath.startswith('file:///') or os.path.isabs(metspath): + return metspath.rsplit('/', 1)[0], metspath + if metspath.startswith('file://./'): + relpath = os.path.join(rootpath, metspath[9:]) + else: + relpath = os.path.join(rootpath, metspath) + return relpath.rsplit('/', 1)[0], relpath diff --git a/eark_validator/model/validation_report.py b/eark_validator/model/validation_report.py index 11e6096..eebc6ba 100644 --- a/eark_validator/model/validation_report.py +++ b/eark_validator/model/validation_report.py @@ -77,16 +77,10 @@ def from_level(cls, level: Level) -> 'Severity': return Severity.WARNING return Severity.INFORMATION -class Location(BaseModel): - """All details of the location of an error.""" - context: str = '' - test: str = '' - description: str = '' - class Result(BaseModel): rule_id: Optional[str] = Field(default=None, alias='ruleId') severity: Severity = Severity.UNKNOWN - location: Location | None + location: str | None message: str | None @unique diff --git a/eark_validator/rules.py b/eark_validator/rules.py index 5382752..92c82b7 100644 --- a/eark_validator/rules.py +++ 
b/eark_validator/rules.py @@ -29,7 +29,7 @@ from lxml import etree as ET from eark_validator.ipxml.schematron import SchematronRuleset, SVRL_NS, get_schematron_path -from eark_validator.model.validation_report import Location, Result +from eark_validator.model.validation_report import Result from eark_validator.specifications.specification import EarkSpecification, Specification, SpecificationType, SpecificationVersion from eark_validator.const import NO_PATH, NOT_FILE from eark_validator.model import Severity @@ -112,11 +112,7 @@ def from_element(rule: ET.Element, failed_assert: ET.Element) -> Result: severity = Severity.from_role(failed_assert.get('role', Severity.ERROR)) location = failed_assert.get('location') message = failed_assert.find(SVRL_NS + 'text').text - location = Location.model_validate({ - 'context':context, - 'test':test, - 'description': location - }) + location = context + test + location return Result.model_validate({ 'rule_id': rule_id, 'location':location, 'message':message, 'severity':severity }) diff --git a/eark_validator/structure.py b/eark_validator/structure.py index 8bbecb9..31762cf 100644 --- a/eark_validator/structure.py +++ b/eark_validator/structure.py @@ -26,7 +26,6 @@ import os from pathlib import Path from typing import Dict, List, Optional, Set, Tuple -from eark_validator.model.validation_report import Location from eark_validator.specifications.struct_reqs import REQUIREMENTS from eark_validator.infopacks.package_handler import PackageHandler, PackageError @@ -134,10 +133,7 @@ def get_test_results(self) -> StructResults: results: List[Result] = self.get_root_results() results = results + self.get_package_results() for name, tests in self.representations.items(): - location = Location.model_validate({ - 'context': str(name), - 'description': 'representation' - }) + location = str(name) + ' representation' if not tests.has_data(): results.append(test_result_from_id(11, location)) if not tests.has_mets(): @@ -149,18 +145,15 @@ def 
get_test_results(self) -> StructResults: 'messages': results }) - def get_representations(self) -> List[Representation]: - reps: List[Representation] = [] + def get_representations(self) -> Dict[str, str]: + reps: Dict[str, str] = {} for rep in self.representations: # pylint: disable=C0201 reps.append(Representation.model_validate({ 'name': rep })) return reps def get_root_results(self) -> List[Result]: results: List[Result] = [] - location: Location = Location.model_validate({ - 'context': 'root', - 'description': self.name - }) + location: str = 'root ' + self.name if not self.parser.is_archive: results.append(test_result_from_id(3, location)) if not self.parser.has_mets(): @@ -201,19 +194,13 @@ def _get_schema_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_schemas(): return None - return test_result_from_id(15, Location.model_validate({ - 'context': 'root', - 'description': self.name - })) + return test_result_from_id(15, 'root ' + self.name) def _get_dox_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_documentation(): return None - return test_result_from_id(16, Location.model_validate({ - 'context': 'root', - 'description': self.name - })) + return test_result_from_id(16, 'root ' + self.name) @classmethod def get_status(cls, results: List[Result]) -> StructureStatus: @@ -252,10 +239,7 @@ def get_bad_path_results(path) -> StructResults: }) def _get_str1_result_list(name: str) -> List[Result]: - return [ test_result_from_id(1, Location.model_validate({ - 'context': 'root', - 'description': str(name) - })) ] + return [ test_result_from_id(1, 'root ' + str(name)) ] def validate(to_validate) -> Tuple[bool, StructResults]: try: From 834fdfa400a23129777c863aaff72f065125fd8e Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Tue, 27 Aug 2024 20:59:09 +0100 Subject: [PATCH 3/7] FIX: Deserialisation of rule_id - added a `model_config` parameter to allow both `rule_id` and `ruleId` as 
keys in Pydantic model validation; - fixes issue with deserialisation of `rule_id` in `Result` model. --- eark_validator/model/validation_report.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/eark_validator/model/validation_report.py b/eark_validator/model/validation_report.py index eebc6ba..34a94f5 100644 --- a/eark_validator/model/validation_report.py +++ b/eark_validator/model/validation_report.py @@ -33,7 +33,7 @@ from typing import List, Optional import uuid -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from .package_details import InformationPackage from .specifications import Level @@ -78,11 +78,13 @@ def from_level(cls, level: Level) -> 'Severity': return Severity.INFORMATION class Result(BaseModel): - rule_id: Optional[str] = Field(default=None, alias='ruleId') + model_config = ConfigDict(populate_by_name=True) + rule_id: Optional[str] = Field(validation_alias='ruleId') severity: Severity = Severity.UNKNOWN location: str | None message: str | None + @unique class StructureStatus(str, Enum): """Enum for information package structure status values.""" From 2935fdb8caebfe02d910aff7ab1522075347a899 Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Tue, 27 Aug 2024 21:05:57 +0100 Subject: [PATCH 4/7] FEAT: Convert commons-ip representations - added a `model_validator` to the `InformationPackage` class to convert incoming JSON dictionary to a `List` for now; - added Pydantic config to `Result` to allow multiple names for `rule_id` during validation; - reverted change to dictionary as it was not necessary; - added tests and data for deserialisation of commons-ip types; and - fixed minor compiler warnings. 
--- eark_validator/model/package_details.py | 18 +++++++- eark_validator/packages.py | 2 +- eark_validator/structure.py | 13 +++--- tests/resources/json/__init__.py | 30 +++++++++++++ tests/resources/json/commons-ip-report.json | 49 +++++++++++++++++++++ tests/rules_test.py | 12 ++++- tests/specification_test.py | 13 +++--- 7 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 tests/resources/json/__init__.py create mode 100644 tests/resources/json/commons-ip-report.json diff --git a/eark_validator/model/package_details.py b/eark_validator/model/package_details.py index af724c9..853a52b 100644 --- a/eark_validator/model/package_details.py +++ b/eark_validator/model/package_details.py @@ -27,13 +27,14 @@ E-ARK : Information Package Validation Information Package Package Details type """ -from typing import List, Optional +from typing import Any, List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, ValidationInfo, model_validator from .checksum import Checksum from .metadata import MetsFile + class PackageDetails(BaseModel): label: str = '' oaispackagetype: str = '' @@ -50,3 +51,16 @@ class InformationPackage(BaseModel): mets: Optional[MetsFile] = None package: Optional[PackageDetails] = None representations: List[Representation] = [] + + @model_validator(mode='before') + @classmethod + def convert_dict(cls, data: Any) -> list[Representation]: + representations = data.get('representations') + if isinstance(representations, dict): + # If this is a dict type then it's a commons-ip type, coerce to list + reps : list[Representation] = [] + for k, v in representations.items(): + reps.append(Representation(name=v,)) + data['representations'] = reps + # Return the reps for further validation. 
+ return data diff --git a/eark_validator/packages.py b/eark_validator/packages.py index 0143d29..404bbcb 100644 --- a/eark_validator/packages.py +++ b/eark_validator/packages.py @@ -86,7 +86,7 @@ def version(self) -> SpecificationVersion: return self._version @classmethod - def validate(self, version: SpecificationVersion, to_validate: Path) -> ValidationReport: + def validate(cls, version: SpecificationVersion, to_validate: Path) -> ValidationReport: """Returns the validation report that results from validating the path to_validate as a folder. The method does not validate archive files.""" is_struct_valid, struct_results = structure.validate(to_validate) diff --git a/eark_validator/structure.py b/eark_validator/structure.py index 31762cf..04e4f6b 100644 --- a/eark_validator/structure.py +++ b/eark_validator/structure.py @@ -39,6 +39,7 @@ METS_NAME = 'METS.xml' STR_REQ_PREFIX = 'CSIPSTR' +ROOT = 'root ' DIR_NAMES = { 'DATA': 'data', 'DESC': 'descriptive', @@ -145,15 +146,15 @@ def get_test_results(self) -> StructResults: 'messages': results }) - def get_representations(self) -> Dict[str, str]: - reps: Dict[str, str] = {} + def get_representations(self) -> List[Representation]: + reps: List[Representation] = [] for rep in self.representations: # pylint: disable=C0201 reps.append(Representation.model_validate({ 'name': rep })) return reps def get_root_results(self) -> List[Result]: results: List[Result] = [] - location: str = 'root ' + self.name + location: str = ROOT + self.name if not self.parser.is_archive: results.append(test_result_from_id(3, location)) if not self.parser.has_mets(): @@ -194,13 +195,13 @@ def _get_schema_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_schemas(): return None - return test_result_from_id(15, 'root ' + self.name) + return test_result_from_id(15, ROOT + self.name) def _get_dox_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_documentation(): 
return None - return test_result_from_id(16, 'root ' + self.name) + return test_result_from_id(16, ROOT + self.name) @classmethod def get_status(cls, results: List[Result]) -> StructureStatus: @@ -239,7 +240,7 @@ def get_bad_path_results(path) -> StructResults: }) def _get_str1_result_list(name: str) -> List[Result]: - return [ test_result_from_id(1, 'root ' + str(name)) ] + return [ test_result_from_id(1, ROOT + str(name)) ] def validate(to_validate) -> Tuple[bool, StructResults]: try: diff --git a/tests/resources/json/__init__.py b/tests/resources/json/__init__.py new file mode 100644 index 0000000..d9d1168 --- /dev/null +++ b/tests/resources/json/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# flake8: noqa +# -*- coding: utf-8 -*- +# +# E-ARK Validation +# Copyright (C) 2019 +# All rights reserved. +# +# Licensed to the E-ARK project under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The E-ARK project licenses +# this file to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +""" +E-ARK : Information Package Validation + JSON report representation +""" diff --git a/tests/resources/json/commons-ip-report.json b/tests/resources/json/commons-ip-report.json new file mode 100644 index 0000000..815d66a --- /dev/null +++ b/tests/resources/json/commons-ip-report.json @@ -0,0 +1,49 @@ +{ + "package" : { + "details" : { + "name" : "733dc055-34be-4260-85c7-5549a7083031.zip", + "checksums" : [ { + "algorithm" : "SHA1", + "value" : "B8EFAE7679EF63CDB9EF80B643672EE31E1C2898" + } ] + }, + "profile" : { + "type" : null, + "name" : "733dc055-34be-4260-85c7-5549a7083031.zip", + "version" : "CSIP-" + }, + "representations" : { + "name" : "733dc055-34be-4260-85c7-5549a7083031.zip" + } + }, + "uid" : "d991e991-8abc-4ad3-984c-c936de6c53d7", + "structure" : { + "status" : "WellFormed", + "messages" : [ { + "ruleId" : "CSIPSTR16", + "location" : "", + "message" : "There is no documentation folder in the representation folder 733dc055-34be-4260-85c7-5549a7083031/representations/rep1. 
", + "severity" : "Warn" + } ] + }, + "metadata" : { + "schemaResults" : { + "status" : "VALID", + "messages" : [ ] + }, + "schematronResults" : { + "status" : "VALID", + "messages" : [ { + "ruleId" : "CSIP17", + "location" : "mets/dmdSec", + "message" : "Doesn't have files in metadata/descriptive folder but have dmdSec in 733dc055-34be-4260-85c7-5549a7083031/representations/rep1/METS.xml; Put the files under metadata folder ", + "severity" : "Warn" + }, { + "ruleId" : "CSIP45", + "location" : "mets/amdSec/rightsMD", + "message" : "Individual representations should state their specific rights in their representation METS file (733dc055-34be-4260-85c7-5549a7083031/representations/rep1/METS.xml) Individual representations should state their specific rights in their representation METS file (Root METS.xml) ", + "severity" : "Info" + } ] + } + } +} diff --git a/tests/rules_test.py b/tests/rules_test.py index cf68634..3d3f852 100644 --- a/tests/rules_test.py +++ b/tests/rules_test.py @@ -31,13 +31,16 @@ from pydantic import ValidationError from eark_validator import rules as SC -from eark_validator.model.validation_report import Severity, Result +from eark_validator.model.validation_report import Severity, Result, ValidationReport from eark_validator.specifications.specification import SpecificationType, SpecificationVersion import tests.resources.schematron as SCHEMATRON import tests.resources.xml as XML +import tests.resources.json as JSON TEST_RES = 'tests.resources' TEST_RES_XML = TEST_RES + '.xml' +TEST_RES_JSON = TEST_RES + '.json' +COMMONS_IP_JSON = str(files(JSON).joinpath('commons-ip-report.json')) PERSON_PATH = str(files(SCHEMATRON).joinpath('person.xml')) NOT_FOUND_PATH = str(files(SCHEMATRON).joinpath('not-found.xml')) EMPTY_FILE_PATH = str(files(TEST_RES).joinpath('empty.file')) @@ -225,6 +228,13 @@ def test_get_bad_key(self): result = profile.get_result('badkey') self.assertIsNone(result) + def test_deserialise_commons_ip_report(self): + file_name = 
COMMONS_IP_JSON + with open(file_name, 'r', encoding='utf-8') as _f: + contents = _f.read() + result: ValidationReport = ValidationReport.model_validate_json(contents) + self.assertIsNotNone(result) + class SeverityTest(str, Enum): NOT_SEV = 'NOT_SEV' diff --git a/tests/specification_test.py b/tests/specification_test.py index 785163e..cfef0a0 100644 --- a/tests/specification_test.py +++ b/tests/specification_test.py @@ -23,6 +23,7 @@ # under the License. # +from typing import Optional import unittest from lxml import etree as ET @@ -72,12 +73,12 @@ def test_from_rule_no(self): class SpecificationTypeTest(unittest.TestCase): def test_value(self): - type = SpecificationType.CSIP - self.assertEqual(type.value, 'E-ARK-CSIP') - type = SpecificationType.SIP - self.assertEqual(type.value, 'E-ARK-SIP') - type = SpecificationType.DIP - self.assertEqual(type.value, 'E-ARK-DIP') + spec_type: SpecificationType = SpecificationType.CSIP + self.assertEqual(spec_type.value, 'E-ARK-CSIP') + spec_type = SpecificationType.SIP + self.assertEqual(spec_type.value, 'E-ARK-SIP') + spec_type = SpecificationType.DIP + self.assertEqual(spec_type.value, 'E-ARK-DIP') class SpecificationVersionTest(unittest.TestCase): def test_value(self): From 56deacc7b733571babcc3d550815163df165fec5 Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Wed, 28 Aug 2024 14:29:58 +0100 Subject: [PATCH 5/7] FEAT: Final commons-ip compatibility tweaks - refactored `MetadataResults` to match `commons-ip`, it's probably better as well; - moved `name` from `InformationPackage` to `PackageDetails` class; - renamed `InformationPackage.package` to `InformationPackage.details`; - renamed existing `ValidationReport.convert_dict` validator to `ValidationReport.convert_representations_dict` (more explicit); - added a second validator, `ValidationReport.convert_checksum_ids`, to convert `commons-ip` checksum ids to `eark_validator` hyphenated form; - `is_valid` convenience property to `ValidationReport` class; - string constants 
for 'VALID' and 'INVALID'; and - fixed tests to accommodate. --- .../infopacks/information_package.py | 4 +-- eark_validator/model/constants.py | 2 ++ eark_validator/model/package_details.py | 22 ++++++++++++-- eark_validator/model/validation_report.py | 29 +++++++++++++++---- eark_validator/packages.py | 15 ++++++---- tests/ips_test.py | 2 +- 6 files changed, 57 insertions(+), 17 deletions(-) diff --git a/eark_validator/infopacks/information_package.py b/eark_validator/infopacks/information_package.py index 7f04aca..b88e247 100644 --- a/eark_validator/infopacks/information_package.py +++ b/eark_validator/infopacks/information_package.py @@ -70,6 +70,7 @@ def details_from_mets_file(mets_file: Path) -> PackageDetails: except (etree.XMLSyntaxError, AttributeError) as ex: raise ValueError(NOT_VALID_FILE.format(mets_file, 'XML')) from ex return PackageDetails.model_validate({ + 'name': mets_file.parent.stem, 'label': label, 'othertype': othertype, CONTENTINFORMATIONTYPE: contentinformationtype, @@ -87,9 +88,8 @@ def from_path(package_path: Path) -> InformationPackage: raise ValueError('No METS file found in package') mets: MetsFile = MetsFiles.from_file(to_parse.joinpath(METS_FILE)) return InformationPackage.model_validate({ - 'name': to_parse.stem, METS: mets, - 'package': InformationPackages.details_from_mets_file(to_parse.joinpath(METS_FILE)) + 'details': InformationPackages.details_from_mets_file(to_parse.joinpath(METS_FILE)) }) @staticmethod diff --git a/eark_validator/model/constants.py b/eark_validator/model/constants.py index a1c70c5..05b0211 100644 --- a/eark_validator/model/constants.py +++ b/eark_validator/model/constants.py @@ -40,3 +40,5 @@ NOTWELLFORMED = 'NotWellFormed' WELLFORMED = 'WellFormed' PACKAGE = 'Package' +VALID = 'VALID' +INVALID = 'INVALID' diff --git a/eark_validator/model/package_details.py b/eark_validator/model/package_details.py index 853a52b..23ca48a 100644 --- a/eark_validator/model/package_details.py +++ 
b/eark_validator/model/package_details.py @@ -36,6 +36,7 @@ class PackageDetails(BaseModel): + name: str = '' label: str = '' oaispackagetype: str = '' othertype: str = '' @@ -47,14 +48,13 @@ class Representation(BaseModel): name: Optional[str] = '' class InformationPackage(BaseModel): - name: str = '' mets: Optional[MetsFile] = None - package: Optional[PackageDetails] = None + details: Optional[PackageDetails] = None representations: List[Representation] = [] @model_validator(mode='before') @classmethod - def convert_dict(cls, data: Any) -> list[Representation]: + def convert_representations_dict(cls, data: Any) -> list[Representation]: representations = data.get('representations') if isinstance(representations, dict): # If this is a dict type then it's a commons-ip type, coerce to list @@ -64,3 +64,19 @@ def convert_dict(cls, data: Any) -> list[Representation]: data['representations'] = reps # Return the reps for further validation. return data + + @model_validator(mode='before') + @classmethod + def convert_checksum_ids(cls, data: Any) -> list[Representation]: + details = data.get('details', {}) + if isinstance(details, dict): + incoming_checksums = details.get('checksums', []) + checksums : list[Checksum] = [] + for checksum in incoming_checksums: + alg_name = checksum.get('algorithm') + if alg_name and alg_name.startswith('SHA') and '-' not in alg_name: + alg_name = f'{alg_name[:3]}-{alg_name[3:]}' + checksums.append(Checksum(algorithm=alg_name, value=checksum.get('value'))) + data['details']['checksums'] = checksums + # Return the reps for further validation. 
+ return data diff --git a/eark_validator/model/validation_report.py b/eark_validator/model/validation_report.py index 34a94f5..05aa2a2 100644 --- a/eark_validator/model/validation_report.py +++ b/eark_validator/model/validation_report.py @@ -38,7 +38,7 @@ from .package_details import InformationPackage from .specifications import Level from .constants import ( - UNKNOWN, INFORMATION, WARNING, ERROR, WELLFORMED, NOTWELLFORMED) + UNKNOWN, INFORMATION, WARNING, ERROR, WELLFORMED, NOTWELLFORMED, VALID, INVALID) @unique class Severity(str, Enum): @@ -110,12 +110,31 @@ def warnings(self) -> List[Result]: def infos(self) -> List[Result]: return [m for m in self.messages if m.severity == Severity.INFORMATION] -class MetatdataResults(BaseModel): - schema_results: List[Result] = [] - schematron_results: List[Result] = [] +@unique +class MetadataStatus(str, Enum): + """Enum for information package metadata status values.""" + UNKNOWN = UNKNOWN + # Package metadata is valid according to the schema/schematron rules + VALID = VALID + # Package metadata is invalid according to the schema/schematron rules + INVALID = INVALID + +class MetadataResults(BaseModel): + status: MetadataStatus = MetadataStatus.UNKNOWN + messages: List[Result] = [] + +class MetatdataResultSet(BaseModel): + model_config = ConfigDict(populate_by_name=True) + schema_results: MetadataResults = Field(validation_alias='schemaResults') + model_config = ConfigDict(populate_by_name=True) + schematron_results: MetadataResults = Field(validation_alias='schematronResults') class ValidationReport(BaseModel): uid: uuid.UUID = uuid.uuid4() structure: Optional[StructResults] = None - metadata: Optional[MetatdataResults] = None + metadata: Optional[MetatdataResultSet] = None package: Optional[InformationPackage] = None + + @property + def is_valid(self) -> bool: + return self.structure.status == StructureStatus.WELLFORMED and self.metadata.schema_results.status == MetadataStatus.VALID and 
self.metadata.schematron_results.status == MetadataStatus.VALID diff --git a/eark_validator/packages.py b/eark_validator/packages.py index 404bbcb..6bb28fd 100644 --- a/eark_validator/packages.py +++ b/eark_validator/packages.py @@ -35,7 +35,7 @@ from eark_validator.mets import MetsValidator from eark_validator.model import ValidationReport from eark_validator.model.package_details import InformationPackage -from eark_validator.model.validation_report import MetatdataResults +from eark_validator.model.validation_report import MetadataResults, MetadataStatus, MetatdataResultSet, Result, Severity from eark_validator.specifications.specification import SpecificationType, SpecificationVersion METS: str = 'METS.xml' @@ -100,14 +100,14 @@ def validate(cls, version: SpecificationVersion, to_validate: Path) -> Validatio results = csip_profile.get_all_results() package: InformationPackage = InformationPackages.from_path(to_validate) - if package.package.oaispackagetype in ['SIP', 'DIP']: - profile = SC.ValidationProfile(SpecificationType.from_string(package.package.oaispackagetype), version) + if package.details.oaispackagetype in ['SIP', 'DIP']: + profile = SC.ValidationProfile(SpecificationType.from_string(package.details.oaispackagetype), version) profile.validate(to_validate.joinpath(METS)) results.extend(profile.get_all_results()) - metadata: MetatdataResults = MetatdataResults.model_validate({ - 'schema_results': validator.validation_errors, - 'schematron_results': results + metadata: MetatdataResultSet = MetatdataResultSet.model_validate({ + 'schema_results': MetadataResults.model_validate({ 'status': _validity_from_messages(validator.validation_errors), 'messages': validator.validation_errors }), + 'schematron_results': MetadataResults.model_validate({ 'status': _validity_from_messages(results), 'messages': results }) }) return ValidationReport.model_validate({ 'structure': struct_results, @@ -115,6 +115,9 @@ def validate(cls, version: SpecificationVersion, 
to_validate: Path) -> Validatio 'metadata': metadata }) +def _validity_from_messages(messages: list[Result]) -> MetadataStatus: + return MetadataStatus.VALID if len([ res for res in messages if res.severity == Severity.ERROR]) == 0 else MetadataStatus.INVALID + def _report_from_bad_path(package_path: Path) -> ValidationReport: struct_results = structure.get_bad_path_results(package_path) return ValidationReport.model_validate({ 'structure': struct_results }) diff --git a/tests/ips_test.py b/tests/ips_test.py index b3849da..e7d7592 100644 --- a/tests/ips_test.py +++ b/tests/ips_test.py @@ -86,7 +86,7 @@ def test_from_path_dir_no_mets(self): def test_from_path_dir(self): ip: InformationPackage = InformationPackages.from_path(Path(files(UNPACKED).joinpath('733dc055-34be-4260-85c7-5549a7083031'))) - self.assertEqual(ip.name, '733dc055-34be-4260-85c7-5549a7083031') + self.assertEqual(ip.details.name, '733dc055-34be-4260-85c7-5549a7083031') class SchemaTest(unittest.TestCase): def test_schema(self): From 3d3ce76b78ef7f9a480ddfb75c2be758c82d4290 Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Thu, 29 Aug 2024 10:24:25 +0100 Subject: [PATCH 6/7] FIX: Support commons-ip use of NOTVALID - added `convert_status` validator to `MetadataResults` class to convert commons-ip `NOTVALID` status to `INVALID`; - moved checksum algorithm ID validation to the `PackageDetails` class; - added test and test data for status conversion; and - fixed type hinting for validation methods to `Any`. 
--- eark_validator/model/package_details.py | 38 +++++----- eark_validator/model/validation_report.py | 13 +++- tests/resources/json/commons-ip-invalid.json | 79 ++++++++++++++++++++ tests/rules_test.py | 8 ++ 4 files changed, 119 insertions(+), 19 deletions(-) create mode 100644 tests/resources/json/commons-ip-invalid.json diff --git a/eark_validator/model/package_details.py b/eark_validator/model/package_details.py index 23ca48a..84aaeb8 100644 --- a/eark_validator/model/package_details.py +++ b/eark_validator/model/package_details.py @@ -43,6 +43,25 @@ class PackageDetails(BaseModel): contentinformationtype: str = '' checksums: List[Checksum] = [] + # Validator to add a hyphen to the SHA checksum algorithm IDs generated by commons-ip + @model_validator(mode='before') + @classmethod + def convert_checksum_ids(cls, data: Any) -> Any: + incoming_checksums = data.get('checksums', []) + if isinstance(incoming_checksums, list): + # If the details are a dict type then it's a commons-ip set + checksums : list[Checksum] = [] + # Loop through the checksums + for checksum in incoming_checksums: + alg_name = checksum.get('algorithm') + if alg_name and alg_name.startswith('SHA') and '-' not in alg_name: + # If it's a SHA checksum alg ID without a hyphen, add one + alg_name = f'{alg_name[:3]}-{alg_name[3:]}' + checksums.append(Checksum(algorithm=alg_name, value=checksum.get('value'))) + data['checksums'] = checksums + # Return the reps for further validation. 
+ return data + class Representation(BaseModel): mets: Optional[MetsFile] = None name: Optional[str] = '' @@ -52,9 +71,10 @@ class InformationPackage(BaseModel): details: Optional[PackageDetails] = None representations: List[Representation] = [] + # Validator to convert the commons-ip representations dict to a list of representations @model_validator(mode='before') @classmethod - def convert_representations_dict(cls, data: Any) -> list[Representation]: + def convert_representations_dict(cls, data: Any) -> Any: representations = data.get('representations') if isinstance(representations, dict): # If this is a dict type then it's a commons-ip type, coerce to list @@ -64,19 +84,3 @@ def convert_representations_dict(cls, data: Any) -> list[Representation]: data['representations'] = reps # Return the reps for further validation. return data - - @model_validator(mode='before') - @classmethod - def convert_checksum_ids(cls, data: Any) -> list[Representation]: - details = data.get('details', {}) - if isinstance(details, dict): - incoming_checksums = details.get('checksums', []) - checksums : list[Checksum] = [] - for checksum in incoming_checksums: - alg_name = checksum.get('algorithm') - if alg_name and alg_name.startswith('SHA') and '-' not in alg_name: - alg_name = f'{alg_name[:3]}-{alg_name[3:]}' - checksums.append(Checksum(algorithm=alg_name, value=checksum.get('value'))) - data['details']['checksums'] = checksums - # Return the reps for further validation. 
- return data diff --git a/eark_validator/model/validation_report.py b/eark_validator/model/validation_report.py index 05aa2a2..5ff102d 100644 --- a/eark_validator/model/validation_report.py +++ b/eark_validator/model/validation_report.py @@ -30,10 +30,10 @@ """ from enum import Enum, unique -from typing import List, Optional +from typing import Any, List, Optional import uuid -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from .package_details import InformationPackage from .specifications import Level @@ -123,6 +123,15 @@ class MetadataResults(BaseModel): status: MetadataStatus = MetadataStatus.UNKNOWN messages: List[Result] = [] + # Validator to convert commons-ip status from NOTVALID to INVALID + @model_validator(mode='before') + @classmethod + def convert_status(cls, data: Any) -> Any: + status = data.get('status') + if status and status == 'NOTVALID': + data['status'] = 'INVALID' + return data + class MetatdataResultSet(BaseModel): model_config = ConfigDict(populate_by_name=True) schema_results: MetadataResults = Field(validation_alias='schemaResults') diff --git a/tests/resources/json/commons-ip-invalid.json b/tests/resources/json/commons-ip-invalid.json new file mode 100644 index 0000000..f60419d --- /dev/null +++ b/tests/resources/json/commons-ip-invalid.json @@ -0,0 +1,79 @@ +{ + "package" : { + "details" : { + "name" : "minimal_IP_with_schemas.zip", + "checksums" : [ { + "algorithm" : "SHA1", + "value" : "54BBE654FE332B51569BAF21338BC811CAD2AF66" + } ] + }, + "profile" : { + "type" : "CSIP", + "name" : "minimal_IP_with_schemas.zip", + "version" : "CSIP-" + }, + "representations" : { + "name" : "minimal_IP_with_schemas.zip" + } + }, + "uid" : "e11da9f2-e59e-4f23-b7f8-5ac86632231f", + "structure" : { + "status" : "WellFormed", + "messages" : [ { + "ruleId" : "CSIPSTR7", + "location" : "", + "message" : "If descriptive metadata are available should include inside metadata/descriptive ", 
+ "severity" : "Warn" + }, { + "ruleId" : "CSIPSTR8", + "location" : "", + "message" : "If any other metadata are available, they MAY be included in separate sub-folders, for example an additional folder named other. ", + "severity" : "Info" + }, { + "ruleId" : "CSIPSTR12", + "location" : "", + "message" : "The recommended best practice is to always have a METS.xml in the representation folder. ", + "severity" : "Warn" + }, { + "ruleId" : "CSIPSTR13", + "location" : "", + "message" : "The representation folder SHOULD include a sub-folder named metadata which MAY include all metadata about the specific representation. ", + "severity" : "Warn" + }, { + "ruleId" : "CSIPSTR16", + "location" : "", + "message" : "There is no documentation folder in the representation folder minimal_IP_with_schemas/representations/rep1. ", + "severity" : "Warn" + } ] + }, + "metadata" : { + "schemaResults" : { + "status" : "VALID", + "messages" : [ ] + }, + "schematronResults" : { + "status" : "NOTVALID", + "messages" : [ { + "ruleId" : "CSIP31", + "location" : "mets/amdSec", + "message" : "You have administrative files in the metadata/folder, you must have mets/amdSec in Root METS.xml ", + "severity" : "Warn" + }, { + "ruleId" : "CSIP45", + "location" : "mets/amdSec/rightsMD", + "message" : "Individual representations should state their specific rights in their representation METS file (Root METS.xml) ", + "severity" : "Info" + }, { + "ruleId" : "CSIP66", + "location" : "mets/fileSec/fileGrp/file", + "message" : "You have files in SIP that are not referenced in Root METS.xml ", + "severity" : "Error" + }, { + "ruleId" : "CSIP80", + "location" : "mets/structMap", + "message" : "Must have one structMap with the mets/structMap[@LABEL='CSIP'] in Root METS.xml doens't appear mets/structMap[@LABEL='CSIP']. 
", + "severity" : "Error" + } ] + } + } +} diff --git a/tests/rules_test.py b/tests/rules_test.py index 3d3f852..0de3b04 100644 --- a/tests/rules_test.py +++ b/tests/rules_test.py @@ -41,6 +41,7 @@ TEST_RES_XML = TEST_RES + '.xml' TEST_RES_JSON = TEST_RES + '.json' COMMONS_IP_JSON = str(files(JSON).joinpath('commons-ip-report.json')) +COMMONS_IP_INVALID_JSON = str(files(JSON).joinpath('commons-ip-invalid.json')) PERSON_PATH = str(files(SCHEMATRON).joinpath('person.xml')) NOT_FOUND_PATH = str(files(SCHEMATRON).joinpath('not-found.xml')) EMPTY_FILE_PATH = str(files(TEST_RES).joinpath('empty.file')) @@ -235,6 +236,13 @@ def test_deserialise_commons_ip_report(self): result: ValidationReport = ValidationReport.model_validate_json(contents) self.assertIsNotNone(result) + def test_deserialise_commons_ip_invalid(self): + file_name = COMMONS_IP_INVALID_JSON + with open(file_name, 'r', encoding='utf-8') as _f: + contents = _f.read() + result: ValidationReport = ValidationReport.model_validate_json(contents) + self.assertIsNotNone(result) + class SeverityTest(str, Enum): NOT_SEV = 'NOT_SEV' From 1c65fe9d452a42c9f675479292932d540ebbe64f Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Thu, 29 Aug 2024 13:34:21 +0100 Subject: [PATCH 7/7] REV: Tidier use of ROOT in structure.py As suggested in this [review comment](https://github.com/E-ARK-Software/eark-validator/pull/60#discussion_r1735205478), thanks. 
--- eark_validator/structure.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/eark_validator/structure.py b/eark_validator/structure.py index 04e4f6b..0d78d59 100644 --- a/eark_validator/structure.py +++ b/eark_validator/structure.py @@ -39,7 +39,7 @@ METS_NAME = 'METS.xml' STR_REQ_PREFIX = 'CSIPSTR' -ROOT = 'root ' +ROOT = 'root' DIR_NAMES = { 'DATA': 'data', 'DESC': 'descriptive', @@ -154,7 +154,7 @@ def get_representations(self) -> List[Representation]: def get_root_results(self) -> List[Result]: results: List[Result] = [] - location: str = ROOT + self.name + location: str = _root_loc(self.name) if not self.parser.is_archive: results.append(test_result_from_id(3, location)) if not self.parser.has_mets(): @@ -195,13 +195,13 @@ def _get_schema_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_schemas(): return None - return test_result_from_id(15, ROOT + self.name) + return test_result_from_id(15, _root_loc(self.name)) def _get_dox_results(self) -> Optional[Result]: for tests in self.representations.values(): if tests.has_documentation(): return None - return test_result_from_id(16, ROOT + self.name) + return test_result_from_id(16, _root_loc(self.name)) @classmethod def get_status(cls, results: List[Result]) -> StructureStatus: @@ -240,7 +240,10 @@ def get_bad_path_results(path) -> StructResults: }) def _get_str1_result_list(name: str) -> List[Result]: - return [ test_result_from_id(1, ROOT + str(name)) ] + return [ test_result_from_id(1, _root_loc(name)) ] + +def _root_loc(name: str) -> str: + return f'{ROOT} {name}' def validate(to_validate) -> Tuple[bool, StructResults]: try: