diff --git a/.gitignore b/.gitignore
index 0d5b6cdd..4883f9c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,6 @@ build/
 dist/
 *-output.ipynb
 .vscode/
+.idea/
 *.code-workspace
 **/__pycache__
diff --git a/erddapy/__init__.py b/erddapy/__init__.py
index a970f37d..61656549 100644
--- a/erddapy/__init__.py
+++ b/erddapy/__init__.py
@@ -1,9 +1,17 @@
 """Easier access to scientific data."""
 
+from erddapy.array_like import ERDDAPConnection, ERDDAPServer, GridDataset, TableDataset
 from erddapy.erddapy import ERDDAP
 from erddapy.servers.servers import servers
 
-__all__ = ["ERDDAP", "servers"]
+__all__ = [
+    "ERDDAP",
+    "servers",
+    "ERDDAPConnection",
+    "ERDDAPServer",
+    "TableDataset",
+    "GridDataset",
+]
 
 try:
     from ._version import __version__
diff --git a/erddapy/array_like/__init__.py b/erddapy/array_like/__init__.py
new file mode 100644
index 00000000..f05c836e
--- /dev/null
+++ b/erddapy/array_like/__init__.py
@@ -0,0 +1,19 @@
+"""
+This module contains opinionated, higher-level objects for searching servers and accessing datasets.
+
+It follows the idea of object-relational mapping: an object-oriented layer that sits
+between a database (in this case, ERDDAP) and the programming language.
+"""
+
+
+from .connection import ERDDAPConnection
+from .datasets import ERDDAPDataset, GridDataset, TableDataset
+from .server import ERDDAPServer
+
+__all__ = [
+    "ERDDAPDataset",
+    "ERDDAPConnection",
+    "ERDDAPServer",
+    "TableDataset",
+    "GridDataset",
+]
diff --git a/erddapy/array_like/connection.py b/erddapy/array_like/connection.py
new file mode 100644
index 00000000..eb73227d
--- /dev/null
+++ b/erddapy/array_like/connection.py
@@ -0,0 +1,60 @@
+"""Class ERDDAPConnection to represent connection to a particular URL."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union
+
+StrLike = Union[str, bytes]
+FilePath = Union[str, Path]
+
+
+class ERDDAPConnection:
+    """
+    Manages connection that will be used in ERDDAPServer instances.
+
+    While most ERDDAP servers allow connections via a bare url, some servers may require authentication
+    to access data.
+    """
+
+    def __init__(self, server: str):
+        """Initialize instance of ERDDAPConnection."""
+        self._server = self.to_string(server)
+
+    @classmethod
+    def to_string(cls, value):
+        """Convert an instance of ERDDAPConnection to a string."""
+        if isinstance(value, str):
+            return value
+        elif isinstance(value, cls):
+            return value.server
+        else:
+            raise TypeError(
+                f"Server must be either a string or an instance of ERDDAPConnection. '{value}' was "
+                f"passed.",
+            )
+
+    def get(self, url_part: str) -> StrLike:
+        """
+        Request data from the server.
+
+        Uses requests by default similar to most of the current erddapy data fetching functionality.
+
+        Can be overridden to use httpx, and potentially aiohttp or other async functionality, which could
+        hopefully make anything else async compatible.
+        """
+        pass
+
+    def open(self, url_part: str) -> FilePath:
+        """Yield file-like object for access for file types that don't enjoy getting passed a string."""
+        pass
+
+    @property
+    def server(self) -> str:
+        """Access the private ._server attribute."""
+        return self._server
+
+    @server.setter
+    def server(self, value: str):
+        """Set private ._server attribute."""
+        self._server = self.to_string(value)
diff --git a/erddapy/array_like/datasets.py b/erddapy/array_like/datasets.py
new file mode 100644
index 00000000..6089da3c
--- /dev/null
+++ b/erddapy/array_like/datasets.py
@@ -0,0 +1,107 @@
+"""Classes to represent ERDDAP datasets."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union
+
+from erddapy.array_like.connection import ERDDAPConnection
+
+StrLike = Union[str, bytes]
+FilePath = Union[str, Path]
+
+
+class ERDDAPDataset:
+    """Base class for more focused table or grid datasets."""
+
+    def __init__(
+        self,
+        dataset_id: str,
+        connection: str | ERDDAPConnection,
+        variables,
+        constraints,
+    ):
+        """Initialize instance of ERDDAPDataset."""
+        self.dataset_id = dataset_id
+        self._connection = ERDDAPConnection(ERDDAPConnection.to_string(connection))
+        self._variables = variables
+        self._constraints = constraints
+        self._meta = None
+
+    @property
+    def connection(self) -> ERDDAPConnection:
+        """Access private ._connection variable."""
+        return self._connection
+
+    @connection.setter
+    def connection(self, value: str | ERDDAPConnection):
+        """Set private ._connection variable."""
+        self._connection = ERDDAPConnection(ERDDAPConnection.to_string(value))
+
+    def get(self, file_type: str) -> StrLike:
+        """Request data using underlying connection."""
+        return self.connection.get(file_type)
+
+    def open(self, file_type: str) -> FilePath:
+        """Download and open dataset using underlying connection."""
+        return self.connection.open(file_type)
+
+    def get_meta(self):
+        """Request dataset metadata from the server and return it."""
+        self._meta = None
+        return self._meta
+
+    @property
+    def meta(self):
+        """Access private ._meta attribute. Request metadata if ._meta is empty."""
+        return self.get_meta() if (self._meta is None) else self._meta
+
+    @property
+    def variables(self):
+        """Access private ._variables attribute."""
+        return self._variables
+
+    @property
+    def constraints(self):
+        """Access private ._constraints attribute."""
+        return self._constraints
+
+    def url_segment(self, file_type: str) -> str:
+        """Return URL segment without the base URL (the portion after 'https://server.com/erddap/')."""
+        pass
+
+    def url(self, file_type: str) -> str:
+        """
+        Return a URL constructed using the underlying ERDDAPConnection.
+
+        The URL will contain information regarding the base class server info, the dataset ID,
+        access method (tabledap/griddap), file type, variables, and constraints.
+
+        This allows ERDDAPDataset subclasses to be used as more opinionated URL constructors while still
+        not tying users to a specific IO method.
+
+        Not guaranteed to capture all the specifics of formatting a request, such as if a server requires
+        specific auth or headers.
+        """
+        pass
+
+    def to_dataset(self):
+        """Open the dataset as xarray dataset by downloading a subset NetCDF."""
+        pass
+
+    def opendap_dataset(self):
+        """Open the full dataset in xarray via OpenDAP."""
+        pass
+
+
+class TableDataset(ERDDAPDataset):
+    """Subclass of ERDDAPDataset specific to TableDAP datasets."""
+
+    def to_dataframe(self):
+        """Open the dataset as a Pandas DataFrame."""
+
+
+class GridDataset(ERDDAPDataset):
+    """Subclass of ERDDAPDataset specific to GridDAP datasets."""
+
+    pass
diff --git a/erddapy/array_like/server.py b/erddapy/array_like/server.py
new file mode 100644
index 00000000..bb18807d
--- /dev/null
+++ b/erddapy/array_like/server.py
@@ -0,0 +1,48 @@
+"""Class ERDDAPServer to represent an ERDDAP server connection."""
+
+from __future__ import annotations
+
+from erddapy.array_like.connection import ERDDAPConnection
+from erddapy.array_like.datasets import ERDDAPDataset
+
+
+class ERDDAPServer:
+    """Instance of an ERDDAP server, with support to ERDDAP's native functionalities."""
+
+    def __init__(self, url: str, connection: ERDDAPConnection | None = None):
+        """
+        Initialize instance of ERDDAPServer.
+
+        The connection defaults to a plain ERDDAPConnection pointing at `url`.
+        """
+        # TODO: resolve short names (anything without "http") from the dict of ERDDAP servers.
+        self.url = url
+        self._connection = connection or ERDDAPConnection(url)
+
+    @property
+    def connection(self) -> ERDDAPConnection:
+        """Access private ._connection attribute."""
+        return self._connection
+
+    @connection.setter
+    def connection(self, value: str | ERDDAPConnection):
+        """Set private ._connection attribute; a falsy value resets it to the default connection."""
+        if isinstance(value, str) and value:
+            value = ERDDAPConnection(value)
+        self._connection = value or ERDDAPConnection(self.url)
+
+    def full_text_search(self, query: str) -> dict[str, ERDDAPDataset]:
+        """Search the server with native ERDDAP full text search capabilities."""
+        pass
+
+    def search(self, query: str) -> dict[str, ERDDAPDataset]:
+        """
+        Search the server with native ERDDAP full text search capabilities.
+
+        Also see ERDDAPServer.full_text_search.
+        """
+        return self.full_text_search(query)
+
+    def advanced_search(self, **kwargs) -> dict[str, ERDDAPDataset]:
+        """Search server with ERDDAP advanced search capabilities (may return pre-filtered datasets)."""
+        pass
diff --git a/erddapy/core/interfaces.py b/erddapy/core/interfaces.py
new file mode 100644
index 00000000..a4d3eb37
--- /dev/null
+++ b/erddapy/core/interfaces.py
@@ -0,0 +1,79 @@
+"""
+Interface between URL responses and third-party libraries.
+
+This module takes an URL or the bytes response of a request and converts it to Pandas,
+XArray, Iris, etc. objects.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+from erddapy.core.netcdf import _nc_dataset, _tempnc
+from erddapy.core.url import urlopen
+
+if TYPE_CHECKING:
+    import xarray as xr
+    from netCDF4 import Dataset as ncDataset
+
+
+def to_pandas(url: str, requests_kwargs=None, **kw) -> pd.DataFrame:
+    """
+    Convert a URL to Pandas DataFrame.
+
+    `requests_kwargs` is forwarded to `urlopen`; all other keywords go to `pandas.read_csv`.
+    """
+    data = urlopen(url, **(requests_kwargs or {}))
+    try:
+        return pd.read_csv(data, **kw)
+    except Exception:
+        print("Couldn't process response into Pandas DataFrame.")
+        raise
+
+
+def to_ncCF(url: str, **kw) -> ncDataset:
+    """
+    Convert a URL to a netCDF4 Dataset.
+
+    `auth` and all other keywords are forwarded to `_nc_dataset`.
+    """
+    auth = kw.pop("auth", None)
+    return _nc_dataset(url, auth=auth, **kw)
+
+
+def to_xarray(url: str, response="opendap", requests_kwargs=None, **kw) -> xr.Dataset:
+    """
+    Convert a URL to an xarray dataset.
+
+    `auth` and `requests_kwargs` are only used to download non-OPeNDAP responses;
+    all other keywords go to `xarray.open_dataset`.
+    """
+    # Imported here, as ERDDAP.to_xarray used to do, so importing erddapy does not require xarray.
+    import xarray as xr
+
+    auth = kw.pop("auth", None)
+    if response == "opendap":
+        return xr.open_dataset(url, **kw)
+    nc = _nc_dataset(url, auth=auth, **(requests_kwargs or {}))
+    return xr.open_dataset(xr.backends.NetCDF4DataStore(nc), **kw)
+
+
+def to_iris(url: str, requests_kwargs=None, **kw):
+    """
+    Convert a URL to an iris CubeList.
+
+    `requests_kwargs` is forwarded to `urlopen`; all other keywords go to `iris.load_raw`.
+    """
+    # Imported here, as ERDDAP.to_iris used to do, so importing erddapy does not require iris.
+    import iris
+
+    data = urlopen(url, **(requests_kwargs or {}))
+    with _tempnc(data) as tmp:
+        cubes = iris.load_raw(tmp, **kw)
+        try:
+            cubes.realise_data()
+        except ValueError:
+            _ = [cube.data for cube in cubes]
+    return cubes
diff --git a/erddapy/erddapy.py b/erddapy/erddapy.py
index 96506956..a9c6a7f4 100644
--- a/erddapy/erddapy.py
+++ b/erddapy/erddapy.py
@@ -10,7 +10,7 @@
     _griddap_check_variables,
     _griddap_get_constraints,
 )
-from erddapy.core.netcdf import _nc_dataset, _tempnc
+from erddapy.core.interfaces import to_iris, to_ncCF, to_pandas, to_xarray
 from erddapy.core.url import (
     _check_substrings,
     _distinct,
@@ -344,50 +344,50 @@ def to_pandas(self, **kw):
         """
         response = kw.pop("response", "csvp")
+        # Per-call requests_kwargs override the instance-level ones; auth defaults to self.auth.
+        requests_kwargs = {
+            "auth": self.auth,
+            **self.requests_kwargs,
+            **kw.pop("requests_kwargs", {}),
+        }
         url = self.get_download_url(response=response, **kw)
-        data = urlopen(url, auth=self.auth, **self.requests_kwargs)
-        return pd.read_csv(data, **kw)
+        return to_pandas(url, requests_kwargs=requests_kwargs, **kw)
 
     def to_ncCF(self, **kw):
         """Load the data request into a Climate and Forecast compliant netCDF4-python object."""
         if self.protocol == "griddap":
-            return ValueError("Cannot use ncCF with griddap.")
+            raise ValueError("Cannot use ncCF with griddap.")
         url = self.get_download_url(response="ncCF", **kw)
-        nc = _nc_dataset(url, auth=self.auth, **self.requests_kwargs)
-        return nc
+        return to_ncCF(url, auth=self.auth, **self.requests_kwargs)
 
     def to_xarray(self, **kw):
         """Load the data request into a xarray.Dataset.
 
         Accepts any `xr.open_dataset` keyword arguments.
         """
-        import xarray as xr
-
         if self.response == "opendap":
-            url = self.get_download_url()
-            return xr.open_dataset(url, **kw)
+            response = "opendap"
+        elif self.protocol == "griddap":
+            response = "nc"
         else:
-            response = "nc" if self.protocol == "griddap" else "ncCF"
-            url = self.get_download_url(response=response)
-            nc = _nc_dataset(url, auth=self.auth, **self.requests_kwargs)
-            return xr.open_dataset(xr.backends.NetCDF4DataStore(nc), **kw)
+            response = "ncCF"
+        url = self.get_download_url(response=response)
+        return to_xarray(
+            url,
+            response=response,
+            auth=self.auth,
+            requests_kwargs=self.requests_kwargs,
+            **kw,
+        )
 
     def to_iris(self, **kw):
         """Load the data request into an iris.CubeList.
 
         Accepts any `iris.load_raw` keyword arguments.
         """
-        import iris
-
         response = "nc" if self.protocol == "griddap" else "ncCF"
         url = self.get_download_url(response=response, **kw)
-        data = urlopen(url, auth=self.auth, **self.requests_kwargs)
-        with _tempnc(data) as tmp:
-            cubes = iris.load_raw(tmp, **kw)
-            try:
-                cubes.realise_data()
-            except ValueError:
-                _ = [cube.data for cube in cubes]
-            return cubes
+        requests_kwargs = {"auth": self.auth, **self.requests_kwargs}
+        return to_iris(url, requests_kwargs=requests_kwargs, **kw)
 
     @functools.lru_cache(maxsize=None)
     def _get_variables(self, dataset_id: OptionalStr = None) -> Dict:
diff --git a/tests/test_erddapy.py b/tests/test_erddapy.py
index 3c10cc4c..560d88a0 100644
--- a/tests/test_erddapy.py
+++ b/tests/test_erddapy.py
@@ -103,13 +103,13 @@ def test_erddap_requests_kwargs():
     slowwly_url = f"https://flash-the-slow-api.herokuapp.com/delay/{slowwly_milliseconds}/url/{base_url}"
 
     connection = ERDDAP(slowwly_url)
-    connection.dataset_id = "M01_sbe37_all"
+    connection.dataset_id = "raw_asset_inventory"
     connection.protocol = "tabledap"
 
     connection.requests_kwargs["timeout"] = timeout_seconds
 
     with pytest.raises(httpx.ReadTimeout):
-        connection.to_xarray()
+        connection.to_pandas(requests_kwargs=connection.requests_kwargs)
 
 
 @pytest.mark.web