Merge pull request #7 from gregstarr/develop
south hemisphere in progress, improved documentation in progress
gregstarr authored Apr 1, 2022
2 parents 107d1e4 + 598e13e commit e5d7053
Showing 17 changed files with 455 additions and 253 deletions.
13 changes: 10 additions & 3 deletions .github/workflows/python-package-conda.yml
@@ -2,13 +2,16 @@ name: Python Package using Conda

 on: [push]

+env:
+  CODECOV_TOKEN: 5454ef86-3f2b-45a7-8df0-636d3044ae13
+
 jobs:
   build-linux:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9']

     steps:
     - uses: actions/checkout@v2
@@ -38,8 +41,9 @@ jobs:
       run: |
         conda install pytest
         pip install pytest-html
+        pip install pytest-cov
         pip install -e .
-        pytest test --html=${{ matrix.python-version }}-results.html --self-contained-html
+        pytest test --html=${{ matrix.python-version }}-results.html --self-contained-html --cov=./ --cov-report=xml
     - name: Move artifacts
       shell: bash -l {0}
       run: mv test.log ${{ matrix.python-version }}-test.log
@@ -50,4 +54,7 @@ jobs:
         name: ${{ matrix.python-version }}-artifacts
         path: |
           ${{ matrix.python-version }}-results.html
-          ${{ matrix.python-version }}-test.log
+          ${{ matrix.python-version }}-test.log
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v2
+      if: always()
61 changes: 59 additions & 2 deletions README.md
@@ -1,4 +1,61 @@
[![Python Package using Conda](https://github.com/gregstarr/trough/actions/workflows/python-package-conda.yml/badge.svg)](https://github.com/gregstarr/trough/actions/workflows/python-package-conda.yml)
[![codecov](https://codecov.io/gh/gregstarr/trough/branch/master/graph/badge.svg?token=QNCESQ41EW)](https://codecov.io/gh/gregstarr/trough)

# trough
Mid-latitude ionospheric trough research
![GitHub](https://img.shields.io/github/license/gregstarr/trough)
![GitHub last commit](https://img.shields.io/github/last-commit/gregstarr/trough?color=blue&style=flat)
![Lines of code](https://img.shields.io/tokei/lines/github/gregstarr/trough?color=orange)
![GitHub Repo stars](https://img.shields.io/github/stars/gregstarr/trough?style=social)

### Example

![Example](example.png)

### Features
- Download Madrigal TEC, OMNI, and DMSP SSUSI data
- Process the datasets into more convenient `xarray` data structures and save them as NetCDF
- Automatically label the main ionospheric trough

# Usage

1. Clone the repo
2. Create a conda environment from `environment.yml` (if you have trouble with apexpy, install it first)
3. Install trough with `pip install -e .`
4. Copy `config.json.example` to `config.json` and change any options you want
5. Run with `python -m trough config.json`
6. Wait for it to finish (this can take several days if you are processing 5+ years of data)
7. Add `import trough` to your code and access the data using `trough.get_data`, as sketched below
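
A minimal sketch of step 7, adapted from `test/subtest.py` in this repo:

```python
from datetime import datetime
import trough

# Request data for a roughly one-day interval in the northern hemisphere.
start_date = datetime(2020, 9, 8, 9)
end_date = datetime(2020, 9, 9, 12)
data = trough.get_data(start_date, end_date, 'north')

# The result bundles TEC maps, the Kp index, and trough labels.
print(data['tec'].shape)
print(data['kp'].shape)
print(data['labels'].shape)
```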

### Config
#### Main Options
| Config Option | Definition |
| --- |-------------------------------------------------------------------------------------------------------------------------|
| base_dir | base directory for trough downloads and processing; the download and processing directories are created under it |
| madrigal_user_name | name supplied to MadrigalWeb |
| madrigal_user_email | email supplied to MadrigalWeb |
| madrigal_user_affil | affiliation supplied to MadrigalWeb |
| nasa_spdf_download_method | "http" or "ftp" (default) |
| lat_res | latitude resolution of processed TEC maps (degrees Apex magnetic latitude) |
| lon_res | longitude resolution of processed TEC maps (degrees Apex magnetic longitude) |
| time_res_unit | time resolution unit (passed to `np.timedelta64`) |
| time_res_n | number of time-resolution units per step (passed to `np.timedelta64`; see the sketch below the table) |
| script_name | which script to run, available scripts are in `trough/scripts.py` |
| start_date | start date of interval (YYYYMMDD, YYYYMMDD_hh, YYYYMMDD_hhmm, or YYYYMMDD_hhmmss) |
| end_date | end date of interval, see "start_date" for format |
| keep_download | whether to keep the downloaded files after processing (not recommended) |
| trough_id_params | trough labeling algorithm parameters, see below |
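
The two time-resolution options combine into a NumPy timedelta. A minimal sketch, assuming the default values from `config.json.example` (`time_res_n = 1`, `time_res_unit = "h"`):

```python
import numpy as np

# time_res_n = 1, time_res_unit = "h"  ->  one-hour resolution for processed TEC maps
time_res = np.timedelta64(1, 'h')
print(time_res)  # prints "1 hours"
```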

#### Trough Labeling Options
| Config Option | Definition |
| --- |-------------------------------------------------------------------------------------|
| bg_est_shape | background estimation filter size in pixels [time, latitude, longitude] |
| model_weight_max | maximum value of L2 regularization before multiplication by coefficient `l2_weight` |
| rbf_bw | RBF bandwidth: the number of pixels at which the weight falls to half |
| tv_hw | total variation horizontal weight |
| tv_vw | total variation vertical weight |
| l2_weight | L2 regularization coefficient |
| tv_weight | TV regularization coefficient |
| perimeter_th | minimum perimeter for a connected component in a label image |
| area_th | minimum area for a connected component in a label image |
| threshold | score threshold below which a pixel is not labeled as MIT |
| closing_rad | radius of the disk structuring element passed to `skimage.morphology.binary_closing` (see the sketch below) |
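
A sketch of how `closing_rad` plugs into scikit-image (not the package's exact code; `labels` here is a hypothetical boolean label image):

```python
import numpy as np
from skimage.morphology import binary_closing, disk

closing_rad = 2  # example value; config.json.example uses 0, which disables closing
labels = np.random.rand(60, 180) > 0.95  # hypothetical boolean label image

if closing_rad > 0:
    # Fill small gaps in the labeled trough with a disk-shaped footprint.
    labels = binary_closing(labels, disk(closing_rad))
```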
32 changes: 32 additions & 0 deletions config.json.example
@@ -0,0 +1,32 @@
{
  "base_dir": "path/to/trough_directory",
  "trough_id_params": {
    "bg_est_shape": [
      1,
      19,
      17
    ],
    "model_weight_max": 15,
    "rbf_bw": 1,
    "tv_hw": 2,
    "tv_vw": 1,
    "l2_weight": 0.09,
    "tv_weight": 0.15,
    "perimeter_th": 30,
    "area_th": 30,
    "threshold": 1,
    "closing_rad": 0
  },
  "madrigal_user_name": "your_name",
  "madrigal_user_email": "[email protected]",
  "madrigal_user_affil": "your_affiliation",
  "nasa_spdf_download_method": "ftp",
  "lat_res": 1,
  "lon_res": 2,
  "time_res_unit": "h",
  "time_res_n": 1,
  "script_name": "full_run",
  "start_date": "20100101",
  "end_date": "20220101",
  "keep_download": false
}
24 changes: 12 additions & 12 deletions environment.yml
@@ -2,17 +2,17 @@ channels:
   - anaconda
   - conda-forge
 dependencies:
-  - numpy
-  - scipy
-  - h5py
-  - scikit-image
-  - scikit-learn
-  - appdirs
-  - bs4
-  - pandas
-  - xarray
-  - bottleneck
-  - cvxpy
+  - numpy==1.21.2
+  - scipy==1.7.3
+  - h5py==3.6.0
+  - scikit-image==0.19.1
+  - scikit-learn==1.0.2
+  - appdirs==1.4.4
+  - bs4==4.10.0
+  - pandas==1.3.5
+  - xarray==0.20.1
+  - bottleneck==1.3.2
+  - cvxpy==1.1.18
   - pip
   - pip:
-    - madrigalWeb
+    - madrigalWeb==3.2
Binary file added example.png
2 changes: 1 addition & 1 deletion test/subtest.py
@@ -3,7 +3,7 @@

 start_date = datetime(2020, 9, 8, 9)
 end_date = datetime(2020, 9, 9, 12)
-data = trough.get_data(start_date, end_date)
+data = trough.get_data(start_date, end_date, 'north')
 print(data['tec'].shape)
 print(data['kp'].shape)
 print(data['labels'].shape)
49 changes: 27 additions & 22 deletions test/test_arb.py
@@ -14,13 +14,13 @@


 def test_file_list():
-    start_date = datetime(2001, 1, 1, 12, 0, 0)
-    end_date = datetime(2001, 1, 2, 12, 0, 0)
+    start_date = datetime(2001, 1, 4, 12, 0, 0)
+    end_date = datetime(2001, 1, 5, 12, 0, 0)
     with TemporaryDirectory() as tempdir:
         cache_fn = Path(tempdir) / "file_list.json"
         cache = {}
         for sat in ['f16', 'f17', 'f18', 'f19']:
-            for doy in [1, 2]:
+            for doy in [3, 4, 5]:
                 cache_key = f"{sat}_{2001}_{doy}"
                 cache[cache_key] = [f'{cache_key}_file_1', f'{cache_key}_file_2']
         with open(cache_fn, 'w') as f:
@@ -75,52 +75,57 @@ def test_download_arb(test_dates, download_dir):
 )
 def test_process_arb(download_dir, processed_dir, test_dates, dt, mlt_vals):
     start, end = test_dates
-    correct_times = np.arange(np.datetime64(start, 's'), np.datetime64(end, 's'), dt)
+    correct_times = np.arange(np.datetime64(start, 's'), np.datetime64(end, 's') + dt, dt)
     processed_file = Path(processed_dir) / 'arb_test.nc'
-    process_interval(start, end, processed_file, download_dir, mlt_vals, dt)
-    assert processed_file.exists()
-    data = xr.open_dataarray(processed_file)
-    assert data.shape == (correct_times.shape[0], mlt_vals.shape[0])
-    assert (data.mlt == mlt_vals).all().item()
-    assert (data.time == correct_times).all().item()
+    for hemisphere in ['north', 'south']:
+        process_interval(start, end, hemisphere, processed_file, download_dir, mlt_vals, dt)
+        assert processed_file.exists()
+        data = xr.open_dataarray(processed_file)
+        data.load()
+        assert data.shape == (correct_times.shape[0], mlt_vals.shape[0])
+        assert (data.mlt == mlt_vals).all().item()
+        assert (data.time == correct_times).all().item()
+        data.close()
+        processed_file.unlink()


 def test_process_arb_out_of_range(download_dir, processed_dir, test_dates):
     dt = np.timedelta64(1, 'h')
     start, end = [date - timedelta(days=100) for date in test_dates]
     processed_file = Path(processed_dir) / 'arb_test.nc'
     with pytest.raises(InvalidProcessDates):
-        process_interval(start, end, processed_file, download_dir, config.get_mlt_vals(), dt)
+        process_interval(start, end, 'north', processed_file, download_dir, config.get_mlt_vals(), dt)


 def test_get_arb_data(download_dir, processed_dir, test_dates):
     start, end = test_dates
     dt = np.timedelta64(1, 'h')
     mlt = config.get_mlt_vals()
-    correct_times = np.arange(np.datetime64(start), np.datetime64(end), dt)
-    processed_file = get_arb_paths(start, end, processed_dir)[0]
-    process_interval(start, end, processed_file, download_dir, mlt, dt)
-    data = get_arb_data(start, end, processed_dir)
+    correct_times = np.arange(np.datetime64(start), np.datetime64(end) + dt, dt)
+    processed_file = get_arb_paths(start, end, 'north', processed_dir)[0]
+    process_interval(start, end, 'north', processed_file, download_dir, mlt, dt)
+    data = get_arb_data(start, end, 'north', processed_dir)
     assert data.shape == (correct_times.shape[0], mlt.shape[0])
     assert (data.mlt == mlt).all().item()
     assert (data.time == correct_times).all().item()


 def test_scripts(test_dates):
+    start, end = test_dates
     with TemporaryDirectory() as base_dir:
         with config.temp_config(base_dir=base_dir) as cfg:
-            scripts.download_arb(*test_dates)
+            scripts.download_arb(start, end)
             arb_files = list(Path(cfg.download_arb_dir).glob('*'))
             assert len(arb_files) > 0
-            data, times = _get_downloaded_arb_data(*test_dates, cfg.download_arb_dir)
-            assert min(times) < test_dates[0]
-            assert max(times) > test_dates[-1]
-            scripts.process_arb(*test_dates)
-            data = get_arb_data(*test_dates, cfg.processed_arb_dir)
+            data, times = _get_downloaded_arb_data(start, end, cfg.download_arb_dir)
+            assert min(times) < start
+            assert max(times) > end
+            scripts.process_arb(start, end)
+            data = get_arb_data(start, end, 'north', cfg.processed_arb_dir)
+            data.load()
             dt = np.timedelta64(1, 'h')
             mlt = config.get_mlt_vals()
-            correct_times = np.arange(np.datetime64(test_dates[0]), np.datetime64(test_dates[-1]), dt)
+            correct_times = np.arange(np.datetime64(test_dates[0]), np.datetime64(test_dates[-1]) + dt, dt)
             assert data.shape == (correct_times.shape[0], mlt.shape[0])
             assert (data.mlt == mlt).all().item()
             assert (data.time == correct_times).all().item()
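
The recurring `+ dt` in the expected-times changes above makes the time axis inclusive of the end timestamp. A quick NumPy-only sketch of the difference:

```python
import numpy as np

start = np.datetime64('2020-09-08T00', 's')
end = np.datetime64('2020-09-09T00', 's')
dt = np.timedelta64(1, 'h')

exclusive = np.arange(start, end, dt)       # stops one step before `end`
inclusive = np.arange(start, end + dt, dt)  # includes `end` itself
print(exclusive.shape[0], inclusive.shape[0])  # 24 25
```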
