From 8d1b96410a1af99edc000eec35f066b337b17809 Mon Sep 17 00:00:00 2001
From: Bill Arnold
Date: Wed, 29 Nov 2023 14:48:56 -0500
Subject: [PATCH 1/6] WIP conversion to R 2.0.3

---
 .../cerebras/customizing-environment.md       | 45 +++------
 docs/ai-testbed/cerebras/example-programs.md  | 43 ++++----
 .../cerebras/job-queuing-and-submission.md    | 12 +--
 .../cerebras/running-a-model-or-program.md    | 97 +++++-------------
 4 files changed, 64 insertions(+), 133 deletions(-)

diff --git a/docs/ai-testbed/cerebras/customizing-environment.md b/docs/ai-testbed/cerebras/customizing-environment.md
index b8b7a564d..c4ff54c30 100644
--- a/docs/ai-testbed/cerebras/customizing-environment.md
+++ b/docs/ai-testbed/cerebras/customizing-environment.md
@@ -7,49 +7,28 @@

```console
#Make your home directory navigable
chmod a+xr ~/
-mkdir ~/R_1.9.2
-chmod a+x ~/R_1.9.2/
-cd ~/R_1.9.2
+mkdir ~/R_2.0.3
+chmod a+x ~/R_2.0.3/
+cd ~/R_2.0.3
# Note: "deactivate" does not actually work in scripts.
deactivate
-rm -r venv_pt
-/software/cerebras/python3.8/bin/python3.8 -m venv venv_pt
-source venv_pt/bin/activate
-pip3 install /opt/cerebras/wheels/cerebras_pytorch-1.9.2+92b4fad15b-cp38-cp38-linux_x86_64.whl --find-links=/opt/cerebras/wheels
-pip install numpy==1.23.4
-pip install datasets transformers
+rm -r venv_cerebras_pt
+/software/cerebras/python3.8/bin/python3.8 -m venv venv_cerebras_pt
+source venv_cerebras_pt/bin/activate
+pip install --upgrade pip
+pip install cerebras_pytorch==2.0.2
```
+
#### Activation and deactivation

-To activate one of these virtual environments,
-
-```console
-source ~/R_1.9.2/venv_pt/bin/activate
-```
-
-or
+To activate a virtual environment,

```console
-source ~/R_1.9.2/venv_tf/bin/activate
+source ~/R_2.0.3/venv_cerebras_pt/bin/activate
```

To deactivate a virtual environment,
diff --git a/docs/ai-testbed/cerebras/example-programs.md b/docs/ai-testbed/cerebras/example-programs.md
index a8380b655..f39d8043d 100644
--- a/docs/ai-testbed/cerebras/example-programs.md
+++ b/docs/ai-testbed/cerebras/example-programs.md
@@ -4,12 +4,15 @@
Make a working directory and a local copy of the Cerebras **modelzoo** and **anl_shared** repository, if not previously done, as follows.

```bash
-mkdir ~/R_1.9.2
-cd ~/R_1.9.2
+mkdir ~/R_2.0.3
+cd ~/R_2.0.3
git clone https://github.com/Cerebras/modelzoo.git
+cd modelzoo
+git tag
+git checkout Release_2.0.3
```

## UNet

To run Unet with the

```console
TODO
cd ~/R_2.0.3/anl_shared/braggnn/tf
# This yaml has a correct path to a BraggNN dataset
cp /software/cerebras/dataset/BraggN/params_bragg_nonlocal_sampleds.yaml configs/params_bragg_nonlocal_sampleds.yaml
export MODEL_DIR=model_dir_braggnn
@@ -59,21 +62,21 @@ This BERT-large msl128 example uses a single sample dataset for both training an

First, source a Cerebras PyTorch virtual environment.
```console -source ~/R_1.9.2/venv_pt/bin/activate +source ~/R_2.0.3/venv_cerebras_pt/bin/activate ``` Then ```console -cd ~/R_1.9.2/modelzoo/modelzoo/transformers/pytorch/bert +cd ~/R_2.0.3/modelzoo/modelzoo/transformers/pytorch/bert cp /software/cerebras/dataset/bert_large/bert_large_MSL128_sampleds.yaml configs/bert_large_MSL128_sampleds.yaml export MODEL_DIR=model_dir_bert_large_pytorch if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi -python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log +python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_2.0.3/modelzoo/ --compile_dir $(whoami) |& tee mytest.log ``` The last parts of the output should resemble the following, with messages about cuda that should be ignored and are not shown. @@ -104,17 +107,17 @@ This BERT-large msl128 example uses a single sample dataset for both training an First, source a Cerebras TensorFlow virtual environment. ```console -source ~/R_1.9.2/venv_tf/bin/activate +source ~/R_2.0.3/venv_tf/bin/activate ``` Then ```console -cd ~/R_1.9.2/modelzoo/modelzoo/transformers/tf/bert +cd ~/R_2.0.3/modelzoo/modelzoo/transformers/tf/bert cp /software/cerebras/dataset/bert_large/params_bert_large_msl128_sampleds.yaml configs/params_bert_large_msl128_sampleds.yaml export MODEL_DIR=mytest if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi -python run.py CSX --job_labels name=bert_tf --max_steps 1000 --params configs/params_bert_large_msl128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log +python run.py CSX --job_labels name=bert_tf --max_steps 1000 --params configs/params_bert_large_msl128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_2.0.3/modelzoo/ --compile_dir $(whoami) |& tee mytest.log ``` The last parts of the output should resemble the following, with messages about cuda that should be ignored and are not shown. @@ -147,17 +150,17 @@ This PyTorch GPT-J 6B parameter pretraining sample uses 2 CS2s. First, source a Cerebras PyTorch virtual environment. 
```console -source ~/R_1.9.2/venv_pt/bin/activate +source ~/R_2.0.3/venv_cerebras_pt/bin/activate ``` Then ```console -cd ~/R_1.9.2/modelzoo/modelzoo/transformers/pytorch/gptj +cd ~/R_2.0.3/modelzoo/modelzoo/transformers/pytorch/gptj cp /software/cerebras/dataset/gptj/params_gptj_6B_sampleds.yaml configs/params_gptj_6B_sampleds.yaml export MODEL_DIR=model_dir_gptj if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi -python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log +python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.0.3/modelzoo/ --compile_dir $(whoami) |& tee mytest.log ``` The last parts of the output should resemble the following: @@ -187,17 +190,17 @@ source /software/cerebras/venvs/venv_tf/bin/activate # or your personal venv ```console -source ~/R_1.9.2/venv_tf/bin/activate +source ~/R_2.0.3/venv_tf/bin/activate ``` Then ```console -cd ~/R_1.9.2/modelzoo/modelzoo/transformers/tf/gptj +cd ~/R_2.0.3/modelzoo/modelzoo/transformers/tf/gptj cp /software/cerebras/dataset/gptj/params_gptj_6B_tf_sampleds.yaml configs/params_gptj_6B_sampleds.yaml export MODEL_DIR=model_dir_gptj_tf if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi -python run.py CSX --job_labels name=gptj_tf --max_steps 500 --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log +python run.py CSX --job_labels name=gptj_tf --max_steps 500 --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_2.0.3/modelzoo/ --compile_dir $(whoami) |& tee mytest.log ``` The last parts of the output should resemble the following: diff --git a/docs/ai-testbed/cerebras/job-queuing-and-submission.md b/docs/ai-testbed/cerebras/job-queuing-and-submission.md index 9de47d109..3e7343778 100644 --- a/docs/ai-testbed/cerebras/job-queuing-and-submission.md +++ b/docs/ai-testbed/cerebras/job-queuing-and-submission.md @@ -8,10 +8,10 @@ Continuous job status for a job is output to stdout/stderr; redirect the output, Jobs that have not yet completed can be listed as shown. Note: this command can take over a minute to complete. 
```console
-(venv_pt) $ csctl get jobs
+(venv_cerebras_pt) $ csctl get jobs
NAME                          AGE  DURATION  PHASE    SYSTEMS     USER      LABELS        DASHBOARD
wsjob-thjj8zticwsylhppkbmjqe  13s  1s        RUNNING  cer-cs2-01  username  name=unet_pt  https://grafana.cerebras1.lab.alcf.anl.gov/d/WebHNShVz/wsjob-dashboard?orgId=1&var-wsjob=wsjob-thjj8zticwsylhppkbmjqe&from=1691705374000&to=now
-(venv_pt) $
+(venv_cerebras_pt) $
```

To view the Grafana dashboard for a job, follow the instructions at [Grafana WsJob Dashboard for Cerebras jobs](./miscellaneous.md#grafana-wsjob-dashboard-for-cerebras-jobs)
@@ -30,16 +30,16 @@ Jobs can be labeled in the command line that launches them, if they are written
Jobs can also be labeled after they have been started as shown:

```console
-(venv_pt) $ csctl label job wsjob-ez6dyfronnsg2rz7f7fqw4 testlabel=test
+(venv_cerebras_pt) $ csctl label job wsjob-ez6dyfronnsg2rz7f7fqw4 testlabel=test
job/wsjob-ez6dyfronnsg2rz7f7fqw4 was patched
-(venv_pt) $
+(venv_cerebras_pt) $
```

Jobs with a particular label/label value can be listed as shown:

```console
-(venv_pt) $ csctl get jobs | grep "testlabel=test"
+(venv_cerebras_pt) $ csctl get jobs | grep "testlabel=test"
wsjob-ez6dyfronnsg2rz7f7fqw4  19m SUCCEEDED cer-cs2-02 username testlabel=test,user=username
-(venv_pt) $
+(venv_cerebras_pt) $
```

See `csctl -h` for more options.
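Taken together, the listing, labeling, and filtering commands above support a simple launch-and-track workflow. The sketch below strings them together; the label value is illustrative, the job ID is the one from the listing example above, and the `run.py` options are abbreviated:

```console
# Launch a job with a label (run.py options abbreviated; see example-programs.md)
python run.py CSX --job_labels name=my_test ... |& tee mytest.log

# From another session: find the job by its label...
csctl get jobs | grep "name=my_test"

# ...and cancel it if needed
csctl cancel job wsjob-thjj8zticwsylhppkbmjqe
```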
diff --git a/docs/ai-testbed/cerebras/running-a-model-or-program.md b/docs/ai-testbed/cerebras/running-a-model-or-program.md
index d243f7d80..47fc8bede 100644
--- a/docs/ai-testbed/cerebras/running-a-model-or-program.md
+++ b/docs/ai-testbed/cerebras/running-a-model-or-program.md
@@ -4,12 +4,12 @@

#### Job submission and queuing

-Cerebras jobs are initiated and tracked automatically within the Python frameworks in **modelzoo.common.pytorch.run_utils** and **modelzoo.common.tf.run_utils**. These frameworks interact with the Cerebras cluster management node.
+Cerebras jobs are initiated and tracked automatically within the Python framework in **modelzoo.common.pytorch.run_utils**. This framework interacts with the Cerebras cluster management node.

#### Login nodes

Jobs are launched from **login** nodes.
-If you expect a loss of an internet connection for any reason, for long-running jobs we suggest logging into a specific login node and using either **screen** or **tmux** to create persistent command line sessions. For details use:
+If you expect a loss of an internet connection for any reason, for long-running jobs we suggest logging into a specific login node and using either **screen** or **tmux** to create persistent command line sessions. For details use:

```bash
man screen
# or
man tmux
```

## Running jobs on the wafer

-Follow these instructions to compile and train the `fc_mnist` TensorFlow and PyTorch samples. These models are a couple of fully connected layers plus dropout and RELU.
+Follow these instructions to compile and train the `fc_mnist` PyTorch sample. This model is a couple of fully connected layers plus dropout and ReLU.
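For orientation, the network described here is, schematically, a stack like the following minimal PyTorch sketch. This is not the modelzoo source; the hidden width and dropout rate are illustrative assumptions:

```python
import torch.nn as nn

# Schematic fc_mnist-style network: fully connected layers with ReLU and
# dropout. Layer sizes here are assumptions, not the modelzoo configuration.
class FcMnist(nn.Module):
    def __init__(self, hidden: int = 256, dropout: float = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(),            # (N, 1, 28, 28) MNIST input -> (N, 784)
            nn.Linear(784, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 10),   # ten digit classes
        )

    def forward(self, x):
        return self.net(x)
```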
### Cerebras virtual environments

-First, make virtual environments for Cerebras for PyTorch and/or TensorFlow.
-See [Customizing Environments](./customizing-environment.md) for the procedures for making PyTorch and/or TensorFlow virtual environments for Cerebras.
-If the environments are made in ```~/R_1.9.2/```, then they would be activated as follows:
+First, make a virtual environment for Cerebras for PyTorch.
+See [Customizing Environments](./customizing-environment.md) for the procedures for making PyTorch virtual environments for Cerebras.
+If an environment is made in ```~/R_2.0.3/```, it would be activated as follows:
```console
-source ~/R_1.9.2/venv_pt/bin/activate
-```
-or
-```console
-source ~/R_1.9.2/vent_tf/bin/activate
+source ~/R_2.0.3/venv_cerebras_pt/bin/activate
```

### Clone the Cerebras modelzoo

-Note: For virtual environent R_1.9.2, the modelzoo is unchanged from R_1.9.1.
-
```console
-mkdir ~/R_1.9.2
-cd ~/R_1.9.2
+mkdir ~/R_2.0.3
+cd ~/R_2.0.3
git clone https://github.com/Cerebras/modelzoo.git
cd modelzoo
git tag
-git checkout Release_1.9.1
+git checkout Release_2.0.3
```

## Running a Pytorch sample

-### Activate your PyTorch virtual environment, and change to the working directory
+### Activate your PyTorch virtual environment, install modelzoo requirements, and change to the working directory

```console
-source ~/R_1.9.2/venv_pt/bin/activate
-cd ~/R_1.9.2/modelzoo/modelzoo/fc_mnist/pytorch
+source ~/R_2.0.3/venv_cerebras_pt/bin/activate
+cd ~/R_2.0.3/modelzoo
+pip install -r requirements.txt
+cd ~/R_2.0.3/modelzoo/modelzoo/fc_mnist/pytorch
```

Next, edit configs/params.yaml, making the following changes:
@@ -81,65 +77,18 @@ To run the sample:

export MODEL_DIR=model_dir
# deletion of the model_dir is only needed if sample has been previously run
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=pt_smoketest --params configs/params.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.2/modelzoo --compile_dir /$(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=pt_smoketest --params configs/params.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.0.3/modelzoo --compile_dir /$(whoami) |& tee mytest.log
```

A successful fc_mnist PyTorch training run should finish with output resembling the following:

```text
2023-05-15 16:05:54,510 INFO: | Train Device=xla:0, Step=9950, Loss=2.30234, Rate=157300.30 samples/sec, GlobalRate=26805.42 samples/sec
2023-05-15 16:05:54,571 INFO: | Train Device=xla:0, Step=10000, Loss=2.29427, Rate=125599.14 samples/sec, GlobalRate=26905.42 samples/sec
2023-05-15 16:05:54,572 INFO: Saving checkpoint at global step 10000
2023-05-15 16:05:59,734 INFO: Saving step 10000 in dataloader checkpoint
2023-05-15 16:06:00,117 INFO: Saved checkpoint at global step: 10000
2023-05-15 16:06:00,117 INFO: Training Complete. Completed 1280000 sample(s) in 53.11996841430664 seconds.
-2023-05-15 16:06:04,356 INFO: Monitoring returned -``` - - From 2ab3325d2278ea3cad8a5a4274718c9a7cf3a406 Mon Sep 17 00:00:00 2001 From: Bill Arnold Date: Wed, 29 Nov 2023 15:35:48 -0500 Subject: [PATCH 2/6] more WIP for R 2.0.3 --- docs/ai-testbed/cerebras/example-programs.md | 74 +++++--------------- 1 file changed, 16 insertions(+), 58 deletions(-) diff --git a/docs/ai-testbed/cerebras/example-programs.md b/docs/ai-testbed/cerebras/example-programs.md index f39d8043d..d1b4f09af 100644 --- a/docs/ai-testbed/cerebras/example-programs.md +++ b/docs/ai-testbed/cerebras/example-programs.md @@ -82,66 +82,24 @@ python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_s The last parts of the output should resemble the following, with messages about cuda that should be ignored and are not shown. ```console -2023-05-17 18:10:08,776 INFO: Finished sending initial weights -2023-05-17 18:15:11,548 INFO: | Train Device=xla:0, Step=100, Loss=9.46875, Rate=4597.49 samples/sec, GlobalRate=4597.49 samples/sec -2023-05-17 18:15:23,067 INFO: | Train Device=xla:0, Step=200, Loss=8.94531, Rate=7173.00 samples/sec, GlobalRate=6060.68 samples/sec -2023-05-17 18:15:41,547 INFO: | Train Device=xla:0, Step=300, Loss=8.79688, Rate=6193.85 samples/sec, GlobalRate=5876.98 samples/sec -2023-05-17 18:15:54,118 INFO: | Train Device=xla:0, Step=400, Loss=8.28906, Rate=7365.06 samples/sec, GlobalRate=6316.84 samples/sec -2023-05-17 18:16:12,430 INFO: | Train Device=xla:0, Step=500, Loss=8.14844, Rate=6301.21 samples/sec, GlobalRate=6157.22 samples/sec -2023-05-17 18:16:25,177 INFO: | Train Device=xla:0, Step=600, Loss=8.06250, Rate=7340.44 samples/sec, GlobalRate=6406.58 samples/sec -2023-05-17 18:16:43,315 INFO: | Train Device=xla:0, Step=700, Loss=8.00000, Rate=6323.57 samples/sec, GlobalRate=6285.55 samples/sec -2023-05-17 18:16:56,110 INFO: | Train Device=xla:0, Step=800, Loss=7.96484, Rate=7331.29 samples/sec, GlobalRate=6458.82 samples/sec -2023-05-17 18:17:14,564 INFO: | Train Device=xla:0, Step=900, Loss=7.89844, Rate=6261.77 samples/sec, GlobalRate=6343.22 samples/sec -2023-05-17 18:17:26,977 INFO: | Train Device=xla:0, Step=1000, Loss=7.90234, Rate=7454.38 samples/sec, GlobalRate=6493.27 samples/sec -2023-05-17 18:17:26,978 INFO: Saving checkpoint at global step 1000 -2023-05-17 18:18:38,485 INFO: Saving step 1000 in dataloader checkpoint -2023-05-17 18:18:38,931 INFO: Saved checkpoint at global step: 1000 -2023-05-17 18:18:38,932 INFO: Training Complete. Completed 1024000 sample(s) in 229.65675950050354 seconds. 
-2023-05-17 18:18:49,293 INFO: Monitoring returned +2023-11-29 20:07:49,284 INFO: Beginning appliance run +2023-11-29 20:08:14,365 INFO: | Train Device=CSX, Step=100, Loss=9.50000, Rate=4088.28 samples/sec, GlobalRate=4088.26 samples/sec +2023-11-29 20:08:39,820 INFO: | Train Device=CSX, Step=200, Loss=8.37500, Rate=4048.91 samples/sec, GlobalRate=4055.21 samples/sec +2023-11-29 20:09:05,356 INFO: | Train Device=CSX, Step=300, Loss=7.96875, Rate=4025.61 samples/sec, GlobalRate=4040.05 samples/sec +2023-11-29 20:09:30,626 INFO: | Train Device=CSX, Step=400, Loss=7.56250, Rate=4041.61 samples/sec, GlobalRate=4043.10 samples/sec +2023-11-29 20:09:56,022 INFO: | Train Device=CSX, Step=500, Loss=7.50000, Rate=4035.92 samples/sec, GlobalRate=4040.90 samples/sec +2023-11-29 20:10:21,410 INFO: | Train Device=CSX, Step=600, Loss=7.37500, Rate=4034.41 samples/sec, GlobalRate=4039.65 samples/sec +2023-11-29 20:10:46,690 INFO: | Train Device=CSX, Step=700, Loss=7.37500, Rate=4044.10 samples/sec, GlobalRate=4041.20 samples/sec +2023-11-29 20:11:12,004 INFO: | Train Device=CSX, Step=800, Loss=7.25000, Rate=4044.75 samples/sec, GlobalRate=4041.70 samples/sec +2023-11-29 20:11:37,196 INFO: | Train Device=CSX, Step=900, Loss=7.21875, Rate=4056.77 samples/sec, GlobalRate=4044.25 samples/sec +2023-11-29 20:12:02,285 INFO: | Train Device=CSX, Step=1000, Loss=7.12500, Rate=4071.60 samples/sec, GlobalRate=4047.95 samples/sec +2023-11-29 20:12:02,286 INFO: Saving checkpoint at step 1000 +2023-11-29 20:12:37,079 INFO: Saved checkpoint model_dir_bert_large_pytorch/checkpoint_1000.mdl +2023-11-29 20:13:25,683 INFO: Heartbeat thread stopped for wsjob-gfi2baioyfduozkmgsc6a7. +2023-11-29 20:13:25,691 INFO: Training completed successfully! +2023-11-29 20:13:25,691 INFO: Processed 1024000 sample(s) in 336.373620536 seconds. ``` - - ## GPT-J PyTorch GPT-J [[github]](https://github.com/kingoflolz/mesh-transformer-jax) is an auto-regressive language model created by [EleutherAI](https://www.eleuther.ai/). 
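As a quick consistency check on the new sample output above: the run processes 1,024,000 samples in 1,000 steps, i.e. a per-step batch of 1,024, and the end-to-end time includes checkpoint saving, which is why it implies a lower rate than the steady-state GlobalRate:

```python
# Numbers copied from the BERT-large log excerpt above.
samples = 1_024_000       # "Processed 1024000 sample(s)"
steps = 1_000             # final "Step=1000"
elapsed = 336.373620536   # seconds, from the last log line

print(samples / steps)    # 1024.0 samples per step (the effective batch size)
print(samples / elapsed)  # ~3044 samples/sec end to end, vs ~4048 steady state
```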
From a8a674de133814ec84f00937af4b356c1c5aab78 Mon Sep 17 00:00:00 2001 From: Bill Arnold Date: Wed, 29 Nov 2023 17:09:21 -0500 Subject: [PATCH 3/6] gpt-j for R 2.0.3 updates --- docs/ai-testbed/cerebras/example-programs.md | 56 +++----------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/docs/ai-testbed/cerebras/example-programs.md b/docs/ai-testbed/cerebras/example-programs.md index d1b4f09af..610219b0c 100644 --- a/docs/ai-testbed/cerebras/example-programs.md +++ b/docs/ai-testbed/cerebras/example-programs.md @@ -124,52 +124,12 @@ python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_samp The last parts of the output should resemble the following: ```console -2023-05-17 18:44:38,290 INFO: Finished sending initial weights -2023-05-17 18:51:03,551 INFO: | Train Device=xla:0, Step=100, Loss=8.46875, Rate=33.83 samples/sec, GlobalRate=33.83 samples/sec -2023-05-17 18:57:26,199 INFO: | Train Device=xla:0, Step=200, Loss=8.06250, Rate=33.92 samples/sec, GlobalRate=33.90 samples/sec -2023-05-17 19:03:48,354 INFO: | Train Device=xla:0, Step=300, Loss=7.71875, Rate=33.98 samples/sec, GlobalRate=33.94 samples/sec -2023-05-17 19:10:10,299 INFO: | Train Device=xla:0, Step=400, Loss=7.46875, Rate=34.01 samples/sec, GlobalRate=33.96 samples/sec -2023-05-17 19:16:32,156 INFO: | Train Device=xla:0, Step=500, Loss=7.21875, Rate=34.03 samples/sec, GlobalRate=33.98 samples/sec -2023-05-17 19:16:32,157 INFO: Saving checkpoint at global step 500 -2023-05-17 19:27:12,834 INFO: Saving step 500 in dataloader checkpoint -2023-05-17 19:27:13,435 INFO: Saved checkpoint at global step: 500 -2023-05-17 19:27:13,436 INFO: Training Complete. Completed 65000 sample(s) in 2554.1804394721985 seconds. +2023-11-29 20:59:19,223 INFO: Beginning appliance run +2023-11-29 21:03:53,875 INFO: | Train Device=CSX, Step=100, Loss=8.43750, Rate=43.70 samples/sec, GlobalRate=43.70 samples/sec +2023-11-29 21:08:28,779 INFO: | Train Device=CSX, Step=200, Loss=8.12500, Rate=43.67 samples/sec, GlobalRate=43.67 samples/sec +2023-11-29 21:08:28,781 INFO: Saving checkpoint at step 200 +2023-11-29 21:13:56,695 INFO: Saved checkpoint model_dir_gptj/checkpoint_200.mdl +2023-11-29 21:14:30,135 INFO: Heartbeat thread stopped for wsjob-kd4olqkhu6ya8qqzt88utd. +2023-11-29 21:14:30,142 INFO: Training completed successfully! +2023-11-29 21:14:30,142 INFO: Processed 24000 sample(s) in 910.883781998 seconds. ``` - From f533f297ed0cfad560174f3ff4331682d18ca567 Mon Sep 17 00:00:00 2001 From: Bill Arnold Date: Wed, 29 Nov 2023 18:06:07 -0500 Subject: [PATCH 4/6] cleanup for R 2.0.3 --- docs/ai-testbed/cerebras/job-queuing-and-submission.md | 6 +++--- docs/ai-testbed/cerebras/system-overview.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ai-testbed/cerebras/job-queuing-and-submission.md b/docs/ai-testbed/cerebras/job-queuing-and-submission.md index 3e7343778..8711204fc 100644 --- a/docs/ai-testbed/cerebras/job-queuing-and-submission.md +++ b/docs/ai-testbed/cerebras/job-queuing-and-submission.md @@ -2,7 +2,7 @@ The CS-2 cluster has its own **Kubernetes-based** system for job submission and queuing.
-Jobs are started automatically through the **Python** frameworks in modelzoo.common.pytorch.run_utils and modelzoo.common.tf.run_utils
+Jobs are started automatically through the **Python** framework in modelzoo.common.pytorch.run_utils
Continuous job status for a job is output to stdout/stderr; redirect the output, or consider using a persistent session started with **screen**, or **tmux**, or both.

Jobs that have not yet completed can be listed as shown. Note: this command can take over a minute to complete.
@@ -18,9 +18,9 @@ To view the Grafana dashboard for a job, follow the instructions at [Grafana WsJ
Jobs can be canceled as shown:

```console
-(venv_tf) $ csctl cancel job wsjob-eyjapwgnycahq9tus4w7id
+(venv_cerebras_pt) $ csctl cancel job wsjob-eyjapwgnycahq9tus4w7id
Job canceled successfully
-(venv_tf) $
+(venv_cerebras_pt) $
```

Jobs can be labeled in the command line that launches them, if they are written with Cerebras's Python framework for running appliance jobs, by adding a command line option of this form:
diff --git a/docs/ai-testbed/cerebras/system-overview.md b/docs/ai-testbed/cerebras/system-overview.md
index 65e5fce7f..1807b0923 100644
--- a/docs/ai-testbed/cerebras/system-overview.md
+++ b/docs/ai-testbed/cerebras/system-overview.md
@@ -1,9 +1,9 @@
# System Overview

-The Cerebras CS-2 is a wafer-scale deep learning accelerator comprising 850,000 processing cores, each providing 48KB of dedicated SRAM memory for an on-chip total of 40GB and interconnected to optimize bandwidth and latency. Its software platform integrates popular machine learning frameworks such as TensorFlow and PyTorch.
+The Cerebras CS-2 is a wafer-scale deep learning accelerator comprising 850,000 processing cores, each providing 48KB of dedicated SRAM memory for an on-chip total of 40GB and interconnected to optimize bandwidth and latency. Its software platform integrates the popular machine learning framework PyTorch.

-The ALCF CS-2 systems are configured as a Cerebras Wafer-Scale Cluster, designed to support large-scale models (up to and well beyond 1 billion parameters) and large-scale inputs. The cluster contains two CS-2 systems and can distribute jobs across one or both CS-2 systems in a data-parallel framework. The supporting CPU cluster consists of MemoryX, SwarmX, management, and input worker nodes. The Cerebras Wafer-Scale cluster is run as an appliance: a user submits a job to the appliance, and the appliance manages preprocessing and streaming of the data, IO, and device orchestration within the appliance. It provides programming via PyTorch and TensorFlow(estimator) with data-parallel distribution when using more than one CS-2. This installation supports both Pipelined execution for models up to 1 billion parameters and Weight Streaming execution for models up to and above 1 billion parameters.
+The ALCF CS-2 systems are configured as a Cerebras Wafer-Scale Cluster, designed to support large-scale models (up to and well beyond 1 billion parameters) and large-scale inputs. The cluster contains two CS-2 systems and can distribute jobs across one or both CS-2 systems in a data-parallel framework. The supporting CPU cluster consists of MemoryX, SwarmX, management, and input worker nodes. The Cerebras Wafer-Scale cluster is run as an appliance: a user submits a job to the appliance, and the appliance manages preprocessing and streaming of the data, IO, and device orchestration within the appliance. It provides programming via PyTorch, with data-parallel distribution when using more than one CS-2. This installation supports both Pipelined execution for models up to 1 billion parameters and Weight Streaming execution for models up to and above 1 billion parameters.

```console
source ~/R_2.0.3/venv_cerebras_pt/bin/activate
+pip install -r ~/R_2.0.3/modelzoo/requirements.txt
```

Then

@@ -105,10 +107,11 @@ The last parts of the output should resemble the following, with messages about
GPT-J [[github]](https://github.com/kingoflolz/mesh-transformer-jax) is an auto-regressive language model created by [EleutherAI](https://www.eleuther.ai/).
This PyTorch GPT-J 6B parameter pretraining sample uses 2 CS2s.

-First, source a Cerebras PyTorch virtual environment.
+First, source a Cerebras PyTorch virtual environment and make sure that the requirements are installed:

```console
source ~/R_2.0.3/venv_cerebras_pt/bin/activate
+pip install -r ~/R_2.0.3/modelzoo/requirements.txt
```

Then
diff --git a/docs/ai-testbed/cerebras/running-a-model-or-program.md b/docs/ai-testbed/cerebras/running-a-model-or-program.md
index 47fc8bede..8a3454428 100644
--- a/docs/ai-testbed/cerebras/running-a-model-or-program.md
+++ b/docs/ai-testbed/cerebras/running-a-model-or-program.md
@@ -46,8 +46,7 @@ git checkout Release_2.0.3

```console
source ~/R_2.0.3/venv_cerebras_pt/bin/activate
-cd ~/R_2.0.3/modelzoo
-pip install -r requirements.txt
+pip install -r ~/R_2.0.3/modelzoo/requirements.txt
cd ~/R_2.0.3/modelzoo/modelzoo/fc_mnist/pytorch
```

From ce3555c29399908b6b48328b627e34bb1b66a1a4 Mon Sep 17 00:00:00 2001
From: Bill Arnold
Date: Fri, 1 Dec 2023 11:00:53 -0500
Subject: [PATCH 6/6] comment out instructions for Unet sample while we
 continue to debug it

---
 docs/ai-testbed/cerebras/example-programs.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/ai-testbed/cerebras/example-programs.md b/docs/ai-testbed/cerebras/example-programs.md
index 270398e96..6cff563d1 100644
--- a/docs/ai-testbed/cerebras/example-programs.md
+++ b/docs/ai-testbed/cerebras/example-programs.md
@@ -15,6 +15,7 @@ git checkout Release_2.0.3
+<!---
cp -r /software/cerebras/model_zoo/anl_shared/ ~/R_2.0.3/anl_shared
+--->
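For readers following the whole series: its end state amounts to the one-time setup below, a sketch consolidating commands that appear in the patches above (the wheel pinned by PATCH 1 is cerebras_pytorch==2.0.2, and the modelzoo is checked out at Release_2.0.3):

```console
mkdir ~/R_2.0.3
chmod a+x ~/R_2.0.3/
cd ~/R_2.0.3
/software/cerebras/python3.8/bin/python3.8 -m venv venv_cerebras_pt
source venv_cerebras_pt/bin/activate
pip install --upgrade pip
pip install cerebras_pytorch==2.0.2
git clone https://github.com/Cerebras/modelzoo.git
cd modelzoo
git checkout Release_2.0.3
pip install -r ~/R_2.0.3/modelzoo/requirements.txt
```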