From e785aae0d87b7374e887dc83e347552023cbf97f Mon Sep 17 00:00:00 2001
From: Jacob Stein
Date: Sun, 3 Nov 2024 19:03:44 -0800
Subject: [PATCH] Update PoC

---
 .../proof_of_concept.ipynb | 31 ++++++-------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/modules/model_experimentation/proof_of_concept.ipynb b/modules/model_experimentation/proof_of_concept.ipynb
index 6a28acd..1a80d3c 100644
--- a/modules/model_experimentation/proof_of_concept.ipynb
+++ b/modules/model_experimentation/proof_of_concept.ipynb
@@ -13,7 +13,7 @@
    "id": "fca08236-d4f1-43b3-aaf0-94fad87555eb",
    "metadata": {},
    "source": [
-    "### Step 1: Extracting Ground Truth Text\n",
+    "### Extracting Ground Truth Text\n",
     "\n",
     "This step assumes you have two directories in the same directory as this notebook:\n",
     "- ./racist_deeds_text\n",
@@ -98,7 +98,7 @@
    "id": "64cb1503-2e15-49a1-aec7-2b9d39cb7109",
    "metadata": {},
    "source": [
-    "### Step 2: Establishing a Baseline with Bag of Words and Logistic Regression\n",
+    "### Establishing a Baseline with Bag of Words and Logistic Regression\n",
     "\n",
     "The following code runs bag of words and logistic regression on the ground truth dataset of 750 non-racist deeds and 61 racist deeds."
    ]
   },
   {
@@ -243,7 +243,7 @@
    "id": "67348b02",
    "metadata": {},
    "source": [
-    "Training a LSTM(Long short term memory) neural network model to classify the text data. "
+    "### Training an LSTM (Long Short-Term Memory) NN Model to Classify Data"
    ]
   },
   {
@@ -253,9 +253,9 @@
    "source": [
     "Step 1: Data Loading and Preprocessing\n",
     "\n",
-    "1.Tokenizing: Converts words to integer sequences using the top 10,000 most common words, assigning out-of-vocabulary words a special token.\n",
-    "2. Sequencing: Turns each text into a sequence of integers based on word indices.\n",
-    "3. Padding: Pads sequences to a uniform length of 200 tokens, ensuring consistent input shape for models."
+    "- Tokenizing: Converts words to integer sequences using the top 10,000 most common words, assigning out-of-vocabulary words a special token.\n",
+    "- Sequencing: Turns each text into a sequence of integers based on word indices.\n",
+    "- Padding: Pads sequences to a uniform length of 200 tokens, ensuring consistent input shape for models."
    ]
   },
   {
@@ -273,22 +273,17 @@
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
-    "# Load the preprocessed data from the pickled file\n",
    "preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')\n",
    "\n",
-    "# Extract texts and labels\n",
    "texts = preprocessed_data['original_text']\n",
    "labels = preprocessed_data['is_racist']\n",
    "\n",
-    "# Initialize the tokenizer with a vocabulary size of 10,000\n",
    "vocab_size = 10000\n",
    "tokenizer = Tokenizer(num_words=vocab_size, oov_token=\"<OOV>\")\n",
    "tokenizer.fit_on_texts(texts)\n",
    "\n",
-    "# Convert texts to sequences of integers\n",
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "\n",
-    "# Pad the sequences to ensure uniform length\n",
    "max_length = 200 \n",
    "padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')"
    ]
   },
@@ -298,7 +293,7 @@
    "id": "db143694",
    "metadata": {},
    "source": [
-    "Step 2: Glove embeddings \n",
+    "Step 2: GloVe Embeddings\n",
     "\n",
     "(Need to retrieve glove.6B.100d.txt from https://nlp.stanford.edu/projects/glove/: find the \"Pre-trained word vectors\" section and download the glove.6B.zip file.)\n",
     "\n",
@@ -366,7 +361,6 @@
     }
    ],
    "source": [
-    "# Define the LSTM model architecture\n",
     "model = Sequential()\n",
     "model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \n",
     " weights=[embedding_matrix], input_length=max_length, trainable=False))\n",
     "model.add(LSTM(64, return_sequences=True))\n",
     "model.add(Dropout(0.5))\n",
     "model.add(LSTM(32))\n",
     "model.add(Dense(1, activation='sigmoid'))\n",
     "\n",
-    "# Compile the model\n",
     "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])"
    ]
   },
@@ -423,10 +416,8 @@
     }
    ],
    "source": [
-    "# Split the data into training and testing sets\n",
     "X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.3, random_state=42)\n",
     "\n",
-    "# Train the model\n",
     "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
    ]
   },
@@ -456,7 +447,6 @@
     }
    ],
    "source": [
-    "# Evaluate the model on the test data\n",
     "test_loss, test_accuracy = model.evaluate(X_test, y_test)\n",
     "print(f\"Test Accuracy: {test_accuracy:.2f}\")"
    ]
   },
@@ -547,7 +537,6 @@
     "y_pred_prob = model.predict(X_test)\n",
     "y_pred = (y_pred_prob > 0.5).astype(int)\n",
     "\n",
-    "# 1. Confusion Matrix\n",
     "conf_matrix = confusion_matrix(y_test, y_pred)\n",
     "plt.figure(figsize=(6, 4))\n",
     "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')\n",
     "plt.xlabel('Predicted')\n",
     "plt.ylabel('Actual')\n",
     "plt.title('Confusion Matrix')\n",
     "plt.show()\n",
     "\n",
-    "# 2. ROC Curve and AUC\n",
     "fpr, tpr, _ = roc_curve(y_test, y_pred_prob)\n",
     "roc_auc = auc(fpr, tpr)\n",
     "\n",
@@ -1424,7 +1412,6 @@
     "import numpy as np\n",
     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
     "\n",
-    "# Paths to folders with new deeds to evaluate\n",
     "racist_folder = 'racist_deeds_text'\n",
     "non_racist_folder = 'non_racist_deeds_text'\n",
     "\n",
@@ -1482,7 +1469,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -1496,7 +1483,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.6"
+   "version": "3.10.12"
  }
 },
 "nbformat": 4,
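The model-definition hunk above references embedding_matrix and embedding_dim, which are built in a notebook cell this patch does not touch. For readers without the full notebook, here is a minimal sketch of the standard GloVe-loading step it relies on; the file path, and the reuse of tokenizer and vocab_size from the tokenization cell in this patch, are assumptions rather than part of the diff:

# Sketch (not part of the patch): build the embedding matrix the LSTM model expects.
# Assumes glove.6B.100d.txt (from https://nlp.stanford.edu/projects/glove/) sits next
# to the notebook, and reuses `tokenizer` and `vocab_size` from the tokenization cell.
import numpy as np

embedding_dim = 100  # matches glove.6B.100d.txt

# Parse "word v1 v2 ... v100" lines into a word -> vector lookup.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i holds the GloVe vector for the tokenizer's word index i;
# words with no pretrained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

This lines up with the Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False) layer in the patch: row i initializes the vector for token id i, and trainable=False keeps the pretrained vectors frozen during training.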