diff --git a/modules/model_experimentation/proof_of_concept.ipynb b/modules/model_experimentation/proof_of_concept.ipynb index 6a28acd..1a80d3c 100644 --- a/modules/model_experimentation/proof_of_concept.ipynb +++ b/modules/model_experimentation/proof_of_concept.ipynb @@ -13,7 +13,7 @@ "id": "fca08236-d4f1-43b3-aaf0-94fad87555eb", "metadata": {}, "source": [ - "### Step 1: Extracting Ground Truth Text\n", + "### Extracting Ground Truth Text\n", "\n", "This steps assumes you have two directories in the same directory as this notebook:\n", "- ./racist_deeds_text\n", @@ -98,7 +98,7 @@ "id": "64cb1503-2e15-49a1-aec7-2b9d39cb7109", "metadata": {}, "source": [ - "### Step 2: Establishing a Baseline with Bag of Words and Logistic Regression\n", + "### Establishing a Baseline with Bag of Words and Logistic Regression\n", "\n", "The following code runs bag of words and logistic regression on the ground truth dataset of 750 non-racist deeds, and 61 racist deeds." ] @@ -243,7 +243,7 @@ "id": "67348b02", "metadata": {}, "source": [ - "Training a LSTM(Long short term memory) neural network model to classify the text data. " + "### Training an LSTM (Long Short-Term Memory) NN Model to Classify Data" ] }, { @@ -253,9 +253,9 @@ "source": [ "Step 1: Data Loading and Preprocessing\n", "\n", - "1.Tokenizing: Converts words to integer sequences using the top 10,000 most common words, assigning out-of-vocabulary words a special token.\n", - "2. Sequencing: Turns each text into a sequence of integers based on word indices.\n", - "3. Padding: Pads sequences to a uniform length of 200 tokens, ensuring consistent input shape for models." + "- Tokenizing: Converts words to integer sequences using the top 10,000 most common words, assigning out-of-vocabulary words a special token.\n", + "- Sequencing: Turns each text into a sequence of integers based on word indices.\n", + "- Padding: Pads sequences to a uniform length of 200 tokens, ensuring consistent input shape for models." 
] }, { @@ -273,22 +273,17 @@ "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Load the preprocessed data from the pickled file\n", "preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')\n", "\n", - "# Extract texts and labels\n", "texts = preprocessed_data['original_text']\n", "labels = preprocessed_data['is_racist']\n", "\n", - "# Initialize the tokenizer with a vocabulary size of 10,000\n", "vocab_size = 10000\n", "tokenizer = Tokenizer(num_words=vocab_size, oov_token=\"\")\n", "tokenizer.fit_on_texts(texts)\n", "\n", - "# Convert texts to sequences of integers\n", "sequences = tokenizer.texts_to_sequences(texts)\n", "\n", - "# Pad the sequences to ensure uniform length\n", "max_length = 200 \n", "padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')" ] @@ -298,7 +293,7 @@ "id": "db143694", "metadata": {}, "source": [ - "Step 2: Glove embeddings \n", + "Step 2: GloVe Embeddings \n", "\n", "(Need to retrieve glove.6B.100d.txt from https://nlp.stanford.edu/projects/glove/ and find the \"Pre-trained word vectors\" section and download the glove.6B.zip file)\n", "\n", @@ -366,7 +361,6 @@ } ], "source": [ - "# Define the LSTM model architecture\n", "model = Sequential()\n", "model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \n", " weights=[embedding_matrix], input_length=max_length, trainable=False))\n", @@ -375,7 +369,6 @@ "model.add(LSTM(32))\n", "model.add(Dense(1, activation='sigmoid'))\n", "\n", - "# Compile the model\n", "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])" ] }, @@ -423,10 +416,8 @@ } ], "source": [ - "# Split the data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.3, random_state=42)\n", "\n", - "# Train the model\n", "history = model.fit(X_train, y_train, epochs=10, batch_size=32, 
validation_data=(X_test, y_test))" ] }, @@ -456,7 +447,6 @@ } ], "source": [ - "# Evaluate the model on the test data\n", "test_loss, test_accuracy = model.evaluate(X_test, y_test)\n", "print(f\"Test Accuracy: {test_accuracy:.2f}\")" ] @@ -547,7 +537,6 @@ "y_pred_prob = model.predict(X_test)\n", "y_pred = (y_pred_prob > 0.5).astype(int)\n", "\n", - "# 1. Confusion Matrix\n", "conf_matrix = confusion_matrix(y_test, y_pred)\n", "plt.figure(figsize=(6, 4))\n", "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')\n", @@ -556,7 +545,6 @@ "plt.title('Confusion Matrix')\n", "plt.show()\n", "\n", - "# 2. ROC Curve and AUC\n", "fpr, tpr, _ = roc_curve(y_test, y_pred_prob)\n", "roc_auc = auc(fpr, tpr)\n", "\n", @@ -1424,7 +1412,6 @@ "import numpy as np\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "\n", - "# Paths to folders with new deeds to evaluate\n", "racist_folder = 'racist_deeds_text'\n", "non_racist_folder = 'non_racist_deeds_text'\n", "\n", @@ -1482,7 +1469,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1496,7 +1483,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.10.12" } }, "nbformat": 4,