-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #66 from BU-Spark/main
summer wrap up
- Loading branch information
Showing
5 changed files
with
3 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPD+3xv+9V77fCOcYzVyChJ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["#install required packages\n","import numpy as np\n","import pandas as pd\n","from plotnine import *\n","from datetime import datetime"],"metadata":{"id":"p_1apd4BQeoW"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Question 1: End to End Time"],"metadata":{"id":"qS5GxufRQUvA"}},{"cell_type":"code","source":["def process_and_clean_files():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"____.csv\"\n","\n"," print(f'Processing file: {input_file}')\n","\n"," # Read CSV file into a DataFrame\n"," df = pd.read_csv(input_file)\n","\n"," # Keep only the required columns\n"," df = df[['route_id', 'stop_id', 'direction_id', 'half_trip_id', 'point_type', 'scheduled', 'actual']]\n","\n"," # Keep only rows where 'point_type' is 'Startpoint' or 'Endpoint'\n"," #df = df[df['point_type'].isin(['Startpoint', 'Endpoint'])]\n","\n"," # Convert time format columns\n"," df['scheduled'] = pd.to_datetime(df['scheduled']).dt.strftime('%H:%M:%S')\n"," df['actual'] = pd.to_datetime(df['actual']).dt.strftime('%H:%M:%S')\n","\n"," # Order rows\n"," df = df.sort_values(by=['route_id', 'half_trip_id'], ascending=[True, True])\n","\n"," # Save the cleaned DataFrame to a new CSV file\n"," df.to_csv(output_file, index=False)\n"," print(f'Cleaned file saved as: {output_file}')\n","\n","process_and_clean_files()"],"metadata":{"id":"5aWyfSrvRGfy"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def process_and_calculate_end_to_end_times():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"___.csv\"\n","\n"," print(f\"Processing file: {input_file}\")\n","\n"," # Read the file into a DataFrame\n"," df = 
pd.read_csv(input_file)\n","\n"," # Filter out rows where point_type is 'Startpoint' or 'Endpoint'\n"," startpoint = df[df['point_type'] == 'Startpoint']\n"," endpoint = df[df['point_type'] == 'Endpoint']\n","\n"," # Merge startpoint and endpoint data on 'route_id', 'direction_id', and 'half_trip_id'\n"," merged = pd.merge(startpoint, endpoint, on=['route_id', 'direction_id', 'half_trip_id'], suffixes=('_start', '_end'))\n","\n"," # Convert time columns to datetime format\n"," merged['scheduled_start'] = pd.to_datetime(merged['scheduled_start'])\n"," merged['scheduled_end'] = pd.to_datetime(merged['scheduled_end'])\n"," merged['actual_start'] = pd.to_datetime(merged['actual_start'])\n"," merged['actual_end'] = pd.to_datetime(merged['actual_end'])\n","\n"," # Calculate end-to-end time by subtracting scheduled times\n"," merged['scheduled_end_to_end'] = (merged['scheduled_end'] - merged['scheduled_start']).dt.total_seconds() / 60\n"," merged['actual_end_to_end'] = (merged['actual_end'] - merged['actual_start']).dt.total_seconds() / 60\n","\n"," # Adjust end-to-end time for cases where end time is before start time (crossing over midnight)\n"," merged['scheduled_end_to_end'] = merged.apply(lambda row: row['scheduled_end_to_end'] + 1440 if row['scheduled_end'] < row['scheduled_start'] else row['scheduled_end_to_end'], axis=1)\n"," merged['actual_end_to_end'] = merged.apply(lambda row: row['actual_end_to_end'] + 1440 if row['actual_end'] < row['actual_start'] else row['actual_end_to_end'], axis=1)\n","\n"," # Drop rows with missing/null values in 'actual_end_to_end' column\n"," merged.dropna(subset=['actual_end_to_end'], inplace=True)\n","\n"," # Calculate the difference between scheduled and actual end-to-end times\n"," merged['difference'] = merged['scheduled_end_to_end'] - merged['actual_end_to_end']\n","\n"," # Select necessary columns\n"," result = merged[['route_id', 'direction_id', 'half_trip_id', 'scheduled_end_to_end', 'actual_end_to_end', 
'difference']]\n","\n"," # Save the result to a new file\n"," result.to_csv(output_file, index=False)\n"," print(f\"Result saved to {output_file}\\n\")\n","\n","# Process and calculate end-to-end times for all files\n","process_and_calculate_end_to_end_times()"],"metadata":{"id":"CZ0q-WhJR3Fi"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Question 2: Route Lateness"],"metadata":{"id":"jOK2Qu5kSvQ4"}},{"cell_type":"code","source":["def process_and_calculate_route_average_lateness():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"___.csv\"\n","\n"," print(f\"Processing file: {input_file}\")\n","\n"," # Read the dataset\n"," df = pd.read_csv(input_file)\n","\n"," # Convert time columns to datetime format\n"," df['scheduled'] = pd.to_datetime(df['scheduled'])\n"," df['actual'] = pd.to_datetime(df['actual'])\n","\n"," # Calculate lateness\n"," df['lateness'] = abs((df['scheduled'] - df['actual']).dt.total_seconds() / 60) # Convert to minutes\n","\n"," # Group by 'route_id' and calculate average lateness across all stops\n"," route_average_lateness = df.groupby('route_id')['lateness'].mean().reset_index()\n"," route_average_lateness.rename(columns={'lateness': 'average_lateness'}, inplace=True)\n","\n"," # Save the result to a new file\n"," route_average_lateness.to_csv(output_file, index=False)\n"," print(f\"Average lateness per route for month {month} saved to {output_file}\\n\")\n","\n","# Process and calculate route average lateness for all files\n","process_and_calculate_route_average_lateness()\n"],"metadata":{"id":"Ls5nh2zQSyCU"},"execution_count":null,"outputs":[]}]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
The code is located in this Git repository, but the produced CSV files must be stored on Google Drive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Summer 2024 Wrap-Up: Spark! X-Lab |
Binary file not shown.