-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #66 from BU-Spark/main
summer wrap up
- Loading branch information
Showing
5 changed files
with
3 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPD+3xv+9V77fCOcYzVyChJ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["#install required packages\n","import numpy as np\n","import pandas as pd\n","from plotnine import *\n","from datetime import datetime"],"metadata":{"id":"p_1apd4BQeoW"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Question 1: End to End Time"],"metadata":{"id":"qS5GxufRQUvA"}},{"cell_type":"code","source":["def process_and_clean_files():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"____.csv\"\n","\n"," print(f'Processing file: {input_file}')\n","\n"," # Read CSV file into a DataFrame\n"," df = pd.read_csv(input_file)\n","\n"," # Keep only the required columns\n"," df = df[['route_id', 'stop_id', 'direction_id', 'half_trip_id', 'point_type', 'scheduled', 'actual']]\n","\n"," # Keep only rows where 'point_type' is 'Startpoint' or 'Endpoint'\n"," #df = df[df['point_type'].isin(['Startpoint', 'Endpoint'])]\n","\n"," # Convert time format columns\n"," df['scheduled'] = pd.to_datetime(df['scheduled']).dt.strftime('%H:%M:%S')\n"," df['actual'] = pd.to_datetime(df['actual']).dt.strftime('%H:%M:%S')\n","\n"," # Order rows\n"," df = df.sort_values(by=['route_id', 'half_trip_id'], ascending=[True, True])\n","\n"," # Save the cleaned DataFrame to a new CSV file\n"," df.to_csv(output_file, index=False)\n"," print(f'Cleaned file saved as: {output_file}')\n","\n","process_and_clean_files()"],"metadata":{"id":"5aWyfSrvRGfy"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def process_and_calculate_end_to_end_times():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"___.csv\"\n","\n"," print(f\"Processing file: {input_file}\")\n","\n"," # Read the file into a DataFrame\n"," df = 
pd.read_csv(input_file)\n","\n"," # Filter out rows where point_type is 'Startpoint' or 'Endpoint'\n"," startpoint = df[df['point_type'] == 'Startpoint']\n"," endpoint = df[df['point_type'] == 'Endpoint']\n","\n"," # Merge startpoint and endpoint data on 'route_id', 'direction_id', and 'half_trip_id'\n"," merged = pd.merge(startpoint, endpoint, on=['route_id', 'direction_id', 'half_trip_id'], suffixes=('_start', '_end'))\n","\n"," # Convert time columns to datetime format\n"," merged['scheduled_start'] = pd.to_datetime(merged['scheduled_start'])\n"," merged['scheduled_end'] = pd.to_datetime(merged['scheduled_end'])\n"," merged['actual_start'] = pd.to_datetime(merged['actual_start'])\n"," merged['actual_end'] = pd.to_datetime(merged['actual_end'])\n","\n"," # Calculate end-to-end time by subtracting scheduled times\n"," merged['scheduled_end_to_end'] = (merged['scheduled_end'] - merged['scheduled_start']).dt.total_seconds() / 60\n"," merged['actual_end_to_end'] = (merged['actual_end'] - merged['actual_start']).dt.total_seconds() / 60\n","\n"," # Adjust end-to-end time for cases where end time is before start time (crossing over midnight)\n"," merged['scheduled_end_to_end'] = merged.apply(lambda row: row['scheduled_end_to_end'] + 1440 if row['scheduled_end'] < row['scheduled_start'] else row['scheduled_end_to_end'], axis=1)\n"," merged['actual_end_to_end'] = merged.apply(lambda row: row['actual_end_to_end'] + 1440 if row['actual_end'] < row['actual_start'] else row['actual_end_to_end'], axis=1)\n","\n"," # Drop rows with missing/null values in 'actual_end_to_end' column\n"," merged.dropna(subset=['actual_end_to_end'], inplace=True)\n","\n"," # Calculate the difference between scheduled and actual end-to-end times\n"," merged['difference'] = merged['scheduled_end_to_end'] - merged['actual_end_to_end']\n","\n"," # Select necessary columns\n"," result = merged[['route_id', 'direction_id', 'half_trip_id', 'scheduled_end_to_end', 'actual_end_to_end', 
'difference']]\n","\n"," # Save the result to a new file\n"," result.to_csv(output_file, index=False)\n"," print(f\"Result saved to {output_file}\\n\")\n","\n","# Process and calculate end-to-end times for all files\n","process_and_calculate_end_to_end_times()"],"metadata":{"id":"CZ0q-WhJR3Fi"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Question 2: Route Lateness"],"metadata":{"id":"jOK2Qu5kSvQ4"}},{"cell_type":"code","source":["def process_and_calculate_route_average_lateness():\n"," for month in range(1, 13):\n"," input_file = f\"___.csv\"\n"," output_file = f\"___.csv\"\n","\n"," print(f\"Processing file: {input_file}\")\n","\n"," # Read the dataset\n"," df = pd.read_csv(input_file)\n","\n"," # Convert time columns to datetime format\n"," df['scheduled'] = pd.to_datetime(df['scheduled'])\n"," df['actual'] = pd.to_datetime(df['actual'])\n","\n"," # Calculate lateness\n"," df['lateness'] = abs((df['scheduled'] - df['actual']).dt.total_seconds() / 60) # Convert to minutes\n","\n"," # Group by 'route_id' and calculate average lateness across all stops\n"," route_average_lateness = df.groupby('route_id')['lateness'].mean().reset_index()\n"," route_average_lateness.rename(columns={'lateness': 'average_lateness'}, inplace=True)\n","\n"," # Save the result to a new file\n"," route_average_lateness.to_csv(output_file, index=False)\n"," print(f\"Average lateness per route for month {month} saved to {output_file}\\n\")\n","\n","# Process and calculate route average lateness for all files\n","process_and_calculate_route_average_lateness()\n"],"metadata":{"id":"Ls5nh2zQSyCU"},"execution_count":null,"outputs":[]}]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
The code is located in this Git repository, but the produced CSV files must be stored on Google Drive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Summer 2024 Wrap-Up: Spark! X-Lab |
Binary file not shown.