diff --git a/fa23-team-d/EDA_Notebooks/extension.ipynb b/fa23-team-d/EDA_Notebooks/extension.ipynb
new file mode 100644
index 0000000..20062da
--- /dev/null
+++ b/fa23-team-d/EDA_Notebooks/extension.ipynb
@@ -0,0 +1,1630 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " service_date | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " half_trip_id | \n",
+ " stop_id | \n",
+ " time_point_id | \n",
+ " time_point_order | \n",
+ " point_type | \n",
+ " standard_type | \n",
+ " scheduled | \n",
+ " actual | \n",
+ " scheduled_headway | \n",
+ " headway | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2023-01-01 | \n",
+ " 01 | \n",
+ " Inbound | \n",
+ " 58061899.0 | \n",
+ " 110 | \n",
+ " hhgat | \n",
+ " 1 | \n",
+ " Startpoint | \n",
+ " Schedule | \n",
+ " 1900-01-01T06:05:00Z | \n",
+ " 1900-01-01T06:05:04Z | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2023-01-01 | \n",
+ " 01 | \n",
+ " Inbound | \n",
+ " 58061899.0 | \n",
+ " 67 | \n",
+ " maput | \n",
+ " 2 | \n",
+ " Midpoint | \n",
+ " Schedule | \n",
+ " 1900-01-01T06:09:00Z | \n",
+ " 1900-01-01T06:06:28Z | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2023-01-01 | \n",
+ " 01 | \n",
+ " Inbound | \n",
+ " 58061899.0 | \n",
+ " 72 | \n",
+ " cntsq | \n",
+ " 3 | \n",
+ " Midpoint | \n",
+ " Schedule | \n",
+ " 1900-01-01T06:12:00Z | \n",
+ " 1900-01-01T06:08:57Z | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2023-01-01 | \n",
+ " 01 | \n",
+ " Inbound | \n",
+ " 58061899.0 | \n",
+ " 75 | \n",
+ " mit | \n",
+ " 4 | \n",
+ " Midpoint | \n",
+ " Schedule | \n",
+ " 1900-01-01T06:15:00Z | \n",
+ " 1900-01-01T06:12:41Z | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2023-01-01 | \n",
+ " 01 | \n",
+ " Inbound | \n",
+ " 58061899.0 | \n",
+ " 79 | \n",
+ " hynes | \n",
+ " 5 | \n",
+ " Midpoint | \n",
+ " Schedule | \n",
+ " 1900-01-01T06:19:00Z | \n",
+ " 1900-01-01T06:16:35Z | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " service_date route_id direction_id half_trip_id stop_id time_point_id \\\n",
+ "0 2023-01-01 01 Inbound 58061899.0 110 hhgat \n",
+ "1 2023-01-01 01 Inbound 58061899.0 67 maput \n",
+ "2 2023-01-01 01 Inbound 58061899.0 72 cntsq \n",
+ "3 2023-01-01 01 Inbound 58061899.0 75 mit \n",
+ "4 2023-01-01 01 Inbound 58061899.0 79 hynes \n",
+ "\n",
+ " time_point_order point_type standard_type scheduled \\\n",
+ "0 1 Startpoint Schedule 1900-01-01T06:05:00Z \n",
+ "1 2 Midpoint Schedule 1900-01-01T06:09:00Z \n",
+ "2 3 Midpoint Schedule 1900-01-01T06:12:00Z \n",
+ "3 4 Midpoint Schedule 1900-01-01T06:15:00Z \n",
+ "4 5 Midpoint Schedule 1900-01-01T06:19:00Z \n",
+ "\n",
+ " actual scheduled_headway headway \n",
+ "0 1900-01-01T06:05:04Z NaN NaN \n",
+ "1 1900-01-01T06:06:28Z NaN NaN \n",
+ "2 1900-01-01T06:08:57Z NaN NaN \n",
+ "3 1900-01-01T06:12:41Z NaN NaN \n",
+ "4 1900-01-01T06:16:35Z NaN NaN "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Directory holding the 2023 MBTA bus arrival/departure CSV files\n",
+ "arr_dep_dir = '../data/MBTA_Website/MBTA_Bus_Arrival_Departure_Times_2023/'\n",
+ "\n",
+ "# os.listdir() returns entries in arbitrary order and may include non-CSV\n",
+ "# entries, so keep only *.csv names and sort them before taking the first six;\n",
+ "# this makes the loaded subset the same on every machine.\n",
+ "csv_files = sorted(\n",
+ "    os.path.join(arr_dep_dir, name)\n",
+ "    for name in os.listdir(arr_dep_dir)\n",
+ "    if name.lower().endswith('.csv')\n",
+ ")[:6]\n",
+ "\n",
+ "# Read each file once, then stack them into a single frame with a fresh index\n",
+ "dfs = [pd.read_csv(path) for path in csv_files]\n",
+ "df = pd.concat(dfs, axis=0, ignore_index=True)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading sheet: Age\n",
+ "Reading sheet: Household Type\n",
+ "Reading sheet: Race\n",
+ "Reading sheet: Group Quarters Population\n",
+ "Reading sheet: Nativity\n",
+ "Reading sheet: Geographic Mobility\n",
+ "Reading sheet: Educational Attainment\n",
+ "Reading sheet: School Enrollment\n",
+ "Reading sheet: Means of Commuting\n",
+ "Reading sheet: Travel Time to Work\n",
+ "Reading sheet: Place of Work\n",
+ "Reading sheet: Per Capita Income\n",
+ "Reading sheet: Occupation\n",
+ "Reading sheet: Industries\n",
+ "Reading sheet: Labor Force\n",
+ "Reading sheet: Household Income\n",
+ "Reading sheet: Family Income\n",
+ "Reading sheet: Housing Tenure\n",
+ "Reading sheet: Bedrooms\n",
+ "Reading sheet: Vacancy Rates\n",
+ "Reading sheet: Vehicles per Household\n",
+ "Reading sheet: Poverty Rates\n",
+ "Reading sheet: Poverty Rates by Age\n"
+ ]
+ }
+ ],
+ "source": [
+ "file_path = '../data/2015-2019_neighborhood_tables_2021.12.21.xlsm'\n",
+ "\n",
+ "# Open the workbook once; every sheet below is parsed from this handle\n",
+ "xl = pd.ExcelFile(file_path)\n",
+ "sheet_names = xl.sheet_names\n",
+ "\n",
+ "# Maps sheet name -> DataFrame parsed from that sheet\n",
+ "dfs = {}\n",
+ "\n",
+ "for sheet_name in sheet_names:\n",
+ "    print(f'Reading sheet: {sheet_name}')\n",
+ "    # parse() accepts either a sheet name or a sheet index\n",
+ "    df2 = xl.parse(sheet_name)\n",
+ "    dfs.update({sheet_name: df2})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "neighborhood_data = {\n",
+ " 'Place': ['Allston', 'Back Bay', 'Beacon Hill', 'Brighton', 'Charlestown', 'Chinatown', 'Dorchester', 'Downtown',\n",
+ " 'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain', 'Longwood', 'Mattapan', 'Mission Hill', 'North End',\n",
+ " 'Roslindale', 'Roxbury', 'South Boston', 'South Boston Waterfront', 'South End', 'West End', 'West Roxbury'],\n",
+ " 'Latitude': [42.355537, 42.350707, 42.358708, 42.3489, 42.3787, 42.3492, 42.2995, 42.3555, 42.375097,\n",
+ " 42.345187, 42.2557, 42.311605, 42.3389, 42.272321, 42.333265, 42.365097, 42.291209, 42.3126,\n",
+ " 42.333431, 42.351938, 42.341310, 42.363919, 42.279265],\n",
+ " 'Longitude': [-71.132749, -71.079730, -71.067829, -71.1605, -71.0616, -71.0621, -71.0649, -71.0565, -71.039217,\n",
+ " -71.104599, -71.1256, -71.114384, -71.1072, -71.086995, -71.102029, -71.054495, -71.124497, -71.0899,\n",
+ " -71.049495, -71.049883, -71.077230, -71.063899, -71.149497],\n",
+ "}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Place | \n",
+ " Latitude | \n",
+ " Longitude | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Allston | \n",
+ " 42.355537 | \n",
+ " -71.132749 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Back Bay | \n",
+ " 42.350707 | \n",
+ " -71.079730 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Beacon Hill | \n",
+ " 42.358708 | \n",
+ " -71.067829 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Brighton | \n",
+ " 42.348900 | \n",
+ " -71.160500 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Charlestown | \n",
+ " 42.378700 | \n",
+ " -71.061600 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Place Latitude Longitude\n",
+ "0 Allston 42.355537 -71.132749\n",
+ "1 Back Bay 42.350707 -71.079730\n",
+ "2 Beacon Hill 42.358708 -71.067829\n",
+ "3 Brighton 42.348900 -71.160500\n",
+ "4 Charlestown 42.378700 -71.061600"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "neighborhood_data_df = pd.DataFrame(neighborhood_data)\n",
+ "\n",
+ "neighborhood_data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "# Resolve the file relative to the notebook, like the other data files, instead\n",
+ "# of a user-specific absolute path; the with-block closes the handle after parsing.\n",
+ "with open('../data/stops.json') as stops_file:\n",
+ "    bus_stops_data = json.load(stops_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Place Latitude Longitude num_stops\n",
+ "0 Allston 42.355537 -71.132749 507\n",
+ "1 Back Bay 42.350707 -71.079730 49\n",
+ "2 Beacon Hill 42.358708 -71.067829 7\n",
+ "3 Brighton 42.348900 -71.160500 730\n",
+ "4 Charlestown 42.378700 -71.061600 876\n",
+ "5 Chinatown 42.349200 -71.062100 30\n",
+ "6 Dorchester 42.299500 -71.064900 487\n",
+ "7 Downtown 42.355500 -71.056500 26\n",
+ "8 East Boston 42.375097 -71.039217 1229\n",
+ "9 Fenway 42.345187 -71.104599 66\n",
+ "10 Hyde Park 42.255700 -71.125600 375\n",
+ "11 Jamaica Plain 42.311605 -71.114384 66\n",
+ "12 Longwood 42.338900 -71.107200 40\n",
+ "13 Mattapan 42.272321 -71.086995 715\n",
+ "14 Mission Hill 42.333265 -71.102029 78\n",
+ "15 North End 42.365097 -71.054495 12\n",
+ "16 Roslindale 42.291209 -71.124497 104\n",
+ "17 Roxbury 42.312600 -71.089900 130\n",
+ "18 South Boston 42.333431 -71.049495 144\n",
+ "19 South Boston Waterfront 42.351938 -71.049883 28\n",
+ "20 South End 42.341310 -71.077230 85\n",
+ "21 West End 42.363919 -71.063899 12\n",
+ "22 West Roxbury 42.279265 -71.149497 234\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from geopy.distance import geodesic\n",
+ "\n",
+ "# (place, latitude, longitude) for every neighborhood, built once instead of\n",
+ "# once per stop. Uses the neighborhood_data dict defined in an earlier cell\n",
+ "# rather than re-declaring the same literal here.\n",
+ "neighborhood_centroids = list(zip(neighborhood_data['Place'],\n",
+ "                                  neighborhood_data['Latitude'],\n",
+ "                                  neighborhood_data['Longitude']))\n",
+ "\n",
+ "# Start every neighborhood at zero so places without any stop still get a count\n",
+ "neighborhood_counts = {neighborhood: 0 for neighborhood in neighborhood_data['Place']}\n",
+ "\n",
+ "# NOTE(review): every stop is assigned to its nearest centroid with no distance\n",
+ "# cutoff, so stops far from any centroid are still counted -- confirm whether a\n",
+ "# maximum distance should apply.\n",
+ "for stop, stop_data in bus_stops_data.items():\n",
+ "    stop_coord = (stop_data['latitude'], stop_data['longitude'])\n",
+ "    # Nearest centroid wins; min() keeps the first neighborhood on a tie\n",
+ "    closest_neighborhood, _, _ = min(\n",
+ "        neighborhood_centroids,\n",
+ "        key=lambda centroid: geodesic(stop_coord, centroid[1:]).meters,\n",
+ "    )\n",
+ "    # Record the assignment on the stop itself and tally it\n",
+ "    stop_data['neighbourhood'] = closest_neighborhood\n",
+ "    neighborhood_counts[closest_neighborhood] += 1\n",
+ "\n",
+ "# Add a num_stops column by looking each Place up in the tally\n",
+ "neighborhood_data_df['num_stops'] = neighborhood_data_df['Place'].map(neighborhood_counts)\n",
+ "\n",
+ "# Last expression of the cell: shown with the rich DataFrame display\n",
+ "neighborhood_data_df\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Neighborhood | \n",
+ " Total Public Transit | \n",
+ " Bus | \n",
+ " Subway or Train | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Boston | \n",
+ " 33.2% | \n",
+ " 13.5% | \n",
+ " 19.7% | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Allston | \n",
+ " 38.2% | \n",
+ " 18.8% | \n",
+ " 19.4% | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Back Bay | \n",
+ " 24.8% | \n",
+ " 2.9% | \n",
+ " 21.9% | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Beacon Hill | \n",
+ " 20.3% | \n",
+ " 2.2% | \n",
+ " 18.1% | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Brighton | \n",
+ " 33.1% | \n",
+ " 14.5% | \n",
+ " 18.7% | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Neighborhood Total Public Transit Bus Subway or Train\n",
+ "0 Boston 33.2% 13.5% 19.7%\n",
+ "1 Allston 38.2% 18.8% 19.4%\n",
+ "2 Back Bay 24.8% 2.9% 21.9%\n",
+ "3 Beacon Hill 20.3% 2.2% 18.1%\n",
+ "4 Brighton 33.1% 14.5% 18.7%"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "means_of_comm = {\n",
+ " 'Boston': {'Total Public Transit': '33.2%', 'Bus': '13.5%', 'Subway or Train': '19.7%'},\n",
+ " 'Allston': {'Total Public Transit': '38.2%', 'Bus': '18.8%', 'Subway or Train': '19.4%'},\n",
+ " 'Back Bay': {'Total Public Transit': '24.8%', 'Bus': '2.9%', 'Subway or Train': '21.9%'},\n",
+ " 'Beacon Hill': {'Total Public Transit': '20.3%', 'Bus': '2.2%', 'Subway or Train': '18.1%'},\n",
+ " 'Brighton': {'Total Public Transit': '33.1%', 'Bus': '14.5%', 'Subway or Train': '18.7%'},\n",
+ " 'Charlestown': {'Total Public Transit': '25.3%', 'Bus': '12.3%', 'Subway or Train': '12.9%'},\n",
+ " 'Dorchester': {'Total Public Transit': '36.5%', 'Bus': '19.6%', 'Subway or Train': '16.8%'},\n",
+ " 'Downtown': {'Total Public Transit': '20.1%', 'Bus': '2.6%', 'Subway or Train': '17.5%'},\n",
+ " 'East Boston': {'Total Public Transit': '53.6%', 'Bus': '4.7%', 'Subway or Train': '49.0%'},\n",
+ " 'Fenway': {'Total Public Transit': '26.3%', 'Bus': '11.0%', 'Subway or Train': '15.2%'},\n",
+ " 'Hyde Park': {'Total Public Transit': '24.6%', 'Bus': '12.1%', 'Subway or Train': '12.4%'},\n",
+ " 'Jamaica Plain': {'Total Public Transit': '41.8%', 'Bus': '9.9%', 'Subway or Train': '31.9%'},\n",
+ " 'Longwood': {'Total Public Transit': '14.4%', 'Bus': '6.5%', 'Subway or Train': '7.9%'},\n",
+ " 'Mattapan': {'Total Public Transit': '33.4%', 'Bus': '20.0%', 'Subway or Train': '13.4%'},\n",
+ " 'Mission Hill': {'Total Public Transit': '38.6%', 'Bus': '10.4%', 'Subway or Train': '28.2%'},\n",
+ " 'North End': {'Total Public Transit': '23.9%', 'Bus': '0.9%', 'Subway or Train': '23.1%'},\n",
+ " 'Roslindale': {'Total Public Transit': '29.4%', 'Bus': '9.4%', 'Subway or Train': '20.0%'},\n",
+ " 'Roxbury': {'Total Public Transit': '41.9%', 'Bus': '31.2%', 'Subway or Train': '10.7%'},\n",
+ " 'South Boston': {'Total Public Transit': '36.6%', 'Bus': '22.6%', 'Subway or Train': '14.0%'},\n",
+ " 'South Boston Waterfront': {'Total Public Transit': '18.3%', 'Bus': '6.9%', 'Subway or Train': '11.4%'},\n",
+ " 'South End': {'Total Public Transit': '25.8%', 'Bus': '8.5%', 'Subway or Train': '17.3%'},\n",
+ " 'West End': {'Total Public Transit': '22.0%', 'Bus': '3.0%', 'Subway or Train': '19.0%'},\n",
+ " 'West Roxbury': {'Total Public Transit': '18.6%', 'Bus': '6.7%', 'Subway or Train': '11.9%'}\n",
+ "}\n",
+ "# data from https://data.boston.gov/dataset/neighborhood-demographics/resource/d8c23c6a-b868-4ba4-8a3b-b9615a21be07\n",
+ "\n",
+ "# The outer keys (place names) become the row index after the transpose;\n",
+ "# reset_index() then promotes them to a column, which is named 'Neighborhood'.\n",
+ "# Renaming by label avoids depending on the order of the remaining columns.\n",
+ "means_of_comm_df = (\n",
+ "    pd.DataFrame(means_of_comm)\n",
+ "    .transpose()\n",
+ "    .reset_index()\n",
+ "    .rename(columns={'index': 'Neighborhood'})\n",
+ ")\n",
+ "means_of_comm_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------------+-----------+------+\n",
+ "| Place | num_stops | Bus |\n",
+ "+-------------------------+-----------+------+\n",
+ "| Roxbury | 130.0 | 31.2 |\n",
+ "| South Boston | 144.0 | 22.6 |\n",
+ "| Mattapan | 715.0 | 20.0 |\n",
+ "| Dorchester | 487.0 | 19.6 |\n",
+ "| Allston | 507.0 | 18.8 |\n",
+ "| Brighton | 730.0 | 14.5 |\n",
+ "| Charlestown | 876.0 | 12.3 |\n",
+ "| Hyde Park | 375.0 | 12.1 |\n",
+ "| Fenway | 66.0 | 11.0 |\n",
+ "| Mission Hill | 78.0 | 10.4 |\n",
+ "| Jamaica Plain | 66.0 | 9.9 |\n",
+ "| Roslindale | 104.0 | 9.4 |\n",
+ "| South End | 85.0 | 8.5 |\n",
+ "| South Boston Waterfront | 28.0 | 6.9 |\n",
+ "| West Roxbury | 234.0 | 6.7 |\n",
+ "| Longwood | 40.0 | 6.5 |\n",
+ "| East Boston | 1229.0 | 4.7 |\n",
+ "| West End | 12.0 | 3.0 |\n",
+ "| Back Bay | 49.0 | 2.9 |\n",
+ "| Downtown | 26.0 | 2.6 |\n",
+ "| Beacon Hill | 7.0 | 2.2 |\n",
+ "| North End | 12.0 | 0.9 |\n",
+ "+-------------------------+-----------+------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "from tabulate import tabulate\n",
+ "\n",
+ "# Outer join keeps places that appear in only one of the two tables\n",
+ "combined_df = pd.merge(\n",
+ "    neighborhood_data_df,\n",
+ "    means_of_comm_df,\n",
+ "    how='outer',\n",
+ "    left_on='Place',\n",
+ "    right_on='Neighborhood',\n",
+ ")\n",
+ "\n",
+ "# 'Neighborhood' duplicates 'Place' after the join\n",
+ "combined_df = combined_df.drop(columns='Neighborhood')\n",
+ "\n",
+ "# Strip the trailing '%' and convert to a number so the share sorts numerically\n",
+ "combined_df['Bus'] = pd.to_numeric(combined_df['Bus'].str.rstrip('%'))\n",
+ "\n",
+ "# Rows missing a place, stop count or bus share are left out of the table\n",
+ "bus_share_rows = combined_df[['Place', 'num_stops', 'Bus']].dropna()\n",
+ "bus_share_rows = bus_share_rows.sort_values(by='Bus', ascending=False)\n",
+ "table = tabulate(bus_share_rows, headers='keys', tablefmt='pretty', showindex=False)\n",
+ "\n",
+ "print(table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " stop_id Neighborhood\n",
+ "0 1 South End\n",
+ "1 2 South End\n",
+ "2 3 South End\n",
+ "3 4 South End\n",
+ "4 5 South End\n",
+ "... ... ...\n",
+ "5798 109901 East Boston\n",
+ "5799 109912 East Boston\n",
+ "5800 869411 Brighton\n",
+ "5801 869451 Brighton\n",
+ "5802 883321 Brighton\n",
+ "\n",
+ "[5803 rows x 2 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_neighborhoods = pd.DataFrame(neighborhood_data)\n",
+ "# Resolve the file relative to the notebook, like the other data files, so the\n",
+ "# cell does not depend on one user's home directory.\n",
+ "stop_data = pd.read_csv('../data/stops_locations.csv')\n",
+ "\n",
+ "\n",
+ "def haversine(coord1, coord2):\n",
+ "    \"\"\"Return the distance in miles between two coordinate pairs.\n",
+ "\n",
+ "    Despite the name, this calls geopy's geodesic() rather than computing\n",
+ "    the haversine formula. geodesic() expects (latitude, longitude) pairs.\n",
+ "    \"\"\"\n",
+ "    return geodesic(coord1, coord2).miles\n",
+ "\n",
+ "\n",
+ "def find_neighborhood(stop_coord):\n",
+ "    \"\"\"Return the 'Place' in df_neighborhoods whose centroid is closest to stop_coord.\"\"\"\n",
+ "    distances = df_neighborhoods.apply(\n",
+ "        lambda row: haversine(stop_coord, (row['Latitude'], row['Longitude'])), axis=1\n",
+ "    )\n",
+ "    return df_neighborhoods.loc[distances.idxmin(), 'Place']\n",
+ "\n",
+ "\n",
+ "# Tag every stop with its nearest neighborhood.\n",
+ "# NOTE(review): this passes (X, Y) where geodesic() expects (latitude, longitude);\n",
+ "# confirm that X holds latitude in stops_locations.csv (GIS exports often use X = longitude).\n",
+ "stop_data['Neighborhood'] = stop_data.apply(lambda row: find_neighborhood((row['X'], row['Y'])), axis=1)\n",
+ "\n",
+ "# One neighborhood per stop_id: keep the first one seen for each stop.\n",
+ "# first() is used instead of unique() plus a truthiness test on the resulting\n",
+ "# array, because the truth value of a multi-element array is ambiguous and\n",
+ "# raises ValueError.\n",
+ "neighborhoods_by_stop = stop_data.groupby('stop_id')['Neighborhood'].first().reset_index()\n",
+ "print(neighborhoods_by_stop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "47: ['South End' 'Fenway' 'Chinatown' 'Longwood' 'Mission Hill']\n",
+ "65: ['Fenway' 'Brighton' 'Allston' 'Longwood']\n",
+ "76: ['Allston' 'Brighton']\n",
+ "44: ['Roxbury' 'Mission Hill' 'South End']\n",
+ "CT2: ['Fenway' 'Longwood' 'Mission Hill' 'Charlestown' 'Back Bay' 'South End']\n",
+ "{'47': ['South End', 'Fenway', 'Chinatown', 'Longwood', 'Mission Hill'], '65': ['Fenway', 'Brighton', 'Allston', 'Longwood'], '76': ['Allston', 'Brighton'], '44': ['Roxbury', 'Mission Hill', 'South End'], 'CT2': ['Fenway', 'Longwood', 'Mission Hill', 'Charlestown', 'Back Bay', 'South End']}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Resolve the file relative to the notebook, like the other data files\n",
+ "route_ranking = pd.read_csv('../data/route_ranking.csv')\n",
+ "\n",
+ "# One row per route_id, holding arrays of its distinct stop_ids and time_point_orders\n",
+ "stop_ids_by_route = df.groupby('route_id').agg({'stop_id': 'unique', 'time_point_order': 'unique'}).reset_index()\n",
+ "\n",
+ "# Last five rows of the ranking.\n",
+ "# NOTE(review): the result list is named top_10 and 'most late', but only 5 routes are\n",
+ "# taken and the sort order of route_ranking.csv is not visible here -- confirm both.\n",
+ "route_ids = route_ranking[-5:]['route_id'].values\n",
+ "\n",
+ "top_10_most_late_routes_neighbourhoods = []\n",
+ "unique_neighborhoods_by_route = {}\n",
+ "for route_id in route_ids:\n",
+ "    route_rows = stop_ids_by_route.loc[stop_ids_by_route['route_id'] == route_id, 'stop_id']\n",
+ "    if route_rows.empty:\n",
+ "        # The route is ranked but absent from the loaded arrival/departure files\n",
+ "        print(f'{route_id}: no stops found in df, skipping')\n",
+ "        continue\n",
+ "    stop_ids = route_rows.values[0]\n",
+ "    # Distinct neighborhoods of the stops served by this route\n",
+ "    neighborhoods = neighborhoods_by_stop[neighborhoods_by_stop['stop_id'].isin(stop_ids)]['Neighborhood'].unique()\n",
+ "    # Keep first-seen order while de-duplicating across routes\n",
+ "    for neighborhood in neighborhoods:\n",
+ "        if neighborhood not in top_10_most_late_routes_neighbourhoods:\n",
+ "            top_10_most_late_routes_neighbourhoods.append(neighborhood)\n",
+ "    print(f'{route_id}: {neighborhoods}')\n",
+ "    unique_neighborhoods_by_route.setdefault(route_id, []).extend(neighborhoods.tolist())\n",
+ "print(unique_neighborhoods_by_route)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['South End',\n",
+ " 'Fenway',\n",
+ " 'Chinatown',\n",
+ " 'Longwood',\n",
+ " 'Mission Hill',\n",
+ " 'Brighton',\n",
+ " 'Allston',\n",
+ " 'Roxbury',\n",
+ " 'Charlestown',\n",
+ " 'Back Bay']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_10_most_late_routes_neighbourhoods"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "746_: ['Downtown' 'South Boston Waterfront']\n",
+ "171: ['South Boston' 'South End' 'East Boston']\n",
+ "SL2: ['South Boston Waterfront' 'Downtown']\n",
+ "SL1: ['Downtown' 'East Boston' 'South Boston Waterfront']\n",
+ "351: ['Brighton']\n",
+ "{'746_': ['Downtown', 'South Boston Waterfront'], '171': ['South Boston', 'South End', 'East Boston'], 'SL2': ['South Boston Waterfront', 'Downtown'], 'SL1': ['Downtown', 'East Boston', 'South Boston Waterfront'], '351': ['Brighton']}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Resolve the file relative to the notebook, like the other data files\n",
+ "route_ranking = pd.read_csv('../data/route_ranking.csv')\n",
+ "\n",
+ "# One row per route_id, holding arrays of its distinct stop_ids and time_point_orders\n",
+ "stop_ids_by_route = df.groupby('route_id').agg({'stop_id': 'unique', 'time_point_order': 'unique'}).reset_index()\n",
+ "\n",
+ "# First five rows of the ranking.\n",
+ "# NOTE(review): the result list is named top_10 and 'on time', but only 5 routes are\n",
+ "# taken and the sort order of route_ranking.csv is not visible here -- confirm both.\n",
+ "route_ids = route_ranking[:5]['route_id'].values\n",
+ "\n",
+ "top_10_on_time_routes_neighbourhoods = []\n",
+ "unique_neighborhoods_by_route = {}\n",
+ "for route_id in route_ids:\n",
+ "    route_rows = stop_ids_by_route.loc[stop_ids_by_route['route_id'] == route_id, 'stop_id']\n",
+ "    if route_rows.empty:\n",
+ "        # The route is ranked but absent from the loaded arrival/departure files\n",
+ "        print(f'{route_id}: no stops found in df, skipping')\n",
+ "        continue\n",
+ "    stop_ids = route_rows.values[0]\n",
+ "    # Distinct neighborhoods of the stops served by this route\n",
+ "    neighborhoods = neighborhoods_by_stop[neighborhoods_by_stop['stop_id'].isin(stop_ids)]['Neighborhood'].unique()\n",
+ "    # Keep first-seen order while de-duplicating across routes\n",
+ "    for neighborhood in neighborhoods:\n",
+ "        if neighborhood not in top_10_on_time_routes_neighbourhoods:\n",
+ "            top_10_on_time_routes_neighbourhoods.append(neighborhood)\n",
+ "    print(f'{route_id}: {neighborhoods}')\n",
+ "    unique_neighborhoods_by_route.setdefault(route_id, []).extend(neighborhoods.tolist())\n",
+ "print(unique_neighborhoods_by_route)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Downtown',\n",
+ " 'South Boston Waterfront',\n",
+ " 'South Boston',\n",
+ " 'South End',\n",
+ " 'East Boston',\n",
+ " 'Brighton']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_10_on_time_routes_neighbourhoods"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "age_data = {\n",
+ " 'Allston': {'Population': 19261, 'Median Age': 27.5},\n",
+ " 'Brighton': {'Population': 55297, 'Median Age': 30.8},\n",
+ " 'Charlestown': {'Population': 19890, 'Median Age': 35.7},\n",
+ " 'Dorchester': {'Population': 126909, 'Median Age': 33.4},\n",
+ " 'Downtown': {'Population': 18306, 'Median Age': 33.5},\n",
+ " 'East Boston': {'Population': 47263, 'Median Age': 30.6},\n",
+ " 'Fenway': {'Population': 33489, 'Median Age': 26.3},\n",
+ " 'Hyde Park': {'Population': 38924, 'Median Age': 39.4},\n",
+ " 'Jamaica Plain': {'Population': 40867, 'Median Age': 34.8},\n",
+ " 'Longwood': {'Population': 5351, 'Median Age': 20.2},\n",
+ " 'Mattapan': {'Population': 26659, 'Median Age': 36.7},\n",
+ " 'Mission Hill': {'Population': 17386, 'Median Age': 30.1},\n",
+ " 'North End': {'Population': 8749, 'Median Age': 31.1},\n",
+ " 'Roslindale': {'Population': 30021, 'Median Age': 39.8},\n",
+ " 'Roxbury': {'Population': 54161, 'Median Age': 32.5},\n",
+ " 'South Boston': {'Population': 36772, 'Median Age': 31.9},\n",
+ " 'South Boston Waterfront': {'Population': 4403, 'Median Age': 34.5},\n",
+ " 'South End': {'Population': 32571, 'Median Age': 37.1},\n",
+ " 'West End': {'Population': 6619, 'Median Age': 37.8},\n",
+ " 'West Roxbury': {'Population': 33526, 'Median Age': 42.8}\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------------+-----------+------------+\n",
+ "| Place | num_stops | Median Age |\n",
+ "+-------------------------+-----------+------------+\n",
+ "| West Roxbury | 234 | 42.8 |\n",
+ "| Roslindale | 104 | 39.8 |\n",
+ "| Hyde Park | 375 | 39.4 |\n",
+ "| West End | 12 | 37.8 |\n",
+ "| South End | 85 | 37.1 |\n",
+ "| Mattapan | 715 | 36.7 |\n",
+ "| Charlestown | 876 | 35.7 |\n",
+ "| Jamaica Plain | 66 | 34.8 |\n",
+ "| South Boston Waterfront | 28 | 34.5 |\n",
+ "| Downtown | 26 | 33.5 |\n",
+ "| Dorchester | 487 | 33.4 |\n",
+ "| Roxbury | 130 | 32.5 |\n",
+ "| South Boston | 144 | 31.9 |\n",
+ "| North End | 12 | 31.1 |\n",
+ "| Brighton | 730 | 30.8 |\n",
+ "| East Boston | 1229 | 30.6 |\n",
+ "| Mission Hill | 78 | 30.1 |\n",
+ "| Allston | 507 | 27.5 |\n",
+ "| Fenway | 66 | 26.3 |\n",
+ "| Longwood | 40 | 20.2 |\n",
+ "+-------------------------+-----------+------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "from tabulate import tabulate\n",
+ "\n",
+ "# Rows = neighborhoods, columns = Population / Median Age\n",
+ "age_data_df = pd.DataFrame(age_data).transpose()\n",
+ "age_data_df = age_data_df.reset_index()\n",
+ "age_data_df.columns = ['Neighborhood', 'Population', 'Median Age']\n",
+ "\n",
+ "# Outer join keeps places that appear in only one of the two tables\n",
+ "combined_df = pd.merge(\n",
+ "    neighborhood_data_df,\n",
+ "    age_data_df,\n",
+ "    how='outer',\n",
+ "    left_on='Place',\n",
+ "    right_on='Neighborhood',\n",
+ ")\n",
+ "combined_df['Median Age'] = pd.to_numeric(combined_df['Median Age'])\n",
+ "\n",
+ "# Keep only the columns shown, drop incomplete rows, oldest neighborhoods first\n",
+ "combined_df = combined_df[['Place', 'num_stops', 'Median Age']].dropna()\n",
+ "combined_df = combined_df.sort_values(by='Median Age', ascending=False)\n",
+ "\n",
+ "table = tabulate(combined_df, headers='keys', tablefmt='pretty', showindex=False)\n",
+ "\n",
+ "print(table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['West Roxbury' 'Roslindale' 'Hyde Park' 'West End' 'South End' 'Mattapan'\n",
+ " 'Charlestown' 'Jamaica Plain' 'South Boston Waterfront' 'Downtown'\n",
+ " 'Dorchester' 'Roxbury' 'South Boston' 'North End' 'Brighton'\n",
+ " 'East Boston' 'Mission Hill' 'Allston' 'Fenway' 'Longwood']\n",
+ "['South End', 'Fenway', 'Longwood', 'Mission Hill', 'Brighton', 'Allston', 'Roxbury', 'Charlestown']\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "