# data processing
import pandas as pd
import geopandas as gpd
import numpy as np
import carto2gpd
import esri2gpd

# basic graphics
from matplotlib import pyplot as plt
import holoviews as hv
import hvplot.pandas
import altair as alt
import folium
import seaborn as sns
import matplotlib.colors as mcolors

# advanced graphics
import panel as pn
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
import param
import folium as fm

# census
import cenpy

# osm
import osmnx as ox

# bus ridership data
from sodapy import Socrata

# analysis
import math as mt
##!pip install kneed
from kneed import KneeLocator
## sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# shapely
from shapely.geometry import Polygon, LineString, Point

# regressoin
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# od analysis- dask:
import dask.array as da
import dask.dataframe as dd
from colorcet import fire
import geoviews as gv
import geoviews.tile_sources as gvts


import pandas as pd


# get the bus ridership data from API
client = Socrata("data.cityofchicago.org", None)
results = client.get("mq3i-nnqe", limit=11593) # length of data shown on their website

bus_ridership_df = pd.DataFrame.from_records(results)

# add geometry
bus_ridership_lat = [row['latitude'] for row in bus_ridership_df['location']]
bus_ridership_lat_df = pd.DataFrame(np.column_stack([bus_ridership_lat]))
bus_ridership_lon = [row['longitude'] for row in bus_ridership_df['location']]
bus_ridership_lon_df = pd.DataFrame(np.column_stack([bus_ridership_lon]))

bus_ridership_df['latitude'] = bus_ridership_lat_df
bus_ridership_df['longitude'] = bus_ridership_lon_df

bus_ridership_gdf = gpd.GeoDataFrame(
    bus_ridership_df, geometry=gpd.points_from_xy(x=bus_ridership_df.longitude, y=bus_ridership_df.latitude)
)

# create a gdf for bus stop data
bus_data = bus_ridership_gdf[['stop_id', 'on_street', 'cross_street', 'routes', 'boardings', 'alightings', 'geometry']]

# set the crs of bus stops to epsg:3857
bus_data.crs = "EPSG:4326"
bus_data = bus_data.to_crs("epsg:3857")

WARNING:root:Requests made without an app_token will be subject to strict throttling limits.


# make cenpy api connection
acs = cenpy.remote.APIConnection("ACSDT5Y2019")

# get block group gemetry from census data
# set geo server
acs.set_mapservice("tigerWMS_ACS2019")

# Use SQL to return geometries only for Cook County, IL
where_clause = f"STATE = {'17'} AND COUNTY = {'031'}"

# Query for block groups
cook_bg = acs.mapservice.layers[10].query(where=where_clause)

# calculate the total area of each block group
cook_bg["TOTALAREA"] = cook_bg.area

# import Chicago city limit
city_boundary_gdf = gpd.read_file("https://data.cityofchicago.org/resource/qqq8-j68g.geojson").to_crs('epsg:3857')

# get blcok groups within Chicaco
joined = gpd.sjoin(cook_bg, city_boundary_gdf, op='intersects', how="inner") 

# there is a huge block group in the Michigan Lake, find out it's GEOID by sorting the total area
cook_bg.sort_values(by = "TOTALAREA", ascending = False).head(1)['GEOID']

# remove it from the dataset
joined = joined[joined["GEOID"] != "170319900000"]

# we only need the geometry and GEOID for the block groups
chicago_bg = joined[['GEOID', 'geometry']]

# listing socioeconomics variables from 2019 5-year acs 
variables = [
    "NAME",
    "B01003_001E", # total pop
    "B25064_001E", # median rent
    "B19113_001E", # median income
    "B01002_001E", # median age
    "B15003_001E", # educational attainment for the pop 25 and over 
    "B15003_022E", # total bachelor's degree
    "B15003_023E", # master's degree                    
    "B15003_025E", # doctoral
    "B02001_002E", # white
    "B25003_001E", # tenure: total
    "B25003_002E", # tenure: owner
    "B08134_001E", # total commuters
    "B08134_061E", # commute by public transp
    "GEO_ID"
]

# query from the cenpy api
cook_data = acs.query(
    cols=variables,
    geo_unit="block group:*",
    geo_filter={"state": "17", 
                "county": "031", 
                "tract": "*"},
)

# rename the columns for easier references
dict = {"NAME": "NAME",
    "B01003_001E": "total_pop",
    "B25064_001E": "median_rent",
    "B19113_001E": "median_income",
    "B01002_001E": "median_age",
    "B15003_001E": "educational_attainment_pop25" ,
    "B15003_022E": "bachelor_degree",
    "B15003_023E": "master_degree"  ,                  
    "B15003_025E": "doctoral_degree",
    "B02001_002E": "white",
    "B25003_001E": "tenure_total",
    "B25003_002E": "tenure_owner",
    "B08134_001E": "commuter_total",
    "B08134_061E": "commuter_PuT",
    "GEO_ID": "GEO_ID"}

cook_data.rename(columns=dict, inplace=True)

# getting the GEOID for spatial join
cook_data["GEOID"] = [x[9:] for x in cook_data['GEO_ID']]

# merge the chicago block group gemoetry with socioeconomic variables by GEOID
chicago_data = chicago_bg.merge(cook_data, on = "GEOID") 

# converting strings to numeric input
for variable in chicago_data.columns:
    # Convert all variables
    if variable != "NAME" and variable != "GEO_ID" and variable != "geometry" and variable != "GEOID":
        chicago_data[variable] = chicago_data[variable].astype(float)
        
# spatial join the boarding and alighting from bus stop data
chicago_bg = gpd.sjoin(chicago_bg, bus_data, how = "left")

chicago_bg = chicago_bg[["GEOID", "boardings", "alightings"]]

# converting strings to numeric input
for variable in chicago_bg.columns:
    if variable != "GEOID":
    # Convert all variables
        chicago_bg[variable] = chicago_bg[variable].astype(float)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\pyproj\crs\crs.py:68: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))


# find the average abordings and alightings in each chicago block group
bus_bg = chicago_bg.groupby("GEOID")[["boardings", "alightings"]].mean()

# change all NaN value to 0 and round all boardings and alightings to integers
bus_bg = bus_bg.replace(np.nan, 0).round(0)

# merge the socioeconomics data with bus stop data
chicago_data = chicago_data.merge(bus_bg, on = "GEOID")


# data cleanning: replace error values with average values. 
## median rent: replace -666666666.0 in median rent with average. median rent of all tract.
chicago_data['median_rent'] = chicago_data['median_rent'].replace([-666666666.0],np.nan)
mean = chicago_data['median_rent'].mean()
chicago_data.loc[:,'median_rent'] = chicago_data.loc[:,'median_rent'].fillna(mean)

## median income: replace -666666666.0 with mean 
chicago_data['median_income'] = chicago_data['median_income'].replace([-666666666.0],np.nan)
mean = chicago_data['median_income'].mean()
chicago_data.loc[:,'median_income'] = chicago_data.loc[:,'median_income'].fillna(mean)

## median age: 
chicago_data['median_age'] = chicago_data['median_age'].replace([-666666666.0],np.nan)
mean = chicago_data['median_age'].mean()
chicago_data.loc[:,'median_age'] = chicago_data.loc[:,'median_age'].fillna(mean)

## commuter_total:
chicago_data['commuter_PuT'] = chicago_data['commuter_PuT'].replace(np.nan, chicago_data['commuter_PuT'].mean())

# percent of people who have college degree or above
chicago_data["pct_college_degree"] = (chicago_data["bachelor_degree"] + chicago_data["master_degree"] + chicago_data["doctoral_degree"]) / chicago_data["educational_attainment_pop25"]
# replace nan with 0 since total can be 0
chicago_data["pct_college_degree"] = chicago_data["pct_college_degree"].replace(np.nan, 0)

# percent of racial minorities
chicago_data["pct_racial_minorities"] = 1 - chicago_data["white"] / chicago_data["total_pop"]
# replace nan with 0 since total can be 0
chicago_data["pct_racial_minorities"] = chicago_data["pct_racial_minorities"].replace(np.nan, 0)

# percent of property owner
chicago_data["pct_property_owner"] = chicago_data['tenure_owner'] / chicago_data['tenure_total']
# replace nan with 0 since total can be 0
chicago_data["pct_property_owner"] = chicago_data["pct_property_owner"].replace(np.nan, 0)

# percent of commuters using public transportation
chicago_data["pct_commuter_PuT"] = chicago_data['commuter_PuT'] / chicago_data['commuter_total']
# replace nan with 0 since total can be 0
chicago_data["pct_commuter_PuT"] = chicago_data["pct_commuter_PuT"].replace(np.nan, 0)


# load the city limit of Chicago
chicago_limit = gpd.read_file('data/Boundaries - City.geojson').set_crs("epsg:4326", allow_override=True,)
chicago_limit = chicago_limit.squeeze().geometry


# get the number of residential building in each block group from OSM data
residential = ox.geometries_from_polygon(
    chicago_limit, tags={"building": "residential"}
)


# get the number of office building in each block group from OSM data
office = ox.geometries_from_polygon(
    chicago_limit, tags={"building": "office"}
)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


def get_xy_from_geometry(df):
    # NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries 
    x = df.geometry.centroid.x
    y = df.geometry.centroid.y
    
    return np.column_stack((x, y)) # stack as columns


# STEP 1: x/y coordinates of restaurants (in EPGS=3857)
resXY = get_xy_from_geometry(residential.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(resXY)

# STEP 4: Get distances for stations to neighbors
resDists, resIndices = nbrs.kneighbors(get_xy_from_geometry(chicago_data))

# STEP 5: add back to the original dataset
chicago_data['DistRes'] = resDists.mean(axis=1)


# STEP 1: x/y coordinates of restaurants (in EPGS=3857)
resXY = get_xy_from_geometry(office.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(resXY)

# STEP 4: Get distances for stations to neighbors
officeDists, officeIndices = nbrs.kneighbors(get_xy_from_geometry(chicago_data))

# STEP 5: add back to the original dataset
chicago_data['DistOffice'] = officeDists.mean(axis=1)


# import chicago CBD boundary
chicago_cbd_limit = gpd.read_file('data/Boundaries - City.geojson').set_crs("epsg:4326", allow_override=True,)

# get the centroid of the cbd polygon to represent cbd
chicago_cbd = chicago_cbd_limit.to_crs("epsg:3857").centroid

chicago_cbd = Point(chicago_cbd.x, chicago_cbd.y)

# get distance to cbd
chicago_data['dist_to_cbd'] = chicago_data.geometry.distance(chicago_cbd)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


# chicago_data_filter: dataframe with only useful variables without geometry
# chicago_data_filter_GEOID: dataframe with only useful variables and GEOID without geometry
# filter the df with needed numeric data 
chicago_data_filter_GEOID = chicago_data.drop(columns=['NAME', 'state', 'county','block group','tract','geometry', 'GEO_ID', 
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])
chicago_data_filter = chicago_data.drop(columns=['NAME', 'state', 'county','block group','tract','geometry', 'GEO_ID', 'GEOID',
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])


# chicago_data_GEOID: geodataframe with GEOID and useful variables with geometry
# chicago_data: geodataframe with only useful variables with geometry
chicago_data_GEOID = chicago_data.drop(columns=['NAME', 'state', 'county','block group','tract', 'GEO_ID', 
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])
chicago_data = chicago_data.drop(columns=['NAME', 'state', 'county','block group','tract', 'GEO_ID', 'GEOID',
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])


#chicago_data.to_file("data/chicago_data.geojson", driver='GeoJSON')
#chicago_data_GEOID.to_file("data/chicago_data_GEOID.geojson", driver='GeoJSON')
#chicago_data_filter.to_csv("data/chicago_data_filter.csv", index=False)
#chicago_data_filter_GEOID.to_csv("data/chicago_data_filter_GEOID.csv", index=False)


# get the bus stop imformation from SEPTA GTFS file
philly_stops = pd.read_csv("data/stops.txt")


# create a geodatafram
philly_stops = gpd.GeoDataFrame(philly_stops, geometry=gpd.points_from_xy(philly_stops.stop_lon, philly_stops.stop_lat)).set_crs("epsg:4326")
# change crs to epsg:3857
philly_stops = philly_stops.to_crs("epsg:3857")


# get block group gemetry from census data
# set geo server
acs.set_mapservice("tigerWMS_ACS2019")

# Use SQL to return geometries only for Cook County, IL
where_clause = f"STATE = {'42'} AND COUNTY = {'101'}"

# Query for block groups
philly_bg = acs.mapservice.layers[10].query(where=where_clause)

# we only need the geometry and GEOID for the block groups
philly_bg = philly_bg[['GEOID', 'geometry']]

# listing socioeconomics variables from 2019 5-year acs 
variables = [
    "NAME",
    "B01003_001E", # total pop
    "B25064_001E", # median rent
    "B19113_001E", # median income
    "B01002_001E", # median age
    "B15003_001E", # educational attainment for the pop 25 and over 
    "B15003_022E", # total bachelor's degree
    "B15003_023E", # master's degree                    
    "B15003_025E", # doctoral
   
    "B02001_002E", # white
    
    "B25003_001E", # tenure: total
    "B25003_002E", # tenure: owner
    
    "B08134_001E", # total commuters
    "B08134_061E", # commute by public transp
    
    "GEO_ID"
]

# query from the cenpy api
philly_data = acs.query(
    cols=variables,
    geo_unit="block group:*",
    geo_filter={"state": "42", 
                "county": "101", 
                "tract": "*"},
)

# rename the columns for easier references
dict = {"NAME": "NAME",
    "B01003_001E": "total_pop",
    "B25064_001E": "median_rent",
    "B19113_001E": "median_income",
    "B01002_001E": "median_age",
    "B15003_001E": "educational_attainment_pop25" ,
    "B15003_022E": "bachelor_degree",
    "B15003_023E": "master_degree"  ,                  
    "B15003_025E": "doctoral_degree",
    "B02001_002E": "white",
    "B25003_001E": "tenure_total",
    "B25003_002E": "tenure_owner",
    "B08134_001E": "commuter_total",
    "B08134_061E": "commuter_PuT",
    "GEO_ID": "GEO_ID"}

philly_data.rename(columns=dict, inplace=True)

# getting the GEOID for spatial join
philly_data["GEOID"] = [x[9:] for x in philly_data['GEO_ID']]

# merge the philly block group gemoetry with socioeconomic variables by GEOID
philly_data = philly_bg.merge(philly_data, on = "GEOID") 

# converting strings to numeric input
for variable in philly_data.columns:
    
    # Convert all variables
    if variable != "NAME" and variable != "GEO_ID" and variable != "geometry" and variable != "GEOID":
        philly_data[variable] = philly_data[variable].astype(float)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\pandas\core\dtypes\cast.py:1990: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  result[:] = values
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\pyproj\crs\crs.py:68: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))


# data cleanning: replace error values with average values. 
## median rent: replace -666666666.0 in median rent with average. median rent of all tract.
philly_data['median_rent'] = philly_data['median_rent'].replace([-666666666.0],np.nan)
mean = philly_data['median_rent'].mean()
philly_data.loc[:,'median_rent'] = philly_data.loc[:,'median_rent'].fillna(mean)

## median income: replace -666666666.0 with mean 
philly_data['median_income'] = philly_data['median_income'].replace([-666666666.0],np.nan)
mean = philly_data['median_income'].mean()
philly_data.loc[:,'median_income'] = philly_data.loc[:,'median_income'].fillna(mean)

## median age: 
philly_data['median_age'] = philly_data['median_age'].replace([-666666666.0],np.nan)
mean = philly_data['median_age'].mean()
philly_data.loc[:,'median_age'] = philly_data.loc[:,'median_age'].fillna(mean)

## commuter_total:
philly_data['commuter_PuT'] = philly_data['commuter_PuT'].replace(np.nan, philly_data['commuter_PuT'].mean())


# data cleanning: replace error values with average values. 
## median rent: replace -666666666.0 in median rent with average. median rent of all tract.
philly_data['median_rent'] = philly_data['median_rent'].replace([-666666666.0],np.nan)
mean = philly_data['median_rent'].mean()
philly_data.loc[:,'median_rent'] = philly_data.loc[:,'median_rent'].fillna(mean)

## median income: replace -666666666.0 with mean 
philly_data['median_income'] = philly_data['median_income'].replace([-666666666.0],np.nan)
mean = philly_data['median_income'].mean()
philly_data.loc[:,'median_income'] = philly_data.loc[:,'median_income'].fillna(mean)

## median age: 
philly_data['median_age'] = philly_data['median_age'].replace([-666666666.0],np.nan)
mean = philly_data['median_age'].mean()
philly_data.loc[:,'median_age'] = philly_data.loc[:,'median_age'].fillna(mean)

## commuter_total:
philly_data['commuter_PuT'] = philly_data['commuter_PuT'].replace(np.nan, philly_data['commuter_PuT'].mean())


# percent of people who have college degree or above
philly_data["pct_college_degree"] = (philly_data["bachelor_degree"] + philly_data["master_degree"] + philly_data["doctoral_degree"]) / philly_data["educational_attainment_pop25"]
# replace nan with 0 since total can be 0
philly_data["pct_college_degree"] = philly_data["pct_college_degree"].replace(np.nan, 0)


# percent of racial minorities
philly_data["pct_racial_minorities"] = 1 - philly_data["white"] / philly_data["total_pop"]
# replace nan with 0 since total can be 0
philly_data["pct_racial_minorities"] = philly_data["pct_racial_minorities"].replace(np.nan, 0)


# percent of property owner
philly_data["pct_property_owner"] = philly_data['tenure_owner'] / philly_data['tenure_total']
# replace nan with 0 since total can be 0
philly_data["pct_property_owner"] = philly_data["pct_property_owner"].replace(np.nan, 0)


# percent of commuters using public transportation
philly_data["pct_commuter_PuT"] = philly_data['commuter_PuT'] / philly_data['commuter_total']
# replace nan with 0 since total can be 0
philly_data["pct_commuter_PuT"] = philly_data["pct_commuter_PuT"].replace(np.nan, 0)


# filter the df with needed numeric data 
philly_data_filter = philly_data.drop(columns=['NAME', 'state', 'county','block group','tract','geometry', 'GEO_ID', 
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])


# Download the Philadelphia city limits
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
philly_limit = esri2gpd.get(url).to_crs(epsg=3857)

# Get the geometry from the city limits
philly_limit_outline = philly_limit.to_crs(epsg=4326).squeeze().geometry


# get the number of residential building in each block group from OSM data
residential = ox.geometries_from_polygon(
    philly_limit_outline, tags={"building": "residential"}
)


# get the number of office building in each block group from OSM data
office = ox.geometries_from_polygon(
    philly_limit_outline, tags={"building": "office"}
)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


def get_xy_from_geometry(df):
    # NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries 
    x = df.geometry.centroid.x
    y = df.geometry.centroid.y
    
    return np.column_stack((x, y)) # stack as columns


# STEP 1: x/y coordinates of restaurants (in EPGS=3857)
resXY = get_xy_from_geometry(residential.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(resXY)

# STEP 4: Get distances for stations to neighbors
resDists, resIndices = nbrs.kneighbors(get_xy_from_geometry(philly_data))

# STEP 5: add back to the original dataset
philly_data['DistRes'] = resDists.mean(axis=1)


# STEP 1: x/y coordinates of restaurants (in EPGS=3857)
resXY = get_xy_from_geometry(office.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(resXY)

# STEP 4: Get distances for stations to neighbors
officeDists, officeIndices = nbrs.kneighbors(get_xy_from_geometry(philly_data))

# STEP 5: add back to the original dataset
philly_data['DistOffice'] = officeDists.mean(axis=1)


# philly cbd
url = "http://data.phl.opendata.arcgis.com/datasets/95366b115d93443eae4cc6f498cb3ca3_0.geojson"
philly_cbd = gpd.read_file(url).to_crs(epsg=3857)

philly_cbd_limit = philly_cbd.squeeze().geometry

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


# get the centroid of the cbd polygon to represent cbd
philly_cbd = philly_cbd_limit.centroid


philly_cbd = Point(philly_cbd.x, philly_cbd.y)


# get distance to cbd
philly_data['dist_to_cbd'] = philly_data.geometry.distance(philly_cbd)


# filter the df with needed numeric data 
philly_data_filter = philly_data.drop(columns=['NAME', 'state', 'county','block group','tract','geometry', 'GEO_ID', 
                                                'bachelor_degree', 'master_degree', 'doctoral_degree', 'educational_attainment_pop25', 'white', 'tenure_owner', 'tenure_total', 'commuter_PuT', 'commuter_total'])


# save the two dfs seperately
#philly_data.to_file("data/philly_data.geojson", driver='GeoJSON')
#philly_data_filter.to_csv('data/philly_data_filter.csv', index=False)


columns = list(chicago_data_filter.columns[:])

# Load panel and enable interactive features
pn.extension()

feature = pn.widgets.Select(name='feature', options=columns)
title = pn.Pane("<h1>Socioeconomic Features Explorer</h1>", width=1000)

@pn.depends(feature)
def plot(feature):
    return chicago_data_filter.hvplot.hist(
        feature,
        subplots=True, padding=0.1,
        responsive=True, min_height=500, size=100,
        c="#9ecae1"
    )


pn.Column(
    title,
    pn.Row(feature, plot),
    width_policy='max'
).servable()


m = folium.Map(
    location=[41.89051776994878, -87.62846577303873], tiles="Cartodb Positron", zoom_start=12
)

# convert to coordinates array
# Syntax is (lat, lng)
coordinates = bus_ridership_gdf[["latitude", "longitude"]].values

# add heat map
HeatMap(coordinates).add_to(m)

m


## K-means clustering
x = pn.widgets.Select(name='x', options=columns)
y = pn.widgets.Select(name='y', options=['boardings', 'alightings'])
n_clusters = pn.widgets.IntSlider(name='n_clusters', start=1, end=8, value=3)
title = pn.Pane("<h1>K-Means Clustering</h1>", width=1000)
@pn.depends(x.param.value, y.param.value, n_clusters.param.value)
def get_clusters(x, y, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    est = kmeans.fit(chicago_data_filter.iloc[:, :-1].values)
    chicago_data_filter['labels'] = est.labels_.astype('str')
    centers = chicago_data_filter.groupby('labels').mean()
    return (chicago_data_filter.sort_values('labels').hvplot.scatter(x, y, c='labels', cmap="Set3", size=100, height=500) *
            centers.hvplot.scatter(x, y, marker='x', color='black', size=400,
                                   padding=0.1, line_width=5))

pn.Column(
    '# K-Means Clustering',
    pn.Row(pn.WidgetBox(x, y, n_clusters), get_clusters)
).servable()


scaler = StandardScaler()
scaled_chicago_data = scaler.fit_transform(chicago_data_filter[['total_pop','median_income','median_rent',
                                                                'median_age','pct_college_degree','pct_racial_minorities',
                                                                'pct_property_owner','pct_commuter_PuT','DistRes','DistOffice','boardings']])

n_clusters = list(range(2, 8))

# Run kmeans for each value of k
inertias = []
for k in n_clusters:
    
    # Initialize and run
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(scaled_chicago_data)
    
    # Save the "inertia"
    inertias.append(kmeans.inertia_)
    
# Plot it!
plt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3)
plt.title('Elbow Plot for Chicago socioeconomic variabels and boardings')

Text(0.5, 1.0, 'Elbow Plot for Chicago socioeconomic variabels and boardings')


# Initialize the knee algorithm
kn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')

# Print out the knee 
print(kn.knee)

3


var = chicago_data_filter_GEOID.columns

# Perform the k-means fit with n from 2 to 9
n_list = [i for i in range(2, 9)]
for i in n_list:
    kmeans = KMeans(n_clusters=i, random_state=42)

    scaler = StandardScaler()
    scaled_chicago_data = scaler.fit_transform(chicago_data_filter_GEOID[var[1:]])

    kmeans.fit(scaled_chicago_data)

    label = f"label{i}"
    chicago_data_filter_GEOID[label] = kmeans.labels_
    
# merge to have geometry
chicago_cluster = chicago_data_GEOID[['GEOID', 'geometry']].merge(chicago_data_filter_GEOID, left_on='GEOID', right_on='GEOID', how='left')


import param as pm
var_panel = list(var[1:])
class cluster(pm.Parameterized):
    n_clusters = pm.Integer(default=2, bounds=(2, 8))
    feature = pm.Selector(default='total_pop', objects=var_panel)
    @pm.depends("n_clusters")
    def choropleth(self):
        which_label = f"label{self.n_clusters}"
        data = chicago_cluster.sort_values(by=which_label)
        return chicago_cluster.hvplot.polygons(
            c=which_label,
            cmap="Set3",
            hover_cols=[which_label],
            frame_width=600,
            frame_height=550,
            geo=True, 
            crs=3857
        )
    @pm.depends("n_clusters", "feature")
    def look_at_clusters(self):
        # calculate average feature per cluster
        which_label = f"label{self.n_clusters}"
        data = chicago_cluster.sort_values(by=which_label)
        cluster_info = chicago_data_filter_GEOID.groupby(which_label, as_index=False).mean().sort_values(by=which_label)
        return cluster_info.hvplot.bar(y=self.feature, x=which_label, c=which_label, width=300, cmap="Set3")
    
    
app = cluster(name="")
title = pn.Pane("<h1>Cluster Analysis of Chicago Bus Ridership</h1>")
panel = pn.Row(
    pn.Column(title, pn.Param(app.param, width=300), app.look_at_clusters),
    pn.Column(app.choropleth),
)

panel.servable()

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


## check correlation
# The features
plt.figure(figsize=(10,9))
feature_cols = [col for col in chicago_data_filter]
sns.heatmap(
    chicago_data_filter[feature_cols].corr(), 
    cmap="coolwarm", 
    annot=True, 
    vmin=-1, 
    vmax=1,
)
plt.title("Correlation Among Selected Predictors", fontsize = 20) # title with fontsize 20

Text(0.5, 1.0, 'Correlation Among Selected Predictors')


od_df = dd.read_csv("data/il_od_main_JT00_2019.csv")
od_df


# get the length of the od df
len(od_df)

5254115


od_df['w_geocode'] = od_df['w_geocode'].astype(str)
od_df['h_geocode'] = od_df['h_geocode'].astype(str)
od_df.head()


# get the GEOID of block groups
od_df['w_geocode'] = od_df['w_geocode'].str[:12]
od_df['h_geocode'] = od_df['h_geocode'].str[:12]
od_df.head()


chicago_data_GEOID = gpd.read_file('data/chicago_data_GEOID.geojson').set_crs("epsg:3857", allow_override=True,)
chicago_data_GEOID.crs

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'Polygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out

<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World between 85.06°S and 85.06°N.
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich


chicago_bg_GEOID = list(chicago_data_GEOID['GEOID'])
od_df = od_df[od_df['w_geocode'].isin(chicago_bg_GEOID)]
od_df = od_df[od_df['h_geocode'].isin(chicago_bg_GEOID)]


# group by w_geocode
demand = od_df.groupby(['w_geocode','h_geocode'])['S000'].sum().compute()


demand = demand.reset_index()
demand = demand.sort_values(["S000"], ascending = False)
demand


w_sum = demand.groupby(["w_geocode"])["S000"].sum().reset_index()
w_sum


h_sum = demand.groupby(["h_geocode"])["S000"].sum().reset_index()
h_sum


# highest employment number for the destination 
w_GEO = w_sum[['w_geocode', 'S000']].merge(chicago_data_GEOID[["GEOID", "geometry"]], left_on='w_geocode', right_on='GEOID', how='left')
w_GEO = gpd.GeoDataFrame(w_GEO).set_crs("epsg:3857")


# highest employment number for the origin
h_GEO = h_sum[['h_geocode', 'S000']].merge(chicago_data_GEOID[["GEOID", "geometry"]], left_on='h_geocode', right_on='GEOID', how='left')
h_GEO = gpd.GeoDataFrame(h_GEO).set_crs("epsg:3857")


%%opts WMTS [width=800, height=800, xaxis=None, yaxis=None]

# number of job in each block group based on OD data (destination)
choro = w_GEO.hvplot(c='S000', 
                     width=800,
                     height=600,
                     alpha=0.8,
                     geo=True, 
                     cmap='viridis',
                     crs = 3857,
                     hover_cols=['w_geocode'],
                     title = 'Number of Destinations in Each Block Group')
gvts.ESRI * choro

WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'width=800'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'width=800'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'height=800'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'height=800'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'xaxis=None'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'xaxis=None'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'yaxis=None'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'yaxis=None'
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


%%opts WMTS [width=800, height=800, xaxis=None, yaxis=None]

# number of job to go to, from each block group (origin)
choro = h_GEO.hvplot(c='S000', 
                     width=800, 
                     height=600,
                     alpha=0.8,
                     geo=True, 
                     cmap="viridis",
                     crs = 3857,
                     hover_cols=['h_geocode'],
                     title = 'Number of Origins in Each Block Group')
gvts.ESRI * choro

WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'width=800'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'width=800'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'height=800'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'height=800'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'xaxis=None'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'xaxis=None'
WARNING:param.ParserWarning: Use method 'warning' via param namespace 
WARNING:param.ParserWarning:Use method 'warning' via param namespace 
WARNING:param.Warning: Ignoring keyword pair that fails to evaluate: 'yaxis=None'
WARNING:param.Warning:Ignoring keyword pair that fails to evaluate: 'yaxis=None'
C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


# get the employment destination that has the top 10 employment number
dest_bg = chicago_data_GEOID[chicago_data_GEOID['GEOID'].isin(list(demand["w_geocode"])[:10])]


# get the employment origin that has the top 10 employment number
origin_bg = chicago_data_GEOID[chicago_data_GEOID['GEOID'].isin(list(demand["h_geocode"])[:10])]


# plot the top 10 employment OD pair origins and destinations
# red is the desination block group, blue are the origin block groups
fig, ax = plt.subplots(figsize=(12, 12))
chicago_data_GEOID.plot(ax=ax, facecolor='none', edgecolor='black', linewidth = 0.1)
origin_bg.plot(ax=ax, facecolor='none', edgecolor='blue')
dest_bg.plot(ax=ax, facecolor='none', edgecolor='red')
ax.set_axis_off()
ax.set_aspect("equal")
plt.show()


chicago_data_filter = chicago_data_filter.drop(columns = ["labels", "alightings"])

# Split the data 60/40
train_set, test_set = train_test_split(chicago_data_filter, test_size = 0.4, random_state = 1)

# create the target variables
y_train = train_set["boardings"].values
y_test = test_set["boardings"].values

# The feature variables
feature_cols = [col for col in chicago_data_filter.columns if col not in ["boardings"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values


# linear regression
# create a pipe to scale the data and perform linear regression
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())

# fit the pipeline
print("Linear regression")
linear_pipe.fit(X_train, y_train)

# get the score on training data
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")

# get the score on test data
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")

Linear regression
Training Score = 0.13458835641844735
Test Score = 0.15039660135311783


# random forest regression
# create a pipe to scale the data and fit a random forest
forest_pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1))

# find the best hyper parameter for the random forest
model_step = "randomforestregressor"
param_grid = {
    f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
    f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

# Create the grid and use 3-fold CV
grid = GridSearchCV(forest_pipe, param_grid, cv=3)

# Run the search
grid.fit(X_train, y_train)

# get the best parameters
grid.best_params_

{'randomforestregressor__max_depth': 5,
 'randomforestregressor__n_estimators': 100}


# fit the model based on the hyperparameter selected
forest_pipe = make_pipeline(
    StandardScaler(), RandomForestRegressor(max_depth = 5, n_estimators = 100, random_state=1)
)

# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)

# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")

# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")

Random forest
Training Score = 0.40129303419226126
Test Score = 0.18813024786614485


# get the prediction from the best random forest model
predictions = grid.best_estimator_.predict(X_test)


# add the prediction result to the testset dataframe
# boardings should be integers
test_set['predictions'] = predictions.round(0)

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\geodataframe.py:1322: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


# get the absolute percent err
test_set["APE"] = abs(test_set['predictions'] - test_set['boardings']).round(0)/(test_set['boardings'] + 0.1)


# get the geometry of test set
test_set['geometry'] = chicago_data[chicago_data.index.isin(test_set.index)]['geometry']

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\geodataframe.py:1322: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


# set crs to test_set
test_set = gpd.GeoDataFrame(test_set).set_crs('epsg:3857')
test_set


test_set2= test_set.to_crs(epsg=4326)
test_set2.hvplot(c='APE', 
                     frame_width=600, 
                     frame_height=500, 
                     geo=True, 
                     cmap='gnuplot2', 
                     hover_cols=['boardings','predictions','APE'],
                     title='Average Percent Error at Block Group Level')

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


philly_data_filter = philly_data_filter.drop(columns = ["GEOID"])


philly_data_filter.head()


# get the prediction from the best random forest model
philly_predictions = grid.best_estimator_.predict(philly_data_filter)


# add the prediction result to the philly df
# boardings should be integers
philly_data['predictions'] = philly_predictions.round(0)


# map the predictions
# convert dataset to crs: 4326
philly_data_plot= philly_data.to_crs(epsg=4326)

philly_data_plot.hvplot(c='predictions', 
                     frame_width=600, 
                     frame_height=500, 
                     geo=True, 
                     cmap='viridis', 
                     hover_cols=['GEOID','predictions'],
                     title='Predicted Bus Ridership in Philadelphia')

C:\Users\m1861\miniconda3\envs\musa-550-fall-2021\lib\site-packages\geopandas\_vectorized.py:142: DeprecationWarning: An exception was ignored while fetching the attribute `__array_interface__` from an object of type 'MultiPolygon'.  With the exception of `AttributeError` NumPy will always raise this exception in the future.  Raise this deprecation warning to see the original exception. (Warning added NumPy 1.21)
  aout[:] = out


philly_data.head(3)


chicago_data_filter.head(3)


def init_map(lat=39.99, long=-75.13, zoom_start=11):
    return fm.Map(location=[lat,long], zoom_start=zoom_start)


data_test = philly_data.to_crs(epsg=4326)
bus_ridership = philly_stops.to_crs(epsg=4326)


## choropleth for predictions
def choose_x(x='predictions'):    
    # Minimum
    min_val = data_test[x].min()

    # Maximum
    max_val = data_test[x].max()

    # Calculate a normalized column
    normalized = (data_test[x] - min_val) / (max_val - min_val)

    # Add to the dataframe
    data_test["normalized"] = normalized
 
# Use a red-purple colorbrewer color scheme
cmap = plt.get_cmap('PuBu')

def get_style(feature):
    # Get the data value from the feature
    value = feature['properties']['normalized']

    # Evaluate the color map
    # NOTE: value must between 0 and 1
    rgb_color = cmap(value) # this is an RGB tuple

    # Convert to hex string
    color = mcolors.rgb2hex(rgb_color)

    # Return the style dictionary
    return {'weight': 0.5, 'color': color, 'fillColor': color, "fillOpacity": 0.75}

def get_highlighted_style(feature):
    return {"weight": 3, "color": "black"}


columns = list(data_test.columns[3:])
columns.remove('GEO_ID')
columns.remove('state')
columns.remove('county')
columns.remove('tract')
columns.remove('block group')


class PanelFoliumMap(param.Parameterized):
    feature = param.Selector(default="predictions", objects=columns)

    def __init__(self, max_records=2500, name=""):
        super().__init__(name=name)

        TOR_COORDINATES = (
            pd.to_numeric(bus_ridership["stop_lat"]).mean(),
            pd.to_numeric(bus_ridership["stop_lon"]).mean(),
        )

        # subset to match subset of locations
        self.mc = MarkerCluster()

        # iterate through dataset to create clusters
        for row in bus_ridership[0:max_records].itertuples():
            self.mc.add_child(
                fm.Marker(location=[row.stop_lat, row.stop_lon], 
                          #popup=row.predictions
                         )
            )
            
    # Set default choropleth column
    choose_x()

    @param.depends("feature")
    def get_map(self):

        # Get new map
        self.map = init_map()
        
        # Add markercluster
        self.map.add_child(self.mc)
        
        # Set choropleth column
        choose_x(self.feature)


        # Add the GeoJson to the map
        folium.GeoJson(
            data_test[
                ["geometry", "normalized", self.feature]
            ].to_json(),  # IMPORTANT: make sure CRS is lat/lng (EPSG=4326)
            style_function=get_style,
            tooltip=folium.GeoJsonTooltip([self.feature]),
        ).add_to(self.map)

        return pn.Pane(self.map, width=1000, height=500)


app = PanelFoliumMap(name="")


# The title
title = pn.Pane("<h1>Predicted Bus Ridership in Philadelphia by Block Group</h1>", width=1000)
# Layout the dashboard
panel = pn.Column(
    pn.Row(title),
    pn.Row(pn.Column(app.param.feature, app.get_map)),
)
panel.servable()

	w_geocode	h_geocode	S000	SA01	SA02	SA03	SE01	SE02	SE03	SI01	SI02	SI03	createdate
npartitions=6
	int64	int64	int64	int64	int64	int64	int64	int64	int64	int64	int64	int64	int64
	...	...	...	...	...	...	...	...	...	...	...	...	...
...	...	...	...	...	...	...	...	...	...	...	...	...	...
	...	...	...	...	...	...	...	...	...	...	...	...	...
	...	...	...	...	...	...	...	...	...	...	...	...	...

	w_geocode	h_geocode	S000	SA01	SA02	SA03	SE02	SE03	SI01	SI02	createdate
0	170010001001001	170010002021001	1	0	1	0	1	0	1	0	20211018
1	170010001001001	170010004003001	1	0	0	1	1	0	1	0	20211018
2	170010001001001	170010004003004	1	1	0	0	1	0	1	0	20211018
3	170010001001001	170010004003020	1	0	1	0	0	1	0	1	20211018
4	170010001001001	170010008002079	1	0	1	0	0	1	0	1	20211018

	w_geocode	h_geocode	S000	SA01	SA02	SA03	SE02	SE03	SI01	SI02	createdate
0	170010001001	170010002021	1	0	1	0	1	0	1	0	20211018
1	170010001001	170010004003	1	0	0	1	1	0	1	0	20211018
2	170010001001	170010004003	1	1	0	0	1	0	1	0	20211018
3	170010001001	170010004003	1	0	1	0	0	1	0	1	20211018
4	170010001001	170010008002	1	0	1	0	0	1	0	1	20211018

	w_geocode	h_geocode	S000
303610	170318391001	170318391001	841
301794	170318391001	170310818001	784
302386	170318391001	170312801001	716
302475	170318391001	170313301003	647
303611	170318391001	170318391002	541
...	...	...	...
130306	170312413001	170312515004	1
130308	170312413001	170312519004	1
130309	170312413001	170312520004	1
130310	170312413001	170312602001	1
340899	170319801001	170318439001	1

	w_geocode	S000
0	170310101001	4
1	170310101002	146
2	170310101003	84
3	170310102011	2
4	170310102012	30
...	...	...
2246	170318439002	14
2247	170318439003	24
2248	170318439005	6
2249	170319800001	13707
2250	170319801001	2832

Bus Ridership Project¶

Introduction¶

Preparation¶

Library import¶

Data preparation - Chicago¶

Ridership data¶

Socioeconomic data¶

Data from Census¶

Stop-level Ridership data to Block-group level data¶

Data cleaning¶

Feature Engineering¶

Save chicago_data¶

Data preparation - Philadelphia¶

Bus stops data¶

Socioeconomic data¶

Data from census¶

Data Cleaning¶

Feature engineering¶

Save philadelphia data¶

Data Exploration¶

Distribution¶

K-means clustering¶

Example : Number of clusters to try out for scaled Chicago dataset¶

Clustering Analysis on the Map¶

Correlation¶

OD Employment Statistics Analysis¶

Regression and Prediction¶

Training and testing on Chicago¶

Linear Regression¶

Random Forest¶

Model Performance - Test on Chicago¶

Predict on Philadelphia¶

Use Case Scenario¶

Conclusion¶

	total_pop	median_rent	median_income	median_age	boardings	pct_college_degree	pct_racial_minorities	pct_property_owner	pct_commuter_PuT	DistRes	DistOffice	dist_to_cbd	predictions	APE	geometry
579	1338.0	909.00000	53947.0	48.1	117.0	0.258065	0.990284	0.000000	0.252061	6150.318691	4158.909659	20261.820550	94.0	0.196413	POLYGON ((-9752295.497 5117359.253, -9752269.0...
1355	2519.0	953.00000	73719.0	39.2	41.0	0.376339	0.563716	0.321966	0.369504	2562.466929	5162.418054	26855.881041	62.0	0.510949	POLYGON ((-9759527.813 5163882.017, -9759514.6...
47	595.0	1145.33871	95709.0	59.4	69.0	0.179641	0.981513	0.910714	0.000000	3005.443999	7240.532573	22819.920661	25.0	0.636758	POLYGON ((-9756292.423 5114464.658, -9756265.3...
1118	1362.0	881.00000	48924.0	26.1	229.0	0.042614	0.444934	0.558442	0.320373	4303.665486	4860.130876	3786.388539	77.0	0.663466	POLYGON ((-9765441.104 5137126.842, -9765413.8...
614	953.0	1860.00000	250001.0	32.1	66.0	0.726008	0.082896	0.533613	0.445759	924.054432	2588.497052	11380.338084	101.0	0.529501	POLYGON ((-9758735.663 5148549.756, -9758724.4...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
399	1028.0	488.00000	82232.0	37.9	62.0	0.431849	0.902724	0.568035	0.113253	481.614644	3854.924978	9165.207907	78.0	0.257649	POLYGON ((-9752350.934 5134521.737, -9752345.2...
51	1229.0	1145.33871	49659.0	29.1	18.0	0.141243	0.423108	0.914474	0.221782	2237.732038	4869.450346	1250.996832	58.0	2.209945	POLYGON ((-9763236.533 5135433.600, -9763230.1...
771	1158.0	1008.00000	54821.0	40.0	51.0	0.061538	0.917962	0.382514	0.167391	4256.686971	4498.601899	6326.311177	71.0	0.391389	POLYGON ((-9766356.484 5141671.975, -9766329.7...
640	1679.0	1432.00000	115066.0	38.6	0.0	0.202064	0.184634	0.912109	0.171705	3274.690705	8409.151415	11899.806461	27.0	270.000000	POLYGON ((-9771419.406 5129935.357, -9771392.2...
1344	791.0	987.00000	53707.0	46.4	28.0	0.195082	0.900126	0.520468	0.294798	5618.142914	4795.509443	5187.849994	75.0	1.672598	POLYGON ((-9766570.218 5139175.031, -9766502.2...

	total_pop	median_rent	median_income	median_age	pct_college_degree	pct_racial_minorities	pct_property_owner	pct_commuter_PuT	DistRes	DistOffice	dist_to_cbd
0	1300.0	1183.0	117708.0	45.2	0.543067	0.333846	0.740260	0.192708	3196.524945	11617.133832	11787.190034
1	1251.0	1611.0	133750.0	54.3	0.698367	0.198241	0.474684	0.197239	6928.148472	16058.116135	17684.533341
2	961.0	795.0	42292.0	26.6	0.059259	0.378772	0.455497	0.197640	7210.987890	12194.578556	16439.070722
3	646.0	1297.0	45179.0	51.3	0.259857	0.792570	0.689320	0.498433	2093.030170	2374.516679	2946.258058
4	416.0	306.0	60208.0	48.2	0.117816	0.641827	0.336683	0.723270	4844.692909	13360.405011	19738.013889

	GEOID	geometry	NAME	total_pop	median_rent	median_income	median_age	educational_attainment_pop25	bachelor_degree	master_degree	...	tract	block group	pct_college_degree	pct_racial_minorities	pct_property_owner	pct_commuter_PuT	DistRes	DistOffice	dist_to_cbd	predictions
0	421010235001	POLYGON ((-8372329.774 4872783.269, -8372320.2...	Block Group 1, Census Tract 235, Philadelphia ...	1300.0	1183.0	117708.0	45.2	952.0	310.0	167.0	...	23500.0	1.0	0.543067	0.333846	0.740260	0.192708	3196.524945	11617.133832	11787.190034	33.0
1	421010385002	POLYGON ((-8374266.956 4878192.096, -8374260.0...	Block Group 2, Census Tract 385, Philadelphia ...	1251.0	1611.0	133750.0	54.3	1041.0	406.0	264.0	...	38500.0	2.0	0.698367	0.198241	0.474684	0.197239	6928.148472	16058.116135	17684.533341	44.0
2	421010325004	POLYGON ((-8354346.890 4869327.442, -8354151.0...	Block Group 4, Census Tract 325, Philadelphia ...	961.0	795.0	42292.0	26.6	540.0	32.0	0.0	...	32500.0	4.0	0.059259	0.378772	0.455497	0.197640	7210.987890	12194.578556	16439.070722	41.0

	total_pop	median_rent	median_income	median_age	boardings	pct_college_degree	pct_racial_minorities	pct_property_owner	pct_commuter_PuT	DistRes	DistOffice	dist_to_cbd
0	837.0	1145.33871	145652.0	44.0	2.0	0.305772	0.313023	0.911392	0.030242	7980.815698	7057.603946	16046.632311
1	1053.0	1145.33871	104167.0	38.9	14.0	0.376884	0.308642	0.890374	0.051111	2936.027534	8176.823160	11561.888968
2	885.0	1145.33871	93750.0	36.7	27.0	0.321976	0.367232	0.909091	0.009828	2432.606717	7862.064990	11117.370191

	h_geocode	S000
0	170310101001	166
1	170310101002	561
2	170310101003	578
3	170310102011	395
4	170310102012	1074
...	...	...
2315	170318439003	212
2316	170318439004	151
2317	170318439005	231
2318	170319800001	96
2319	170319801001	61