import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import altair as alt  

# Read the crash records from the CSV export into a DataFrame.
CRASH_CSV = "crash_data_collision_crash_2007_2017.csv"
dat = pd.read_csv(CRASH_CSV)

# Bare expression: shows the available column names when run as a notebook cell.
dat.columns

Index(['objectid_1', 'objectid', 'crn', 'district', 'county', 'municipality',
       'crash_year', 'crash_month', 'day_of_week', 'time_of_day',
       'hour_of_day', 'illumination', 'weather', 'road_condition',
       'collision_type', 'relation_to_road', 'work_zone_type', 'work_zone_loc',
       'work_zone_ind', 'intersect_type', 'tcd_type', 'location_type',
       'urban_rural', 'fatal_count', 'injury_count', 'person_count',
       'total_units', 'sch_bus_ind', 'sch_zone_ind', 'latitude', 'longitude',
       'arrival_tm', 'cons_zone_spd_lim', 'dispatch_tm', 'est_hrs_closed',
       'lane_closed', 'ln_close_dir', 'ntfy_hiwy_maint', 'spec_juris_cd',
       'tcd_func_cd', 'tfc_detour_ind', 'workers_pres', 'wz_close_detour',
       'wz_flagger', 'wz_law_offcr_ind', 'wz_ln_closure', 'wz_moving',
       'wz_other', 'wz_shlder_mdn', 'vehicle_count', 'automobile_count',
       'motorcycle_count', 'bus_count', 'small_truck_count',
       'heavy_truck_count', 'suv_count', 'van_count', 'bicycle_count',
       'maj_inj_count', 'mod_inj_count', 'min_inj_count', 'tot_inj_count',
       'unk_inj_deg_count', 'unk_inj_per_count', 'driver_count_16yr',
       'driver_count_17yr', 'driver_count_18yr', 'driver_count_19yr',
       'driver_count_20yr', 'driver_count_50_64yr', 'driver_count_65_74yr',
       'driver_count_75plus', 'unbelted_occ_count', 'unb_death_count',
       'unb_maj_inj_count', 'belted_death_count', 'belted_maj_inj_count',
       'mcycle_death_count', 'mcycle_maj_inj_count', 'bicycle_death_count',
       'bicycle_maj_inj_count', 'ped_count', 'ped_death_count',
       'ped_maj_inj_count', 'max_severity_level', 'comm_veh_count', 'dec_lat',
       'dec_long', 'rdwy_surf_type_cd', 'county_name'],
      dtype='object')


# Recode two coded columns into readable categories, then keep only the
# columns used by the charts below.
# Code meanings are taken from the dataset's metadata (data dictionary).
# `Series.replace` leaves any code that is missing from a mapping unchanged.

# intersect_type code -> is the crash associated with an intersection?
intersection_labels = {
    0: 'no',
    1: 'yes',
    2: 'yes',
    3: 'yes',
    4: 'no',
    5: 'yes',
    6: 'no',
    7: 'no',
    8: 'yes',
    9: 'no',
    10: 'no',
    11: 'no',
}
dat['intersection'] = dat['intersect_type'].replace(intersection_labels)

# day_of_week code -> weekday / weekend.
# NOTE(review): codes 6 and 7 are treated as the weekend here; confirm in the
# data dictionary which day code 1 stands for (if 1 is Sunday, this mapping
# labels Friday as weekend and Sunday as weekday).
day_type_labels = {
    1: "weekday",
    2: "weekday",
    3: "weekday",
    4: "weekday",
    5: "weekday",
    6: "weekend",
    7: "weekend",
}
dat['weekend'] = dat['day_of_week'].replace(day_type_labels)

# select only columns we are interested in
wanted = [
    'crash_year',
    'crash_month',
    'weekend',
    'intersection',
    "ped_count",
    "ped_death_count",
    "ped_maj_inj_count",
    "hour_of_day",
    "weather",
    "tcd_type",
]
df_intersection = dat[wanted]


# Count crashes per year, split by whether they happened at an intersection,
# and draw them as a stacked bar chart.
df_intersection_size = (
    df_intersection.groupby(['crash_year', 'intersection'])
    .size()
    .reset_index(name='counts')
)

# Pivot to one row per year so both bar series are guaranteed to line up with
# `year`. Building the lists by filtering the long table breaks (or silently
# misaligns) as soon as a year lacks one category or carries an unmapped code.
is_known = df_intersection_size['intersection'].isin(['yes', 'no'])
counts_by_year = (
    df_intersection_size.loc[is_known]
    .pivot(index='crash_year', columns='intersection', values='counts')
    .reindex(columns=['yes', 'no'])
    .fillna(0)
    .astype(int)
)
year = counts_by_year.index.to_numpy()
yes = counts_by_year['yes'].tolist()
no = counts_by_year['no'].tolist()


fig, ax = plt.subplots()

# "not at intersection" is stacked on top of "at intersection".
pa = ax.bar(year, yes, label='at intersection', color="#d8b365")
pb = ax.bar(year, no,
            bottom=yes, label='not at intersection', color="#5ab4ac")

# NOTE(review): the title says 2011-2017 while the input file name says
# 2007_2017 -- confirm which range the data actually covers.
plt.suptitle('Traffic Crashes 2011-2017 in Philadelphia, PA')
ax.set_title('at or not at intersection', fontsize=10)

ax.legend()

# Print each segment's count in the middle of the segment.
ax.bar_label(pa, label_type='center')
ax.bar_label(pb, label_type='center')

ax.set_ylabel('Count of Crashes')
ax.set_xlabel('Year')

Text(0.5, 0, 'Year')


# Crash counts per (hour, intersection flag, year, weekday/weekend) combination.
df_hour = (
    df_intersection.groupby(["hour_of_day", "intersection", "crash_year", "weekend"])
    .size()
    .reset_index(name='counts')
)
# Drop hour code 99 (NOTE(review): presumably "unknown hour" -- confirm
# against the data dictionary).
df_hour = df_hour.loc[df_hour["hour_of_day"] != 99]

# Apply the theme up front: `catplot` has no `theme` parameter, so passing
# `theme=sns.set_theme(...)` only forwarded a stray `theme=None` keyword.
sns.set_theme(style="darkgrid")
mypalette = sns.color_palette("Set2")

# One facet per weekday/weekend value, one line per intersection flag.
# With several years per hour, each point is seaborn's default estimate (the
# mean) over `crash_year`.
# NOTE(review): the facet title says "Total" -- confirm the intended statistic.
g = sns.catplot(
    data=df_hour,
    x="hour_of_day",
    y="counts",
    col="weekend",
    hue="intersection",
    kind="point",
    ci=None,
    palette=mypalette,
    markers=["^", "o"],
    linestyles=["-", "--"],
)
g.set_titles("Total Traffic Crashes 2011-2017 in Philadelphia PA \n across hour of the day \n on {col_name}")
g.set_axis_labels("Hour of day", "Count of crash")

# Mark x positions 7 (yellow) and 22 (blue) with dotted guides on every facet.
for facet_ax in g.axes.flat:
    facet_ax.axvline(x=7, color='y', linestyle=':')
    facet_ax.axvline(x=22, color='b', linestyle=':')


import altair as alt


# Pedestrian totals per (hour, intersection flag, year).
# Only the pedestrian count columns are summed: a blanket `.sum()` would also
# add up categorical codes (weather, tcd_type, crash_month) and try to "sum"
# the string-valued `weekend` column.
ped_columns = ["ped_count", "ped_death_count", "ped_maj_inj_count"]
df_hour = (
    df_intersection.groupby(["hour_of_day", "intersection", "crash_year"])[ped_columns]
    .sum()
    .reset_index()
)
# Drop hour code 99 (NOTE(review): presumably "unknown hour" -- confirm).
df_hour = df_hour.loc[df_hour["hour_of_day"] != 99]


# Interval (drag) selection shared by both charts.
brush = alt.selection_interval()

# Top chart: bars of pedestrians involved per year, divided by 365.
# Bars inside the brush are coloured by intersection flag; the rest are grey.
bars = (
    alt.Chart(title="Average daily pedestrian involved crash by year")
    .mark_bar()
    .transform_calculate(byDay='( datum.ped_count / 365)')
    .encode(
        x=alt.X('crash_year:O', scale=alt.Scale(zero=False), axis=alt.Axis(title='year')),
        y=alt.Y('sum(byDay):Q', scale=alt.Scale(zero=False), axis=alt.Axis(title='average number')),
        # The scale belongs to the colour channel, so it is attached to an
        # explicit alt.Color instead of being passed as a loose keyword to
        # alt.condition. "set2" is the scheme's name in the Vega scheme list.
        color=alt.condition(
            brush,
            alt.Color('intersection:O', scale=alt.Scale(scheme="set2")),
            alt.value('lightgray'),
        ),
        tooltip=["crash_year:O", "sum(byDay):Q", "intersection:O"],
    )
    .add_selection(brush)
    .properties(width=800)
)

# Bottom chart: the same measure by hour of day, restricted to the years
# currently selected with the brush in the top chart.
lines = (
    alt.Chart(title="Pedestrian involved by hour of the day")
    .mark_line()
    .transform_calculate(byDay='( datum.ped_count / 365)')
    .encode(
        x=alt.X('hour_of_day:O', axis=alt.Axis(title='hour of the day')),
        y=alt.Y('sum(byDay):Q', axis=alt.Axis(title='average number')),
        color='intersection:O',
    )
    .transform_filter(brush)
    .properties(width=800)
)

chart = alt.vconcat(bars, lines, data=df_hour)  # vertical stacking

chart


# Heatmap of crash counts by month and weather condition.

# Weather code -> label; data dictionary referenced dataset's metadata.
weather_labels = {
    0: 'Unknown',
    1: 'No adverse conditions',
    2: 'Rain',
    3: 'Sleet (hail)',
    4: 'Snow',
    5: 'Fog',
    6: 'Rain and fog',
    7: 'Sleet and fog',
    8: 'Other',
    9: 'Unknown',
}

# Label the codes BEFORE counting. Codes 0 and 9 share the label 'Unknown';
# relabelling after the group-by leaves two rows per (month, 'Unknown') that
# are drawn on top of each other, so the cell shows only one of the counts.
df_month = (
    df_intersection.assign(
        weather=df_intersection['weather'].replace(weather_labels)
    )
    .groupby(["crash_month", "weather"])
    .size()
    .reset_index(name='counts')
)


chart = (
    alt.Chart(df_month, title="Crash and weather")
    .mark_rect()
    .encode(
        x=alt.X("crash_month:O", axis=alt.Axis(title=None, ticks=False)),
        y=alt.Y("weather:O", axis=alt.Axis(title=None, ticks=False)),
        color=alt.Color("counts:Q", sort="ascending", scale=alt.Scale(scheme="oranges")),
        tooltip=["crash_month", "weather", "counts"],
    )
    .properties(width=700, height=500)
)

chart


# Share of crashes by traffic control device (TCD) type per year, shown in
# one panel per intersection flag.
df_tcd = (
    df_intersection.groupby(["crash_year", "tcd_type", "intersection"])
    .size()
    .reset_index(name='counts')
)

# TCD code -> label; data dictionary referenced dataset's metadata.
tcd_labels = {
    0: "Not applicable",
    1: "Flashing traffic signal",
    2: "Traffic signal",
    3: "Stop sign",
    4: "Yield sign",
    5: "Active RR crossing controls",
    6: "Passive RR crossing controls",
    7: "Police officer or flagman",
    8: "Other Type TCD",
    9: "Unknown",
}

df_tcd['tcd_type'] = df_tcd['tcd_type'].replace(tcd_labels)


# Bare expression: renders the chart when run as a notebook cell.
(
    alt.Chart(df_tcd, title="Crash and transportation control device type")
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x='crash_year:O',
        # The y channel takes alt.Y (the original passed an alt.X object here).
        # stack="normalize" rescales each year's bar to proportions of its total.
        y=alt.Y('sum(counts):Q', stack="normalize", axis=alt.Axis(title='share of counts')),
        color=alt.Color('tcd_type:O', scale=alt.Scale(scheme="accent")),
        column='intersection:N',
        tooltip=["crash_year", "counts", "tcd_type"],
    )
)

Exploratory Data Visualization on Traffic Crashes¶

Part I: Selecting a dataset¶

Part II: Exploring and visualizing the data¶

General trend of crashes¶

motivation:¶

discussion¶

Crashes by the time of the day¶

motivation:¶

Discussion¶

More on the crashes¶

motivation:¶

discussion¶

motivation¶

discussion¶

motivation¶

discussion¶

Exploratory Data Visualization on Traffic Crashes¶

Part I: Selecting a dataset¶

Part II: Exploring and visualizing the data¶

General trend of crashes¶

motivation:¶

discussion¶

Crashes by the time of the day¶

motivation:¶

Discussion¶

More on the crashes¶

How are crashes related to pedestrians¶

motivation:¶

discussion¶

How are crashes related to weather¶

motivation¶

discussion¶

How are crashes related to traffic signal¶

motivation¶

discussion¶