👋 Document Overview¶
I conducted analyses of one month of Rate-my-Ride (RMR) data for TransLink, the Metro Vancouver transportation network. Specifically, I focused on different elements of stop cleanliness, both overall rating as well as presence of graffiti, broken glass, trash, a trash can, and the fullness level of the trash can. I sought to answer the questions: what kinds of cleanliness are lacking within the network, and which ones are mostly fine? Which lines in the system have the worst overall cleanliness ratings? Which stops need the most attention from cleaning crews? Are there geographical patterns in (lack of) cleanliness?
Providing such insights to transit agencies in a dashboard helps them direct their cleaning efforts efficiently, improving rider experience where those efforts are most needed.
🎯 Key Findings¶
- 💬 On average, riders provide 2 cleanliness-related data points per trip
- ✅ There is not a lot of broken glass within the network
- 🟰 Most stops have trash cans with ample space, and average overall cleanliness ratings
- ❌ But graffiti and trash on the ground are problems within the network
- 📊 Using a dashboard, we can aggregate information about cleanliness (and other RMR factors like safety, seating, signage, and vehicle condition) to give transit agencies up-to-date information about the kinds and locations of maintenance needed in the network, assisting with upkeep and maximizing the quality of rider experience
✏️ Notes¶
The remainder of this notebook contains code interspersed with narrative giving details on the key findings. It is structured such that the explanation precedes the code, so you know what you're looking at when you reach the tables/figures that are output. Make sure the file has been marked as 'trusted' if prompted, so that all the output shows up properly. There are four main sections:
- Setting up the environment and conducting data preprocessing
- Topline descriptives
- A description of a larger dashboard and examples of graphs that might go in it
- Reflecting on the challenges and opportunities of working with Rate-My-Ride data
⚙️ Part 1: Data Preprocessing¶
First, I set up the virtual environment by installing dependencies for this project, then load packages and the raw RMR data.
For the purpose of building a dashboard about safety and cleanliness, I'm going to want some answers numerically, so here I load in the codebook and merge it with our TransLink data. Some of the response options are not the same across the codebook and TransLink data, so I manually searched responses and encoded them as 1/3/5 accordingly. We end up with data that is numeric for some key variables of interest, and can proceed to running descriptives.
# install dependencies
!pip3 install pandas
!pip3 install folium
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install scikit-learn
!pip3 install geopandas # imported below; also installs shapely, which we use for Point
# load dependencies
import pandas as pd # for data wrangling
from IPython.display import display # for table of question id descriptives
import folium # mapping
from folium.plugins import HeatMap # for heatmap
import matplotlib # dependency for heatmap
import matplotlib.pyplot as plt # for graphing
import seaborn as sns # for graphing
import geopandas as gpd # for graphing neighbourhoods
from shapely.geometry import Point # for graphing neighbourhoods
from branca.element import Element # for titling neighbourhoods graph
# load data
df_full = pd.read_csv('TSL_rmr_sample.csv').drop('Unnamed: 0', axis = 1) # remove redundant index column
# load codebook data
codebook = pd.read_csv('codebook.csv',
                       names = ['question_id', 'answer', 'question_title', 'answer_title', 'SCORE'],
                       header = 0)
# cull the 'v2/3/4' from df question_ids, for simplicity
df_full['question_id'] = df_full['question_id'].str.replace(r'_v\d+', '', regex = True)  # raw string avoids invalid-escape warnings
# Ensure codebook has unique 'question_title' and 'answer_title' combinations, otherwise we get duplicated rows with the merge
codebook_unique = codebook.drop_duplicates(subset=['question_title', 'answer_title'])
# merge scores with full data by question and answer title
df = pd.merge(df_full,
              codebook_unique[['question_title', 'answer_title', 'SCORE']],
              on = ['question_title', 'answer_title'],
              how = 'left')
# manually override some merging issues with variables of interest
# graffiti
df.loc[(df['question_id'] == 'stop_amenities_graffiti') & (df['answer_title'] == 'Not really'), 'SCORE'] = 1.0
df.loc[(df['question_id'] == 'stop_amenities_graffiti') & (df['answer_title'] == "Yes but it's art"), 'SCORE'] = 3.0
df.loc[(df['question_id'] == 'stop_amenities_graffiti') & (df['answer_title'] == 'Yes it’s vandalism'), 'SCORE'] = 5.0
# trash can fill level
df.loc[(df['question_id'] == 'stop_amenities_trash_can_fill_level') & (df['answer_title'] == 'Very full'), 'SCORE'] = 1.0
df.loc[(df['question_id'] == 'stop_amenities_trash_can_fill_level') & (df['answer_title'] == 'Looks okay'), 'SCORE'] = 3.0
# stop cleanliness
df.loc[(df['question_id'] == 'stop_cleanliness') & (df['answer_title'] == 'Needs cleaning'), 'SCORE'] = 1.0
🔎 Part 2: Descriptives¶
Our end goal will be to provide useful insights for the TransLink team at a glance. First, we want to get a sense of what we're working with for our data, to answer questions like:
- How much missing data is there per question? (Are some questions less answered?)
- How are responses to our various questions distributed?
- Where are stops located, how are they distributed? (Gives us a sense of location.)
- For how many different lines do we have data? How many trips per line? (Gives us a sense of the span of data and accuracy of future estimates per line)
This helps us understand what we're working with and convey useful information to the transit agency at a glance.
First and foremost, I build a 'data dictionary' of sorts: an overview of our variables and the missingness per variable. Broadly, there is extensive missingness in question_subtitle, in the follow-up fields (the title of the question/answer a given row followed up on), in vehicle IDs, and in scores. There is very little missingness in start/end stop information, and about 5% missingness for trip IDs.
# VARIABLE, N, N_MISSING, PERCENT_MISSING
summary = pd.DataFrame({
'VARIABLE': df.columns,
'N': len(df),
'N_MISSING': df.isnull().sum().values,
'PERCENT_MISSING': (df.isnull().sum() / len(df) * 100).values
})
# Emphasize percent missingness
styled_summary = summary.style.background_gradient(
subset = ['PERCENT_MISSING', 'N_MISSING'], cmap = 'Reds'
).format('{:.2f}', subset = ['PERCENT_MISSING'])
styled_summary
| | VARIABLE | N | N_MISSING | PERCENT_MISSING |
|---|---|---|---|---|
| 0 | answer_id | 585649 | 0 | 0.00 |
| 1 | datetime_local_timezone | 585649 | 0 | 0.00 |
| 2 | hashed_user_id | 585649 | 0 | 0.00 |
| 3 | feed_name | 585649 | 0 | 0.00 |
| 4 | route_short_name | 585649 | 0 | 0.00 |
| 5 | route_long_name | 585649 | 0 | 0.00 |
| 6 | question_id | 585649 | 0 | 0.00 |
| 7 | question_title | 585649 | 0 | 0.00 |
| 8 | question_subtitle | 585649 | 544719 | 93.01 |
| 9 | answer_title | 585649 | 0 | 0.00 |
| 10 | followed_up_from_question_title | 585649 | 416511 | 71.12 |
| 11 | followed_up_from_answer_title | 585649 | 416511 | 71.12 |
| 12 | user_trip_id | 585649 | 7 | 0.00 |
| 13 | trip_id | 585649 | 30880 | 5.27 |
| 14 | stop_id | 585649 | 1208 | 0.21 |
| 15 | stop_name | 585649 | 1191 | 0.20 |
| 16 | stop_code | 585649 | 1208 | 0.21 |
| 17 | start_stop_id | 585649 | 1208 | 0.21 |
| 18 | start_stop_lat | 585649 | 1163 | 0.20 |
| 19 | start_stop_lng | 585649 | 1163 | 0.20 |
| 20 | start_stop_name | 585649 | 1170 | 0.20 |
| 21 | start_stop_code | 585649 | 1208 | 0.21 |
| 22 | end_stop_id | 585649 | 1208 | 0.21 |
| 23 | end_stop_lat | 585649 | 1163 | 0.20 |
| 24 | end_stop_lng | 585649 | 1163 | 0.20 |
| 25 | end_stop_name | 585649 | 1170 | 0.20 |
| 26 | end_stop_code | 585649 | 1208 | 0.21 |
| 27 | vehicle_id | 585649 | 349731 | 59.72 |
| 28 | SCORE | 585649 | 448532 | 76.59 |
The star of the show is the RMR questions, so I focus on those and their response distributions. Since our focus here is cleanliness, I've pulled out some questions of interest. (The table of all question titles and their answer distributions is in the appendix.)
We can see some trends emerge in the response distributions:
- Overall cleanliness is mostly fine
- Trash can presence is bi-modal but the majority of stops seem to have one
- Trash cans tend to have a fine amount of space in them
- But, there is still a fair amount of trash on the ground
- Graffiti is a problem
- But broken glass isn't!
These are just individual item distributions. A big value-add of the dashboard will be aggregating over these responses to get averages for stops/lines and geo-tagging them on a map to help optimally direct maintenance crews.
# Highlight questions of interest
question_ids = [
'stop_cleanliness',
'stop_amenities_graffiti',
'stop_amenities_broken_glass',
'stop_amenities_trash',
'stop_amenities_trash_can',
'stop_amenities_trash_can_fill_level'
]
# Filter dataframe
df_filtered = df[df['question_id'].isin(question_ids)]
def text_bar(value, max_value, length=10):
    # creates text bars that visualize distributions
    filled_blocks = int(round(value / max_value * length))
    return '█' * filled_blocks + '░' * (length - filled_blocks)

def answer_distribution(group):
    # generates a formatted string with response distributions + bars
    max_count = group['N'].max()
    max_answer_len = max(len(str(answer)) for answer in group['SCORE'])
    return '\n'.join(
        f'{str(answer).ljust(max_answer_len)}: {text_bar(count, max_count)} ({count})'
        for answer, count in zip(group['SCORE'], group['N'])
    )
# Aggregate counts
summary = df_filtered.groupby(['question_title', 'SCORE']).size().reset_index(name='N')
# Create summary table with the distribution as bars in one column
summary_combined = summary.groupby('question_title').apply(
    lambda g: pd.Series({
        'N_TOTAL': g['N'].sum(),
        'ANSWER': answer_distribution(g)
    }),
    include_groups=False  # keyword belongs to .apply(), not .groupby(); silences the pandas deprecation warning
).reset_index()
# Sort by N_TOTAL in descending order
summary_combined = summary_combined.sort_values(by='N_TOTAL', ascending=False)
# Display the result with bar charts and better styling
display(
summary_combined.style.set_properties(
subset=['ANSWER'], **{'text-align': 'left', 'white-space': 'pre-wrap', 'font-family': 'monospace'}
).set_table_styles(
[{'selector': 'td', 'props': [('vertical-align', 'top')]}]
)
)
| | question_title | N_TOTAL | ANSWER |
|---|---|---|---|
| 2 | How clean is this [vehicle_name] stop? | 20135 | 1.0: ███░░░░░░░ (4286)<br>3.0: ██████████ (12550)<br>5.0: ███░░░░░░░ (3299) |
| 1 | Does this [vehicle_name] stop have a trash can? | 10474 | 1.0: █████░░░░░ (2870)<br>3.0: ███░░░░░░░ (1760)<br>5.0: ██████████ (5844) |
| 3 | How full is the trash can? | 5576 | 1.0: █░░░░░░░░░ (292)<br>3.0: ██████████ (3092)<br>5.0: ███████░░░ (2192) |
| 5 | Is there trash or refuse at this [vehicle_name] stop? | 3934 | 1.0: ████████░░ (1570)<br>3.0: ██████████ (2044)<br>5.0: ██░░░░░░░░ (320) |
| 0 | Are there graffiti, tags or etchings at this [vehicle_name] stop? | 3724 | 1.0: ██████████ (2504)<br>3.0: █░░░░░░░░░ (166)<br>5.0: ████░░░░░░ (1054) |
| 4 | Is there broken glass at this [vehicle_name] stop? | 3587 | 1.0: █░░░░░░░░░ (334)<br>3.0: ██░░░░░░░░ (561)<br>5.0: ██████████ (2692) |
We are working with a wide variety of lines, as indicated by the summary table and the map of stops. On average, among our filtered cleanliness/safety-related observations, there are about 28 stops per line, 104 trips per line, 75 unique riders, and 2 cleanliness/safety questions answered per trip. The medians are a bit lower: about 22 stops per line, 64 trips per line, 49 unique riders, though still roughly 2 cleanliness/safety questions answered per trip. The distributions of stops, trips, unique riders, and questions answered per trip are visualized below. Based on these descriptives, we can expect to have much more information for some lines/stops than others, as outliers are pulling the averages up a bit relative to the medians.
# Group by route_short_name (Line) and trip_id to get the number of trips
line_summary = df_filtered.groupby('route_short_name').agg(
Stops_per_Line=('stop_id', 'nunique'), # Number of stops per line
Trips_per_Line=('user_trip_id', 'nunique'), # Unique trips per line
Unique_Riders=('hashed_user_id', 'nunique'), # Unique riders per line
Total_Questions_Answered=('question_id', 'count'), # Total questions answered for the trips on that line
Total_Rows=('question_id', 'size') # Total rows for each line
).reset_index()
# Calculate the average number of questions answered per line
line_summary['Average_Questions_Per_Trip'] = line_summary['Total_Questions_Answered'] / line_summary['Trips_per_Line']
# Drop the 'Total_Questions_Answered' and 'Total_Rows' columns
line_summary = line_summary.drop(columns=['Total_Questions_Answered', 'Total_Rows'])
# Display the final table with the necessary columns
line_summary = line_summary[['route_short_name', 'Stops_per_Line', 'Trips_per_Line', 'Unique_Riders', 'Average_Questions_Per_Trip']]
line_summary.columns = ['Line', 'Stops per Line', 'Trips per Line', 'Unique Riders', 'Average Questions per Trip']
# Sort the table by 'Trips per Line' in descending order
line_summary = line_summary.sort_values(by='Trips per Line', ascending=False)
# Round the values to 2 decimal places
line_summary = line_summary.round(2)
# Calculate the totals and add the average row
total_median_row = {
'Line': '*OVERALL MEDIAN*',
'Stops per Line': line_summary['Stops per Line'].median().round(2),
'Trips per Line': line_summary['Trips per Line'].median().round(2),
'Unique Riders': line_summary['Unique Riders'].median().round(2),
'Average Questions per Trip': line_summary['Average Questions per Trip'].median().round(2)
}
total_mean_row = {
'Line': '*OVERALL AVERAGE*',
'Stops per Line': line_summary['Stops per Line'].mean().round(2),
'Trips per Line': line_summary['Trips per Line'].mean().round(2),
'Unique Riders': line_summary['Unique Riders'].mean().round(2),
'Average Questions per Trip': line_summary['Average Questions per Trip'].mean().round(2)
}
# Append the total row to the DataFrame using pd.concat
total_row_df = pd.DataFrame([total_median_row, total_mean_row])
line_summary = pd.concat([line_summary, total_row_df], ignore_index=True)
# Display the table
display(line_summary)  # display was imported directly from IPython.display above
| | Line | Stops per Line | Trips per Line | Unique Riders | Average Questions per Trip |
|---|---|---|---|---|---|
| 0 | Expo | 42.00 | 702.00 | 512.00 | 1.78 |
| 1 | 25 | 115.00 | 645.00 | 457.00 | 1.87 |
| 2 | 49 | 100.00 | 608.00 | 385.00 | 1.83 |
| 3 | 99 | 29.00 | 544.00 | 393.00 | 1.83 |
| 4 | R4 | 31.00 | 515.00 | 348.00 | 1.95 |
| ... | ... | ... | ... | ... | ... |
| 225 | 231 | 1.00 | 1.00 | 1.00 | 2.00 |
| 226 | 855 | 1.00 | 1.00 | 1.00 | 3.00 |
| 227 | N15 | 1.00 | 1.00 | 1.00 | 1.00 |
| 228 | *OVERALL MEDIAN* | 22.00 | 64.00 | 48.50 | 1.99 |
| 229 | *OVERALL AVERAGE* | 27.91 | 103.58 | 74.91 | 1.99 |

230 rows × 5 columns
# Create a 2x2 grid for the subplots
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
# Plot Histogram of Stops per Line
axs[0, 0].hist(line_summary['Stops per Line'], bins=20, color='orange', edgecolor='black')
axs[0, 0].set_title('Histogram of Stops per Line')
axs[0, 0].set_xlabel('Stops per Line')
axs[0, 0].set_ylabel('Frequency')
axs[0, 0].grid(True)
# Plot Histogram of Trips per Line
axs[0, 1].hist(line_summary['Trips per Line'], bins=20, color='skyblue', edgecolor='black')
axs[0, 1].set_title('Histogram of Trips per Line')
axs[0, 1].set_xlabel('Trips per Line')
axs[0, 1].set_ylabel('Frequency')
axs[0, 1].grid(True)
# Plot Histogram of Unique Riders
axs[1, 0].hist(line_summary['Unique Riders'], bins=20, color='lightgreen', edgecolor='black')
axs[1, 0].set_title('Histogram of Unique Riders per Line')
axs[1, 0].set_xlabel('Unique Riders per Line')
axs[1, 0].set_ylabel('Frequency')
axs[1, 0].grid(True)
# Plot Histogram of Average Questions per Trip
axs[1, 1].hist(line_summary['Average Questions per Trip'], bins=20, color='salmon', edgecolor='black')
axs[1, 1].set_title('Histogram of Average Questions per Trip')
axs[1, 1].set_xlabel('Average Questions per Trip')
axs[1, 1].set_ylabel('Frequency')
axs[1, 1].grid(True)
# Adjust layout to prevent overlap
plt.tight_layout()
# Show the plots
plt.show()
Next, let's look at the region. We know we're working with TransLink, the transit authority for Metro Vancouver. It makes sense that most of the stops are concentrated in Vancouver, with reduced stop density as we get further from the metropolitan center and nearby suburbs. There are some distal stops that seem to serve ferry and seaplane points. Great, makes sense! 👍
# Filter data to non-missing rows for mapping, then we want one dot per stop, so group by stop and take the first row
map_df = df_filtered.dropna(subset=['stop_id']).drop_duplicates(subset='stop_id', keep='first')
# Create the base map centered around Vancouver
vancouver_map = folium.Map(location=[49.25, -123.1], zoom_start=11)
# Prepare data for the heatmap
heat_data = [[row['start_stop_lat'], row['start_stop_lng']] for index, row in map_df.iterrows()]
# Add heatmap layer
HeatMap(heat_data, radius=12, blur=15, max_zoom=13).add_to(vancouver_map)
# Display the map
vancouver_map
📊 Part 3: Dashboard Example Plots¶
Dashboards go beyond just distributions of individual responses to useful aggregations providing insight about which lines, stops, and neighbourhoods need the most attention. These dashboards can direct transit agency efforts, providing up-to-date information about (1) the kinds of service needed throughout the network and (2) where those services are needed. This aids in planning maintenance deployment in a timely fashion, improving the quality of the network and rider experience immediately, and leverages the unique position of Transit and RMR data.
One dashboard that might be useful to transit agencies is one focused on cleanliness. There are two steps to accomplish this: (1) compute different metrics of cleanliness, and (2) plot those metrics across lines/stops to see overall performance, and on a map to help direct crew movements efficiently.
I started on step (1) above, filtering the data to cleanliness metrics of interest and ensuring their numeric scores were present in the data. Next, I computed average cleanliness metrics by line and stop using those scores. I computed averages for each of the following six questions:
- How clean is this [vehicle_name] stop?
- Does this [vehicle_name] stop have a trash can?
- How full is the trash can?
- Is there trash or refuse at this [vehicle_name] stop?
- Are there graffiti, tags or etchings at this [vehicle_name] stop?
- Is there broken glass at this [vehicle_name] stop?
I also computed an overall average (an average across all the questions). These averages are calculated and displayed by a few different grouping variables below.
First, I presented histograms of the averages per line, which reflect what elements of cleanliness the network is struggling or doing well with. The network is doing well with broken glass, okay with trash cans and overall cleanliness, but graffiti and trash on the ground are problems within the network that need crew attention.
Second, I plotted the 20 worst overall average lines -- these lines have low overall average scores and could broadly benefit from attention.
Third, I plotted a map of stops in the network colour-coded by overall average (red = low = poor cleanliness rating; blue = high = good cleanliness rating). When you hover over a given stop, you can see the average ratings for the individual metrics (average graffiti rating, trash rating, etc.) This is useful for assessing what stops need what kind of maintenance.
Fourth, I provided a breakdown of each individual metric by neighbourhood. This is another way to identify "problem areas" and direct appropriate maintenance crews efficiently. (Note that the shapefile of Vancouver neighbourhoods I found from Vancouver Open Data only contains the 22 "local planning area" Vancouver neighbourhoods, and thus this map does not extend to the entire network.)
Examples of some additional plots that might be included on a fuller dashboard:
- Similar graphs and breakdowns by different metrics, not just cleanliness. Examples include safety, seating, and lighting.
- Graph of worst-performing lines on individual metrics (graffiti, trash, etc.), not just overall average.
- Expand neighbourhood maps beyond 22 Vancouver neighbourhoods to reflect full network coverage.
- Add statistical modelling to find relationships between factors (e.g., does seating relate to cleanliness or to feelings of safety?), and Bayesian modelling based on historical data to add confidence to estimates of whether a stop "really" has trash/graffiti/etc. problems
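As a hedged sketch of that last Bayesian idea: a minimal Beta-Binomial model would shrink a stop's observed problem rate toward a network-wide prior, so stops with only a handful of reports don't swing to extremes. The prior pseudo-counts below are illustrative assumptions, not values fitted to the RMR data.

```python
# Sketch: Beta-Binomial shrinkage for "does this stop really have a graffiti problem?"
# Prior pseudo-counts stand in for network-wide history -- illustrative only.
ALPHA_PRIOR = 2.0  # prior "problem reported" pseudo-count
BETA_PRIOR = 8.0   # prior "no problem" pseudo-count

def posterior_problem_rate(n_problem: int, n_total: int,
                           alpha: float = ALPHA_PRIOR,
                           beta: float = BETA_PRIOR) -> float:
    """Posterior mean probability that a stop has the problem,
    shrinking sparse stop-level counts toward the network prior."""
    return (alpha + n_problem) / (alpha + beta + n_total)

# A stop with 2/2 problem reports is pulled well below 100%...
print(round(posterior_problem_rate(2, 2), 3))    # (2+2)/(10+2) = 0.333
# ...while a stop with 20/20 problem reports stays high.
print(round(posterior_problem_rate(20, 20), 3))  # (2+20)/(10+20) = 0.733
```

On a dashboard, this posterior could replace the raw average for low-traffic stops, avoiding dispatching crews based on one or two reports.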
# pivot the data for cleaner averaging
id_columns = ['user_trip_id', 'route_short_name', 'stop_id', 'start_stop_lat', 'start_stop_lng']
# drop duplicates to keep one row per user_trip_id
df_unique = df_filtered[id_columns].drop_duplicates()
# pivot table to make question_ids into columns
df_wide = df_filtered.pivot_table(index='user_trip_id', columns='question_id', values='SCORE')
# reset the index
df_wide = df_wide.reset_index()
# merge back with unique data
df_dash = df_unique.merge(df_wide, on = 'user_trip_id', how = 'left')
# Group by route_short_name and calculate mean for each variable
df_byline = df_dash.groupby('route_short_name')[question_ids].mean()
df_bystop = df_dash.groupby('stop_id')[question_ids].mean()
# Compute the overall average of these means
df_byline['overall_average'] = df_byline.mean(axis=1)
df_bystop['overall_average'] = df_bystop.mean(axis=1)
# Sort by overall average in ascending order (worst at top)
df_byline = df_byline.sort_values(by='overall_average', ascending=True)
df_bystop = df_bystop.sort_values(by='overall_average', ascending=True)
# Create a panel of subplots (2 rows, 3 columns for questions)
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
# Flatten the axes array to make indexing easier
axes = axes.flatten()
# Set the common x-axis range
x_min, x_max = 1.0, 5.0
# Plot each question histogram
for i, question in enumerate(question_ids):
    # Extract the part after 'stop_' and replace underscores with spaces for readability
    question_title = question.split('stop_')[-1].replace('_', ' ').title()
    axes[i].hist(df_byline[question].dropna(), bins=20, color='lightblue', edgecolor='black')
    axes[i].set_title(question_title)
    axes[i].set_xlabel('Score')
    axes[i].set_ylabel('Frequency')
    axes[i].set_xlim(x_min, x_max)  # ensure all plots share the same x-axis range

# Turn off any unused axes in the 2x3 grid
for j in range(len(question_ids), len(axes)):
    axes[j].axis('off')
# Adjust layout to prevent overlapping and ensure proper spacing
plt.tight_layout()
# Show the question histograms
plt.show()
# Plot the overall average histogram in a new figure
plt.figure(figsize=(10, 6))
plt.hist(df_byline['overall_average'].dropna(), bins=20, color='salmon', edgecolor='black')
plt.title('Overall Average Score')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.xlim(x_min, x_max)
plt.grid(True)
plt.show()
# Select only the worst 20 lines
df_top_n = df_byline[['overall_average']].head(20)
# Plotting (pandas creates its own figure here; a prior plt.figure() call would be left empty)
ax = df_top_n.plot(kind='barh', legend=False, color='skyblue', figsize=(10, 8))
# Titles and labels
ax.set_title('Top 20 Worst Routes by Overall Average Score')
ax.set_xlabel('Overall Average Score')
ax.set_ylabel('Route')
# Show plot
ax.invert_yaxis()  # worst scores appear at the top
plt.tight_layout()
plt.show()
# Step 1: Filter df_filtered to one row per stop_id (taking the first occurrence)
df_filtered_unique = df_filtered.drop_duplicates(subset=['stop_id'], keep='first')
# Step 2: Merge the filtered df with the main df based on stop_id
df_merged = pd.merge(df_bystop.reset_index(), df_filtered_unique[['stop_id', 'start_stop_lat', 'start_stop_lng']], on='stop_id', how='left')
# Step 3: Clean the merged dataframe by dropping rows where necessary columns are NaN
df_cleaned = df_merged.dropna(subset=['start_stop_lat', 'start_stop_lng', 'overall_average'])
# Create a base map centered around the average latitude and longitude of the stops
map_center = [df_cleaned['start_stop_lat'].mean(), df_cleaned['start_stop_lng'].mean()]
m = folium.Map(location=map_center, zoom_start=11)
# Maximum value for overall_average to normalize color scale
max_overall = df_cleaned['overall_average'].max()
# Iterate through the dataframe and add markers to the map
for idx, row in df_cleaned.iterrows():
    # Calculate a base color from overall_average (lower = red, higher = blue)
    base_color = plt.cm.coolwarm_r(row['overall_average'] / max_overall)
    rgb_color = f"rgb({int(base_color[0] * 255)},{int(base_color[1] * 255)},{int(base_color[2] * 255)})"
    # Create the tooltip text, which shows the per-question averages
    tooltip_text = f"<b>Stop ID:</b> {row['stop_id']}<br>"
    for col in question_ids:
        tooltip_text += f"<b>{col.replace('_', ' ').title()}:</b> {row[col]}<br>"
    # Create a CircleMarker coloured by overall_average
    folium.CircleMarker(
        location=[row['start_stop_lat'], row['start_stop_lng']],
        radius=8,
        color=rgb_color,
        fill=True,
        fill_color=rgb_color,
        fill_opacity=0.6,
        popup=f"<b>Overall Average:</b> {row['overall_average']}",
        tooltip=tooltip_text  # show on hover
    ).add_to(m)
# Display the map
m
# Prepare data for graphing by neighbourhood and create function
# Load the shapefile
neighbourhoods = gpd.read_file('local-area-boundary/local-area-boundary.shp')
# Convert df_dash to GeoDataFrame
geometry = [Point(xy) for xy in zip(df_dash['start_stop_lng'], df_dash['start_stop_lat'])]
df_dash_geo = gpd.GeoDataFrame(df_dash, geometry=geometry, crs='EPSG:4326')
# Ensure both dataframes have the same coordinate reference system (CRS)
neighbourhoods = neighbourhoods.to_crs(df_dash_geo.crs)
# Perform spatial join to get neighbourhoods for each stop
df_dash_geo = gpd.sjoin(df_dash_geo, neighbourhoods, how='left', predicate='within')
# function for graphing
def neighbourhood_graphs(column_name):
    # Aggregate the selected column per neighbourhood
    column_avg = df_dash_geo.groupby('name')[column_name].mean().reset_index()
    # Merge with neighbourhood geometries
    neighbourhoods_merged = neighbourhoods.merge(column_avg, on='name', how='left')
    # Interactive map with folium, centered on Vancouver
    m = folium.Map(location=[49.2827, -123.1207], zoom_start=11)
    # Add neighbourhoods as a choropleth map with a blue colour scale
    folium.Choropleth(
        geo_data=neighbourhoods_merged,
        name=f'{column_name} Density',
        data=neighbourhoods_merged,
        columns=['name', column_name],
        key_on='feature.properties.name',
        fill_color='Blues',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=f"Average {column_name.replace('_', ' ').title()} Reports"
    ).add_to(m)
    # Add a title using HTML
    title_html = f"""
    <div style='position: fixed;
                top: 10px; left: 50px; width: 50%;
                font-size: 18px; font-weight: bold;
                background-color: white; padding: 10px;
                border-radius: 5px; z-index:9999'>
        Average {column_name.replace('_', ' ').title()} Reports per Neighbourhood in Vancouver
    </div>
    """
    m.get_root().html.add_child(Element(title_html))
    return m
neighbourhood_graphs('stop_cleanliness')
neighbourhood_graphs('stop_amenities_graffiti')
neighbourhood_graphs('stop_amenities_broken_glass')
neighbourhood_graphs('stop_amenities_trash')
neighbourhood_graphs('stop_amenities_trash_can')
neighbourhood_graphs('stop_amenities_trash_can_fill_level')
🕵️ Part 4: Data Validity Reflection¶
RMR data are unique and interesting in that the questions are asked in real time and with little friction; that's cool, and it lets us provide transit agencies with insights they simply could not otherwise access. That said, there are some limitations around data validity.
First and foremost, we are working with a convenience sample: people who have the Transit app and opt in to answering the questions. There is no fix for this that doesn't involve collecting more, and more sensitive, data from riders (e.g., demographic variables for response weighting), so it should simply be kept in mind that we are hearing from a subset of a subset of the population, which limits the generalizability of findings. Literature reviews of work with broader samples can help supplement these analyses when making recommendations to transit agencies.
Related to the convenience sample, assessing sub-ideal conditions through an opt-in survey is difficult. If someone feels unsafe or is waiting outside in the cold, they are probably less likely to pull out their phone and start answering a survey. Or they may have moved stops or chosen a different line because they felt unsafe, and then report on the conditions of the stop/line they actually used, a kind of invisible missingness-not-at-random. It might be beneficial to add a question about whether a person is at their "first-choice" stop, with follow-ups about why they moved (felt unsafe, time estimate, etc.). This would yield more useful data for transit agencies about the "invisible" aspects of trip-planning: the trips people didn't take, and why. I would also be interested in adding an open-response pop-up (or an option riders can click into while riding) to see what people report.
The ordinal rating scale approach is practical for people answering questions quickly, but it does reduce outcome variance and may struggle to accurately reflect complex constructs like "feeling safe" -- perhaps some A/B testing with larger scales or slider measures could be conducted to determine if it affects response rates and styles.
From a data processing standpoint, I would note that the codebook's ordinal-score mappings did not uniquely match the TSL data: there were subtle differences in question_title and answer_title values that I had to remap in order to build numeric composite measures. For ease of coding with more variables and across transit agencies, perhaps the backend of the app could include standardized good/neutral/bad scores.
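In the meantime, a small validation pass can surface these mapping issues before any merge. This is a hedged sketch: the column names (question_title, answer_title, SCORE) match the notebook's data, but the rows below are toy examples, and codebook_issues is a hypothetical helper.

```python
import pandas as pd

# Toy codebook with two deliberate problems: Q1/Yes maps to two scores,
# and Q2/Maybe has no score at all.
codebook = pd.DataFrame({
    'question_title': ['Q1', 'Q1', 'Q1', 'Q2'],
    'answer_title':   ['Yes', 'Yes', 'No', 'Maybe'],
    'SCORE':          [5.0, 3.0, 1.0, None],
})

def codebook_issues(cb: pd.DataFrame) -> pd.DataFrame:
    """Flag (question_title, answer_title) pairs with ambiguous or missing scores."""
    # Pairs mapping to more than one distinct SCORE would duplicate rows on merge
    counts = cb.groupby(['question_title', 'answer_title'])['SCORE'].nunique(dropna=False)
    dupes = counts[counts > 1].reset_index(name='n_scores')
    dupes['issue'] = 'multiple scores'
    # Pairs with no score at all need a manual override, like the ones above
    missing = cb[cb['SCORE'].isna()][['question_title', 'answer_title']]
    missing = missing.assign(n_scores=0, issue='missing score')
    return pd.concat([dupes, missing], ignore_index=True)

print(codebook_issues(codebook))
```

Running such a check against the real codebook before the merge would turn the manual `df.loc[...]` overrides into a reviewable, repeatable list.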