Which Tree Species Should The City Plant?¶

Analysis by Jason Muteham

Trees Photo by Andy on Unsplash


📖 Background¶

You work for a nonprofit organization advising the planning department on ways to improve the quantity and quality of trees in New York City. The urban design team believes tree size (using trunk diameter as a proxy for size) and health are the most desirable characteristics of city trees.

The city would like to learn more about which tree species are the best choice to plant on the streets of Manhattan.

💾 The data¶

The team has provided access to the 2015 tree census and geographical information on New York City neighborhoods (trees, neighborhoods):

Tree census and neighborhood information from the City of New York NYC Open Data.

💪 Challenge¶

Create a report that covers the following:

  • What are the most common tree species in Manhattan?
  • Which are the neighborhoods with the most trees?
  • A visualization of Manhattan's neighborhoods and tree locations.
  • What ten tree species would you recommend the city plant in the future?
In [ ]:
import pandas as pd
import geopandas as gpd
import numpy as np
import plotly.express as px
import plotly.io as pio 
import plotly.figure_factory as ff
import plotly.graph_objects as go
# Select the plotly renderer used for every figure in this notebook.
pio.renderers.default = "vscode+notebook"

# Load the tree census table and the neighborhood boundary shapes.
trees = pd.read_csv(filepath_or_buffer='data/trees.csv')
neighborhoods = gpd.read_file(filename='data/nta.shp')
In [ ]:
# Discard every row that has a missing value in ANY column. Per the original
# note this removes the dead trees, which have no spc_common recorded.
trees = trees.dropna(axis='index', how='any')
In [ ]:
# Swap the raw census column names for labels that read well on charts.
for raw_label, chart_label in (
    ("spc_common", "Name"),
    ("nta_name", "Neighborhood"),
    ("tree_dbh", "Tree Size"),
):
    trees.columns = trees.columns.str.replace(raw_label, chart_label)
neighborhoods.columns = neighborhoods.columns.str.replace("ntaname", "Neighborhood")

# Capitalise the first letter of each species name for display.
trees['Name'] = trees['Name'].str.capitalize()

# Plotly Express defaults shared by the figures below.
px.defaults.color_continuous_scale = 'algae'
px.defaults.height = 670

# Shared map settings: centre point, initial zoom, base-map style and margins.
cp = {"lat": 40.79, "lon": -73.96}
init_zoom = 10.6
# NOTE(review): `map` shadows the Python builtin; it is kept because later
# cells pass it as `mapbox_style`.
map = "carto-positron"
mg = {"l": 20, "r": 20, "b": 20, "t": 100}

# Number of trees with a species name; used as the denominator for "Tree %".
total_trees = trees['Name'].count()

Q. What are the most common tree species in Manhattan?¶

In [ ]:
# Ten most frequent species (value_counts sorts by descending frequency).
trees['Name'].value_counts().iloc[:10]
Out[ ]:
Honeylocust          13175
Callery pear          7297
Ginkgo                5859
Pin oak               4584
Sophora               4453
London planetree      4122
Japanese zelkova      3596
Littleleaf linden     3333
American elm          1698
American linden       1583
Name: Name, dtype: int64
In [ ]:
# Count trees per species and keep the ten largest groups for the chart.
species_counts = trees.groupby(['Name'])['Name'].agg(['count'])
top_species = species_counts.sort_values(by='count', ascending=False).head(10)

fig = px.bar(
    top_species,
    y='count',
    color='count',
    title='Top 10 tree species in Manhattan',
    height=500,
    text_auto=True,
)
fig.update_layout(xaxis_title='', yaxis_title='Number of trees', showlegend=False)
fig.update_coloraxes(
    colorbar_ticklabelposition='inside',
    colorbar_ticks='inside',
    showscale=False,
)
fig.update_traces(
    hovertemplate='%{x}, %{y} trees',
    marker_line_color='darkgreen',
    marker_line_width=1.0,
)
fig.show()

A. The most common tree species in Manhattan is the Honeylocust, with 13,175 trees.


Q. Which are the neighborhoods with the most trees?¶

In [ ]:
# Build one table per neighborhood holding tree count, tree density and the
# boundary geometry needed for plotting.

# Re-project to EPSG:6933 so that areas come out in square metres.
nhood_m = neighborhoods.to_crs(epsg=6933)
areas = nhood_m.area
neighborhoods["area_sqKm"] = areas.to_numpy() / 1e6  # m² -> km²

# Trees per neighborhood code, joined onto the neighborhood shapes.
grp_trees = trees.groupby('nta')['Neighborhood'].count().to_frame('count')
trees_geo = neighborhoods.merge(grp_trees, left_on=['ntacode'], right_on=['nta'])

# Whole trees per km² (floor division drops the fractional part).
trees_geo['Density'] = trees_geo['count'].floordiv(trees_geo['area_sqKm'])
# Share of all trees as a percentage, truncated to two decimal places.
trees_geo['Tree %'] = trees_geo['count'].mul(10000).floordiv(total_trees).div(100)
In [ ]:
# Neighborhood -> tree count, in ascending order, so the last ten rows are
# the ten neighborhoods with the most trees.
df_plot = (
    trees_geo[['Neighborhood', 'count']]
    .sort_values(by='count')
    .set_index('Neighborhood')
)

fig = px.bar(
    df_plot.tail(10),
    x='count',
    color='count',
    title='Top 10 leafy neighborhoods by number of trees',
    range_x=(500, 6000),
    height=500,
    text_auto=True,
)
fig.update_layout(xaxis_title='Trees', yaxis_title='', coloraxis_showscale=False)
fig.update_traces(
    hovertemplate='%{x} trees',
    marker_line_color='darkgreen',
    marker_line_width=1.0,
)
fig.show()
In [ ]:
import json  # cell-local import: used to parse the GeoJSON string below

# Choropleth of tree counts per neighborhood.
# Index by neighborhood name so `locations` carries the names shown on hover.
df_plot = trees_geo.set_index('Neighborhood')

# to_json() returns a GeoJSON *string*. Parse it with json.loads rather than
# eval(): eval executes arbitrary code and raises NameError on JSON literals
# such as null/true/false (e.g. a neighborhood with missing geometry).
neighborhood_geojson = json.loads(df_plot['geometry'].to_json())

fig = px.choropleth_mapbox(
    df_plot,
    geojson=neighborhood_geojson,
    locations=df_plot.index,
    color='count',
    center=cp,
    mapbox_style=map,
    zoom=init_zoom,
    opacity=0.8,
    title='Which neighborhood has the most trees?',
)
fig.update_coloraxes(colorbar_title='Trees')
fig.update_traces(hovertemplate='%{location}, %{z} trees')
fig.update_layout(margin=mg)
fig.show()
In [ ]:
# Neighborhood -> trees per km², in ascending order, so the last ten rows are
# the ten densest neighborhoods.
df_plot = (
    trees_geo[['Neighborhood', 'Density']]
    .sort_values(by='Density')
    .set_index('Neighborhood')
)

fig = px.bar(
    df_plot.tail(10),
    x='Density',
    color='Density',
    title='Top 10 leafy neighborhoods by density of trees',
    height=500,
    text_auto=True,
)
fig.update_layout(xaxis_title='Trees per km²', yaxis_title='', coloraxis_showscale=False)
fig.update_traces(
    hovertemplate='%{x} trees per km²',
    marker_line_color='darkgreen',
    marker_line_width=1.0,
)
fig.show()
In [ ]:
import json  # cell-local import: used to parse the GeoJSON string below

# Choropleth of tree density (trees per km²) per neighborhood.
# Index by neighborhood name so `locations` carries the names shown on hover.
df_plot = trees_geo.set_index('Neighborhood')

# to_json() returns a GeoJSON *string*. Parse it with json.loads rather than
# eval(): eval executes arbitrary code and raises NameError on JSON literals
# such as null/true/false (e.g. a neighborhood with missing geometry).
neighborhood_geojson = json.loads(df_plot['geometry'].to_json())

fig = px.choropleth_mapbox(
    df_plot,
    geojson=neighborhood_geojson,
    locations=df_plot.index,
    color='Density',
    center=cp,
    mapbox_style=map,
    zoom=init_zoom,
    opacity=0.8,
    title='Which neighborhood has the highest density of trees?',
)
fig.update_coloraxes(colorbar_title='Trees per km²')
fig.update_traces(hovertemplate='%{location}, %{z} trees per km²')
fig.update_layout(margin=mg)
fig.show()

Top 5 neighborhoods by share of Manhattan's trees

In [ ]:
# Five neighborhoods holding the largest share of all trees.
(
    trees_geo[['Neighborhood', 'Tree %']]
    .sort_values(by='Tree %', ascending=False)
    .head(5)
    .set_index('Neighborhood')
)
Out[ ]:
Tree %
Neighborhood
Upper West Side 9.16
Upper East Side-Carnegie Hill 7.27
West Village 5.95
Central Harlem North-Polo Grounds 5.37
Hudson Yards-Chelsea-Flatiron-Union Square 4.48

A. The Upper West Side has the most trees of any neighborhood, with a total of 5,723, which is 9.16% of Manhattan's trees.

However, two neighborhoods have a greater density of trees: Upper East Side-Carnegie Hill and Central Harlem South.


Visualization of Manhattan's tree locations.¶

In [ ]:
# Hexagonal-bin map of tree locations across Manhattan.
fig = ff.create_hexbin_mapbox(
    data_frame=trees,
    lat="latitude",
    lon="longitude",
    nx_hexagon=40,
    min_count=1,
    agg_func=np.sum,
    opacity=0.7,
    labels={"color": "Trees"},
    color_continuous_scale='algae',
    center=cp,
    zoom=init_zoom,
    mapbox_style=map,
    title='Where are all the trees in Manhattan?',
    show_original_data=False,
    original_data_marker=dict(size=1, opacity=0.5, color="yellowgreen"),
)
fig.update_coloraxes(colorbar_ticklabelposition='inside')
fig.update_layout(margin=mg)
fig.show()

Q. What ten tree species would you recommend the city plant in the future?¶

The urban design team believes tree size (using trunk diameter as a proxy for size) and health are the most desirable characteristics of city trees.

In [ ]:
# Median trunk size per species, largest first. The median is used instead of
# the mean so that outliers have little influence.
tree_size = (
    trees.groupby(['Name'])['Tree Size']
    .agg(['median'])
    .sort_values(by='median', ascending=False)
    .reset_index()
    .rename(columns={'median': 'Median Size'})
)
In [ ]:
# One row per species: tree counts per health rating, the species total, and
# each rating's share of that total.
health_counts = trees.groupby(['Name', 'health'])['Name'].agg('count')
tree_health = health_counts.unstack().fillna(0)  # ratings with no trees -> 0
tree_health['Total'] = tree_health.sum(axis='columns')

# Whole-number percentages (floor division truncates, so the three shares may
# add up to slightly less than 100).
for rating in ('Good', 'Fair', 'Poor'):
    tree_health[f'{rating} %'] = (tree_health[rating] * 100) // tree_health['Total']

tree_health = tree_health.reset_index()
In [ ]:
# Merge the tree size & health DataFrames
tree_size_health = pd.merge(tree_size, tree_health, on='Name')
In [ ]:
# Ignore species represented by a single tree.
has_several_trees = tree_size_health['Total'] >= 2
report_columns = ['Name', 'Median Size', 'Good %', 'Fair %', 'Poor %', 'Total']
tree_selection = tree_size_health.loc[has_several_trees, report_columns]

# Largest median trunk first; ties are broken by the higher share of healthy trees.
tree_selection = tree_selection.sort_values(
    by=['Median Size', 'Good %'],
    ascending=[False, False],
)

fig = px.scatter(
    tree_selection,
    x='Median Size',
    y='Good %',
    hover_name="Name",
    hover_data=["Median Size", "Good %", 'Fair %', 'Poor %', 'Total'],
    title='Trees species by health and trunk size',
)
fig.update_layout(xaxis_title='Median trunk size (inches)', yaxis_title='Good health %')
fig.update_traces(marker_size=8, marker_color='darkgreen')

# Shade and label the region of the chart holding the recommended species.
fig.add_annotation(x=11, y=98, text="Tree Species Recommendations", showarrow=False, yshift=10)
fig.add_shape(type="rect", x0=7.8, y0=75, x1=14.5, y1=102, fillcolor="green", opacity=0.3)
fig.show()

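A. The following 10 tree species are recommended for future planting in Manhattan. They are the species with the largest median trunk size among those with at least two trees in the census and at least 75% of their trees in good health.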


In [ ]:
# Keep only species where at least 75% of trees are in good health, then show
# the first ten with reader-friendly column headings.
mostly_healthy = tree_selection['Good %'] >= 75
tree_selection = tree_selection.loc[mostly_healthy].set_index('Name')
tree_selection = tree_selection.rename(columns={
    'Median Size': 'Median trunk size',
    'Good %': 'Good health %',
    'Fair %': 'Fair health %',
    'Poor %': 'Poor health %',
    'Total': 'Current population',
})
tree_selection.head(10)
Out[ ]:
Median trunk size Good health % Fair health % Poor health % Current population
Name
American elm 12.0 80.0 15.0 4.0 1698.0
Black maple 11.0 90.0 10.0 0.0 10.0
Amur cork tree 11.0 87.0 12.0 0.0 8.0
Siberian elm 11.0 83.0 11.0 4.0 156.0
Tree of heaven 11.0 78.0 16.0 4.0 104.0
Willow oak 10.0 84.0 12.0 3.0 889.0
Black locust 10.0 77.0 20.0 2.0 259.0
White ash 9.5 80.0 16.0 4.0 50.0
Honeylocust 9.0 83.0 15.0 1.0 13175.0
Pin oak 9.0 81.0 15.0 2.0 4584.0