11.2 Similarity search or twin areas

11.9

create table tree_similarity as 

-- Finds the count of all trees in each neighborhood
with a as (
    select
        count(t.*) as total_trees,
        n.neighborhood
    from
        nyc_2015_tree_census t
        join nyc_neighborhoods n on st_intersects(t.geom, n.geom)
    group by
        n.neighborhood
)

-- Finds the count of each type of tree in each neighborhood
select
    n.neighborhood,
    (
        count(t.*) filter (
            where
                t.spc_common ilike '%pine%'
        ) :: numeric / a.total_trees
    ) as pine,
    (
        count(t.*) filter (
            where
                t.spc_common ilike '%maple%'
        ) :: numeric / a.total_trees
    ) as maple,
    (
        count(t.*) filter (
            where
                t.spc_common ilike '%linden%'
        ) :: numeric / a.total_trees
    ) as linden,
    (
        count(t.*) filter (
            where
                t.spc_common ilike '%honeylocust%'
        ) :: numeric / a.total_trees
    ) as honeylocust,
    (
        count(t.*) filter (
            where
                t.spc_common ilike '%oak%'
        ) :: numeric / a.total_trees
    ) as oak

-- Joins the above data with data from the original CTE
from
    nyc_2015_tree_census t
    join nyc_neighborhoods n on st_intersects(t.geom, n.geom)
    join a using (neighborhood)
group by
    n.neighborhood,
    a.total_trees

11.10

with a as (
    select
        *
    from
        tree_similarity
    where
        neighborhood = 'Harlem'
)
select
    t.neighborhood,

    -- Here we subtract the values from our target neighborhood in table "t"
    -- which is Harlem, from the average values across the city
    t.pine - a.pine as pine_dif,
    t.maple - a.maple as maple_dif,
    t.oak - a.oak as oak_dif,
    t.linden - a.linden as linden_dif,
    t.honeylocust - a.honeylocust as honeylocust_dif
from
    tree_similarity t,
    a
where
    t.neighborhood != 'Harlem'

11.11

create table harlem_tree_similarity as with a as (
    select
        *
    from
        tree_similarity
    where
        neighborhood = 'Harlem'
),

-- A: Find the difference between Harlem and all other neighborhoods 
b as (
    select
        t.neighborhood,
        t.pine - a.pine as pine_dif,
        t.maple - a.maple as maple_dif,
        t.oak - a.oak as oak_dif,
        t.linden - a.linden as linden_dif,
        t.honeylocust - a.honeylocust as honeylocust_dif
    from
        tree_similarity t,
        a
    where
        t.neighborhood != 'Harlem'
),

-- B: Find the min and max values in each column and store it as an array 
c as (
    select
        array [min(pine_dif), max(pine_dif)] as pine,
        array [min(maple_dif), max(maple_dif)] as maple,
        array [min(oak_dif), max(oak_dif)] as oak,
        array [min(linden_dif), max(linden_dif)] as linden,
        array [min(honeylocust_dif), max(honeylocust_dif)] as honeylocust
    from
        b
),

-- C: Find the absolute value of each difference value, normalize the data, and subtract that value from 1 
d as (
    select
        b.neighborhood,
        1 - (abs(b.pine_dif) - c.pine [1]) / (c.pine [2] - c.pine [1]) as pine_norm,
        1 - (b.maple_dif - c.maple [1]) / (c.maple [2] - c.maple [1]) as maple_norm,
        1 - (b.oak_dif - c.oak [1]) / (c.oak [2] - c.oak [1]) as oak_norm,
        1 - (b.linden_dif - c.linden [1]) / (c.linden [2] - c.linden [1]) as linden_norm,
        1 - (b.honeylocust_dif - c.honeylocust [1]) / (c.honeylocust [2] - c.honeylocust [1]) as honeylocust_norm
    from
        b,
        c
) 

-- D: Add up and divide the values 
select
    neighborhood,
    (
        pine_norm + maple_norm + oak_norm + linden_norm + honeylocust_norm
    ) / 5 as final
from
    d
order by
    2 desc

11.12

create table harlem_tree_similarity_geo as
select
    s.*,
    h.geom
from
    harlem_tree_similarity s
    join nyc_hoods h using (neighborhood)