import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import folium
from shapely.geometry import Polygon, Point, MultiPolygon
from shapely import wkt
%matplotlib inline
Population estimates on uniform geometries
4 June 2019
ABS releases population figures at various levels: SA1, SA2,… and also at meshblock scale. This notebook contains code for obtaining estimates of populations on other geometries, in particular uniform hex and square grids.
We do this using the known populations from the ABS 2016 Census at the meshblock level and then perform a weighted average over these with each of the cells in the new geometry. The result is a more granular estimate of the population than at SA2 level. This can be useful when working with data sources that are defined on uniform geometries rather than standard ABS geometries.
References: - https://www.abs.gov.au/ausstats/[email protected]/Lookup/by%20Subject/1270.0.55.001July%202016Main%20FeaturesMesh%20Blocks%20(MB)10012
https://www.abs.gov.au/ausstats/[email protected]/mf/2074.0
https://www.abs.gov.au/websitedbs/D3310114.nsf/home/Australian+Statistical+Geography+Standard+(ASGS)
1.0. Setup
1.1. Population transformation function
This function carriers out the estimates of the populations on the new geometry. It takes two shapefiles:
- gdf1: the meshblock geometry, population figures and areas of the meshblocks
- gdf2: the shapefile for the new geometry
The approach is as follows. For each of the cells in the new geometry: - Calculate the meshblocks that is intersects with and the corresponding areas. - Compute the fraction of overlap there is with each of these meshblocks based on the area. - Estimate the population by taking a weighted sum of the populations of the intersecting meshblocks and the corresponding overlap fractions
def parameter_estimate(gdf1, gdf2, area_field, param_field, label2_field):
# calculate the overlay
= gpd.overlay(gdf1, gdf2, how='intersection')
gdf_overlay
# calculate the area of each of the new polygons
'area_new'] = gdf_overlay['geometry'].apply(lambda p: p.area)
gdf_overlay[
# calculate the fraction of the new polygons intersecting with the old ones
'fraction_overlap'] = gdf_overlay['area_new'] / gdf_overlay[area_field]
gdf_overlay[
# estimate the value of the parameter for each of the new polygons in their overlaps
+ '_new'] = gdf_overlay['fraction_overlap'] * gdf_overlay[param_field]
gdf_overlay[param_field
# calculate the weighted sum to estimate the new value for each of the new polygons
= gdf_overlay.groupby(label2_field)[param_field + '_new'].sum().reset_index()
df_ws
return df_ws
1.2. Define the CRS
- crs_abs is the lat, lng coord system
- crs_vic is a projected CRS
# define a CRS
= {'init': 'epsg:4283'}
crs_abs
# the projected CRS
= {'init': 'epsg:28355'} crs_vic
2.0. Load up datasets
We use following datasets: - Victorian meshblocks which contain the counts of people and dwellings - Victorian meshblock shapefiles containing the geometry - Regular 1km x 1km square shapefile - Regular hexagon shapefile
2.1. Victorian meshblocks with person and dwelling counts
# load up the Vic meshblock codes and corresponding dwelling and person counts
= pd.read_csv('data/refined/MB_VIC_2016.csv') df_mb_vic
# let's have a look at this
5) df_mb_vic.head(
MB_CODE_2016 | MB_CATEGORY_NAME_2016_y | SA1_MAINCODE_2016 | SA2_NAME_2016 | SA3_NAME_2016 | SA4_NAME_2016 | GCCSA_NAME_2016 | Dwelling | Person | |
---|---|---|---|---|---|---|---|---|---|
0 | 20000009499 | NOUSUALRESIDENCE | 29999949999 | No usual address (Vic.) | No usual address (Vic.) | No usual address (Vic.) | No usual address (Vic.) | NaN | 7565.0 |
1 | 20000010000 | Education | 20403106914 | Bright - Mount Beauty | Wodonga - Alpine | Hume | Rest of Vic. | 0.0 | 0.0 |
2 | 20000021000 | Commercial | 20403106902 | Bright - Mount Beauty | Wodonga - Alpine | Hume | Rest of Vic. | 3.0 | 0.0 |
3 | 20000022000 | Commercial | 20403106902 | Bright - Mount Beauty | Wodonga - Alpine | Hume | Rest of Vic. | 3.0 | 3.0 |
4 | 20000023000 | Commercial | 20403106902 | Bright - Mount Beauty | Wodonga - Alpine | Hume | Rest of Vic. | 5.0 | 5.0 |
2.2. Regular 1km x 1km square grid shapefile
= gpd.read_file('data/refined/sqrgrid_aoi_1km.shp')
gdf_1km_sqr 5) gdf_1km_sqr.head(
SquareLabe | geometry | |
---|---|---|
0 | 5777 | POLYGON ((143.6154027552672 -37.98629078606395... |
1 | 6069 | POLYGON ((143.6247063477386 -38.03160383838987... |
2 | 6070 | POLYGON ((143.6251195822346 -38.02260665306308... |
3 | 6071 | POLYGON ((143.6255326327272 -38.01360945011705... |
4 | 6074 | POLYGON ((143.6267706810596 -37.98661773559558... |
2.3. Regular hexagaon shapefile (1km side lengths)
= gpd.read_file('data/refined/hexgrid_aoi_1km.shp')
gdf_1km_hex 5) gdf_1km_hex.head(
HexLabel | geometry | |
---|---|---|
0 | 13733 | POLYGON ((144.3291385892137 -37.62172602425927... |
1 | 14027 | POLYGON ((144.3477842656918 -37.64917983509825... |
2 | 14028 | POLYGON ((144.3384597639403 -37.63545324745296... |
3 | 14029 | POLYGON ((144.3487448672874 -37.62216848943908... |
4 | 14030 | POLYGON ((144.3394232637044 -37.60844204331411... |
2.4. Victorian meshblocks shapefile
# load up the meshblocks shapefiles
= gpd.read_file('data/opendata/Census/1270055001_mb_2016_vic_shape/MB_2016_VIC.shp')
gdf_mb
# rename the column to be consistent with the other data sources
= gdf_mb.rename(columns={'MB_CODE16': 'MB_CODE_2016'})
gdf_mb = gdf_mb[['MB_CODE_2016', 'geometry']]
gdf_mb
# convert the MB_CODE_2016 to a integer
'MB_CODE_2016'] = gdf_mb['MB_CODE_2016'].astype(np.int)
gdf_mb[
5) gdf_mb.head(
MB_CODE_2016 | geometry | |
---|---|---|
0 | 20000009499 | None |
1 | 20000010000 | POLYGON ((147.1425276460001 -36.69220601699993... |
2 | 20000021000 | POLYGON ((146.9593400960001 -36.72780516099994... |
3 | 20000022000 | POLYGON ((146.9613617580001 -36.72686980099996... |
4 | 20000023000 | POLYGON ((146.9610484790001 -36.72822697899994... |
3.0. Wrangling
3.1. Filter to SA2 (Suburb)
# look at just one suburb
#SA2_NAME_2016 = 'Carlton'
#SA3_NAME_2016 = 'Melbourne City'
#df_smaller = df_mb_vic[df_mb_vic['SA3_NAME_2016']==SA3_NAME_2016]
#df_smaller.head(5)
3.2. Filter to Greater Melbourne area
Select ‘Greater Melbourne’ as the GCCSA_NAME_2016 value and the meshblocks in this region.
= 'Greater Melbourne'
GCCSA_NAME_2016 = df_mb_vic[df_mb_vic['GCCSA_NAME_2016']==GCCSA_NAME_2016]
df_aoi 5) df_aoi.head(
MB_CODE_2016 | MB_CATEGORY_NAME_2016_y | SA1_MAINCODE_2016 | SA2_NAME_2016 | SA3_NAME_2016 | SA4_NAME_2016 | GCCSA_NAME_2016 | Dwelling | Person | |
---|---|---|---|---|---|---|---|---|---|
1599 | 20015400000 | Residential | 20901119918 | Heidelberg West | Banyule | Melbourne - North East | Greater Melbourne | 33.0 | 53.0 |
1600 | 20015410000 | Residential | 20901120346 | Viewbank - Yallambie | Banyule | Melbourne - North East | Greater Melbourne | 32.0 | 87.0 |
1601 | 20015420000 | Residential | 20901120316 | Viewbank - Yallambie | Banyule | Melbourne - North East | Greater Melbourne | 63.0 | 178.0 |
1602 | 20015430000 | Residential | 20901120318 | Viewbank - Yallambie | Banyule | Melbourne - North East | Greater Melbourne | 47.0 | 115.0 |
1603 | 20015440000 | Residential | 20901120012 | Ivanhoe | Banyule | Melbourne - North East | Greater Melbourne | 33.0 | 68.0 |
Check that the population we’re getting is reasonable
'Person'].sum() df_aoi[
4484394.0
3.3. Join the meshblock shapefiles with the mesh block population data
This is done for just our Area of Interest
# join the shapefiles with the Vic. meshblock data
= df_aoi.merge(gdf_mb, on='MB_CODE_2016')
gdf_mb_sa2
= ['MB_CODE_2016', 'MB_CATEGORY_NAME_2016_y', 'SA1_MAINCODE_2016', 'SA2_NAME_2016', 'SA3_NAME_2016',
nice_cols 'Dwelling', 'Person']
= gpd.GeoDataFrame(data=gdf_mb_sa2[nice_cols], geometry=gdf_mb_sa2['geometry'])
meshblocks_sa2 = crs_abs meshblocks_sa2.crs
3.4. Intersect with square grid covering the AOI
= gpd.sjoin(gdf_1km_sqr, meshblocks_sa2, op='intersects')
gdf_sa2 = gdf_sa2.dissolve(by='SquareLabe', aggfunc='first') sqr_sa2
3.5. Intersect with hex grid covering the AOI
= gpd.sjoin(gdf_1km_hex, meshblocks_sa2, op='intersects')
gdf_sa2 = gdf_sa2.dissolve(by='HexLabel', aggfunc='first') hex_sa2
4.0. Population estimates
4.1. Change CRS
Because I don’t know how to do area calculations correctly in the lat, lng CRS
= meshblocks_sa2.to_crs(crs_vic)
meshblocks_sa2_new = sqr_sa2.to_crs(crs_vic) # convert the squares
sqr_sa2_new = hex_sa2.to_crs(crs_vic) # convert the hexagons hex_sa2_new
5) meshblocks_sa2_new.head(
MB_CODE_2016 | MB_CATEGORY_NAME_2016_y | SA1_MAINCODE_2016 | SA2_NAME_2016 | SA3_NAME_2016 | Dwelling | Person | geometry | |
---|---|---|---|---|---|---|---|---|
0 | 20015400000 | Residential | 20901119918 | Heidelberg West | Banyule | 33.0 | 53.0 | POLYGON ((328410.1439992741 5819338.914122874,... |
1 | 20015410000 | Residential | 20901120346 | Viewbank - Yallambie | Banyule | 32.0 | 87.0 | POLYGON ((332284.1471433818 5821512.586353699,... |
2 | 20015420000 | Residential | 20901120316 | Viewbank - Yallambie | Banyule | 63.0 | 178.0 | POLYGON ((332245.6594393802 5822051.032302149,... |
3 | 20015430000 | Residential | 20901120318 | Viewbank - Yallambie | Banyule | 47.0 | 115.0 | POLYGON ((332663.5247888024 5822260.237469032,... |
4 | 20015440000 | Residential | 20901120012 | Ivanhoe | Banyule | 33.0 | 68.0 | POLYGON ((326746.5876069277 5818877.766105011,... |
4.2. Filter data
We only need a few of the columns from the above datasets.
= sqr_sa2_new[['MB_CODE_2016', 'geometry']].reset_index()
squares 5) squares.head(
SquareLabe | MB_CODE_2016 | geometry | |
---|---|---|---|
0 | 23934 | 20631929990 | POLYGON ((263743.8136395717 5830298.752182018,... |
1 | 24231 | 20631929990 | POLYGON ((264743.8136395747 5830298.752182018,... |
2 | 24232 | 20631929990 | POLYGON ((264743.8136395735 5831298.752182018,... |
3 | 24233 | 20631929990 | POLYGON ((264743.8136395724 5832298.752182018,... |
4 | 24525 | 20631929990 | POLYGON ((265743.8136395731 5827298.752182018,... |
= hex_sa2_new[['MB_CODE_2016', 'geometry']].reset_index()
hexes 5) hexes.head(
HexLabel | MB_CODE_2016 | geometry | |
---|---|---|---|
0 | 13733 | 20631929990 | POLYGON ((264284.1761915246 5832798.752182018,... |
1 | 14027 | 20631929990 | POLYGON ((266016.2269990934 5829798.752182018,... |
2 | 14028 | 20631929990 | POLYGON ((265150.201595313 5831298.752182018, ... |
3 | 14029 | 20631929990 | POLYGON ((266016.226999094 5832798.752182018, ... |
4 | 14030 | 20631929990 | POLYGON ((265150.2015953095 5834298.752182016,... |
From the meshblock data we need the meshblock code, meshblock polygons and the population within each meshblock
= meshblocks_sa2_new[['MB_CODE_2016', 'Person', 'geometry']]
mesh
# calculate the area of each of the meshblocks in the projected CRS
'area'] = mesh['geometry'].apply(lambda p: p.area) mesh[
/Users/vic2e3a/.conda/envs/uberenv/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
5) mesh.head(
MB_CODE_2016 | Person | geometry | area | |
---|---|---|---|---|
0 | 20015400000 | 53.0 | POLYGON ((328410.1439992741 5819338.914122874,... | 12500.606939 |
1 | 20015410000 | 87.0 | POLYGON ((332284.1471433818 5821512.586353699,... | 27133.446933 |
2 | 20015420000 | 178.0 | POLYGON ((332245.6594393802 5822051.032302149,... | 44286.746329 |
3 | 20015430000 | 115.0 | POLYGON ((332663.5247888024 5822260.237469032,... | 45616.607446 |
4 | 20015440000 | 68.0 | POLYGON ((326746.5876069277 5818877.766105011,... | 24518.744626 |
4.3. Calculate the population estimates
Given the meshblock grid on which we have population data we want to estimate how many people are in each of our regular grid cells.
= np.round(parameter_estimate(mesh, squares, 'area', 'Person', 'SquareLabe')) square_data
5) square_data.head(
SquareLabe | Person_new | |
---|---|---|
0 | 23934 | 0.0 |
1 | 24231 | 4.0 |
2 | 24232 | 4.0 |
3 | 24233 | 2.0 |
4 | 24525 | 1.0 |
= np.round(parameter_estimate(mesh, hexes, 'area', 'Person', 'HexLabel')) hex_data
5) hex_data.head(
HexLabel | Person_new | |
---|---|---|
0 | 13733 | 1.0 |
1 | 14027 | 5.0 |
2 | 14028 | 3.0 |
3 | 14029 | 10.0 |
4 | 14030 | 2.0 |
4.4. Output the new estimates
Note that there is a population estimate associated with each of the polygons. These are labelled by ‘SquareLabe’ (for the square geometry) and ‘HexLabel’ for the hexagon geometry.
'data/refined/population_sqr_1km.csv', index=False) square_data.to_csv(
'data/refined/population_hex_1km.csv', index=False) hex_data.to_csv(
5) square_data.head(
SquareLabe | Person_new | |
---|---|---|
0 | 23934 | 0.0 |
1 | 24231 | 4.0 |
2 | 24232 | 4.0 |
3 | 24233 | 2.0 |
4 | 24525 | 1.0 |
5) hex_data.head(
HexLabel | Person_new | |
---|---|---|
0 | 13733 | 1.0 |
1 | 14027 | 5.0 |
2 | 14028 | 3.0 |
3 | 14029 | 10.0 |
4 | 14030 | 2.0 |
'Person_new'].sum() square_data[
4484488.0
'Person_new'].sum() hex_data[
4484421.0