from lxml import html
import requests
import pandas as pd
Webscraping tutorial
16 January 2021
This notebook contains all the code from the blog post “Intro to webscraping” from 1 January 2021
First example: Create a dataset of lonely dogs
In this example we scrape Pet Rescue (https://www.petrescue.com.au) to create a dataset of dog names and corresponding locations.
Import the relevant libraries
# Base URL of the Pet Rescue dog-listing search; the page number is appended.
url_base = 'https://www.petrescue.com.au/listings/search/dogs?page='

# XPath for each listing card's dog name (the <h3> in the card header).
name_path = '//article[@class="cards-listings-preview"]/a/header/h3/text()'
# XPath for each listing card's location text.
location_path = '//strong[@class="cards-listings-preview__content__section__location"]/text()'

# Accumulators filled by the scraping loop below.
all_names = []
all_locations = []
# Scrape result pages 1-49, collecting dog names and their locations.
for n in range(1, 50):
    print(f'Scraping page: {n}')
    url = f'{url_base}{n}'
    # timeout keeps the loop from hanging forever on a stalled connection
    page = requests.get(url, timeout=30)
    tree = html.fromstring(page.text)
    names = tree.xpath(name_path)
    locations = tree.xpath(location_path)
    # The location element matches twice per card; keep every second hit so
    # locations stays aligned one-to-one with names.
    locations = locations[1::2]
    all_names += names
    all_locations += locations
# Combine the scraped columns into a DataFrame and trim the surrounding
# whitespace the raw xpath text() nodes carry.
df = pd.DataFrame(data={'name': all_names, 'location': all_locations})
df['name'] = df['name'].str.strip()
df['location'] = df['location'].str.strip()
# Preview the first rows (notebook-style display).
df.head(5)
Second example: ATM locations
In this second example we create a dataset of locations of all National Australia Bank ATMs in the country.
import requests
import pandas as pd
# Bounding box covering all of Australia: south-west corner (lat_min, lng_min)
# and north-east corner (lat_max, lng_max).
lat_min, lng_min = -43.834124, 114.078644
lat_max, lng_max = -10.400824, 154.508331

# NAB location-search endpoint: ATMs and branches ('atm+brc') inside the
# bounding box, page 1, up to 4000 results.
url = f'https://api.nab.com.au/info/nab/location/locationType/atm+brc/queryType/geo/{lat_min}/{lng_min}/{lat_max}/{lng_max}/1/4000?v=1'
# Browser-like headers the API expects, plus the public API key embedded in
# NAB's own website. NOTE(review): key is hard-coded; confirm it is still valid.
headers = {'Host': 'api.nab.com.au',
           'Origin': 'https://www.nab.com.au',
           'Referer': 'https://www.nab.com.au/',
           'x-nab-key': 'a8469c09-22f8-45c1-a0aa-4178438481ef'}

# Fetch and decode the JSON payload; timeout avoids hanging indefinitely.
page = requests.get(url=url, headers=headers, timeout=30)
data = page.json()
# Flatten the nested JSON into a table and keep only the ATM address/position
# columns, dropping rows with missing values (e.g. branch-only records).
df = pd.json_normalize(data['locationSearchResponse']['locations'])
df = df[['atm.address1', 'atm.suburb', 'atm.state', 'atm.postcode',
         'atm.latitude', 'atm.longitude']].dropna()
# Preview the first rows (notebook-style display).
df.head(5)